# -
# Mathieu Loiseau authored bb67b48b
# wikstraktor.py 27.03 KiB
#!/usr/bin/env python3
import pywikibot
import wikitextparser
import importlib
import json
from wikstraktor_version import version as the_version
from wikstraklog import Wikstraklog
from copy import deepcopy as dc
import re
def get_list_string_level(wikitext):
    """Return the length of the leading run of wiki list markers (*, #, :)."""
    markers = "*#:"
    depth = 0
    for ch in wikitext:
        if ch not in markers:
            break
        depth += 1
    return depth
class SubInfo:
    """Base class for identifiable sub-elements of an entry.

    Keeps a per-class counter (`next_id`) used to number instances, and a
    `label` of the form "<prefix>_<prfx><id>" once an id prefix is known.
    NOTE(review): `next_id` starts at 1 but reset() sets it to 0 — preserved
    as-is; callers appear to reset() before producing labelled elements.
    """
    next_id = 1
    prfx = "err"

    @classmethod
    def inc_n_id(cls):
        cls.next_id = cls.next_id + 1

    @classmethod
    def reset(cls):
        cls.next_id = 0

    def __init__(self, prefix=None):
        klass = self.__class__
        self.id = klass.next_id
        klass.inc_n_id()
        self.label = None
        self.set_id(prefix)

    def set_id(self, prefix, force=False):
        """Assign the label from *prefix* unless already set (or *force*)."""
        unset = self.label is None
        if prefix is not None and (unset or force):
            self.label = f"{prefix}_{self.__class__.prfx}{self.id}"
        return self.label

    def replace_src_in_id(self, former_src, new_src, copy=False):
        """Rewrite the source index embedded in the label.

        In-place by default (returns the new label); with copy=True the
        change is applied to a deep copy, which is returned instead.
        """
        ##Beware: if we clean up by putting sources everywhere, this will need changing
        result = None
        if self.label is not None and former_src is not None and new_src is not None:
            pattern = r'^([\w\.]+)-(' + str(former_src) + r')'
            replacement = r"\1-" + str(new_src)
            if copy:
                result = dc(self)
                result.label = re.sub(pattern, replacement, self.label)
            else:
                self.label = re.sub(pattern, replacement, self.label)
                result = self.label
        return result

    def get_src_from_id(self):
        """Extract the numeric source index (1-2 digits after '-') from the label."""
        if self.label is None:
            return None
        found = re.match(r'^[\w\.]+-(\d{1,2})', self.label)
        return int(found.group(1)) if found else None

    def serializable(self, prefix=None):
        """Return a dict containing the element id when a prefix is supplied."""
        out = {}
        if prefix is not None:
            out["id"] = self.set_id(prefix)
        return out
#######
# Oral
#######
class Sound:
    """An audio file URL, optionally tagged with an accent."""

    def __init__(self, url, accent):
        self.url = url
        self.accent = accent

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return (self.url, self.accent) == (other.url, other.accent)

    def serializable(self):
        """Dict form; the accent key is omitted when unset."""
        if self.accent is None:
            return {"url": self.url}
        return {"accent": self.accent, "url": self.url}
class Pronunciation(SubInfo):
    """An IPA transcription with its recorded sounds and accents."""
    prfx = "prn"

    def __init__(self, prefix=None):
        super().__init__(prefix)
        self.ipa = None
        self.sounds = []
        self.accents = set()

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accents.add(accent)

    def has_accents(self):
        return bool(self.accents)

    def has_sounds(self):
        return bool(self.sounds)

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res['transcript'] = self.ipa
        if self.has_accents():
            res['accents'] = list(self.accents)
        res['sounds'] = [snd.serializable() for snd in self.sounds]
        return res

    def __str__(self):
        return json.dumps(self.serializable(''))

    def __eq__(self, other):
        alike = (isinstance(other, self.__class__)
                 and self.ipa == other.ipa
                 and self.accents == other.accents
                 and len(self.sounds) == len(other.sounds))
        if not alike:
            return False
        # Sounds are compared pairwise, in order.
        for mine, theirs in zip(self.sounds, other.sounds):
            if mine != theirs:
                return False
        return True
#######
# Metadata
## TODO:
# * POS : créer une classe POS avec les traits dépendants (ex: masc en fr)
#######
#######
# Senses
# TODO: créer une classe Translations
#######
class Definition(SubInfo):
    """A definition text in a given language; empty text is rejected."""
    prfx = "def"
    key = "definition"  # serialization key, overridden by Translation

    def __init__(self, lang, text, prefix=None):
        super().__init__(prefix)
        if text == "":
            raise ValueError(f"Definition.__init__: “{text}” empty definition.")
        self.lang = lang
        self.text = text

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return (self.lang, self.text) == (other.lang, other.text)

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res["lang"] = self.lang
        res[self.__class__.key] = self.text
        return res
class Translation(Definition):
    # Identical to Definition except for the id prefix ("trad") and the
    # serialization key ("translation").
    prfx = "trad"
    key = "translation"
class Example(SubInfo):
    """A usage example: mandatory text, optional source and URL."""
    prfx = "ex"

    def __init__(self, transcript, source=None, url=None, prefix=None):
        super().__init__(prefix)
        if transcript == "":
            raise ValueError(f"Example.__init__: “{transcript}” empty example.")
        self.text = transcript
        self.source = source
        self.url = url

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return (self.text, self.source, self.url) == (other.text, other.source, other.url)

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res["example"] = self.text
        # Optional fields only appear when set.
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res
class Sense(SubInfo):
    """One sense of an entry: its definitions, examples, translations,
    nested subsenses and usage metadata (domain, regions, key/values)."""
    prfx = "s"

    @classmethod
    def reset_sub_counters(cls):
        # Guarded so that SubSense (which inherits this) does not reset the
        # sub-element counters a second time.
        if cls.__name__ == "Sense":
            Definition.reset()
            Example.reset()
            Translation.reset()
            SubSense.reset()

    def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None):
        super().__init__(prefix)
        self.lang = lang
        # Reset the id counters of the sub-elements
        Sense.reset_sub_counters()
        self.definitions = []  # definitions (each with a language and a text)
        self.subsenses = []  # nested sub-definitions (recursive)
        self.examples = []  # examples (mandatory text, optional source and url)
        self.translations = []  # translations into other languages
        self.domain = None  # usage domain of the word in this sense
        self.metadata = {}
        self.regions = set()
        if definition != None:
            try:
                self.add_def(wiki_lang, definition)
            except ValueError as err:
                raise ValueError(f"Sense.__init__() with empty definition\n{err}")

    def add_metadata(self, key, value):
        """Set a metadata value, printing a notice if it overwrites one."""
        if self.metadata_exists(key):
            # BUGFIX: the notice used self.text, which Sense does not have
            # (it raised AttributeError); report the sense label instead.
            print("Sense.add_metadata", f"for {self.label} replaced {key}:“{self.metadata[key]}” by {key}:“{value}”")
        self.metadata[key] = value

    def add_to_metadata(self, key, value):
        """Append a value to a list-valued metadata key (created on demand)."""
        if not self.metadata_exists(key):
            self.metadata[key] = []
        self.metadata[key].append(value)

    def add_region(self, region):
        self.regions.add(region)

    #to add at the end of the metadata, if empty add_metadata not add_to_metadata
    def extend_metadata(self, key, value, separator=""):
        if not self.metadata_exists(key):
            self.add_metadata(key, value)
        elif type(self.metadata[key]) == list:
            # list-valued: extend the last element
            self.metadata[key][-1] += separator + value
        else:
            self.metadata[key] += separator + value

    def metadata_exists(self, key):
        return key in self.metadata.keys()

    def get_id(self):
        return self.label

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        """Add a Definition (built from text if needed), skipping duplicates."""
        if isinstance(definition, Definition):
            theDef = definition
        else:
            theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            theDef.set_id(self.get_id())
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None, prefix=None):
        """Add an Example, ignoring empty texts and duplicates."""
        try:
            theEx = Example(transcript, src, url, prefix)
            if theEx not in self.examples:
                theEx.set_id(self.get_id())
                self.examples.append(theEx)
        except ValueError:
            print("Skipped empty example")

    def add_translation(self, lang=None, translation=None):
        """Add a Translation, skipping duplicates.

        Raises ValueError (from Translation) when the text is empty.
        """
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            theTranslation.set_id(self.get_id())
            self.translations.append(theTranslation)

    def add_subsense(self, subsense):
        """Attach a SubSense, labelling it from this sense's label."""
        if self.label != None:
            # BUGFIX: was subsense.set_id(self.set_id()) — set_id() requires
            # a prefix argument, so this call always raised TypeError.
            subsense.set_id(self.label)
        if subsense not in self.subsenses:
            self.subsenses.append(subsense)

    def __eq__(self, other):
        # Equal when sizes/domain/regions match and every sub-element of
        # self is present in other (order-insensitive membership tests).
        res = (isinstance(other, self.__class__)
               and len(self.definitions) == len(other.definitions)
               and len(self.examples) == len(other.examples)
               and len(self.translations) == len(other.translations)
               and self.domain == other.domain
               and len(other.metadata) == len(self.metadata)
               and other.regions == self.regions)
        if res:
            res = (all(e in other.examples for e in self.examples)
                   and all(t in other.translations for t in self.translations)
                   and all(d in other.definitions for d in self.definitions)
                   and all(s in other.subsenses for s in self.subsenses))
        if res:
            for k, v in self.metadata.items():
                res = res and k in other.metadata.keys() and type(v) == type(other.metadata[k])
                # Lists of equal length are compared by membership; lists of
                # different lengths are not compared further (preserved from
                # the original implementation).
                if res and type(v) == list and len(v) == len(other.metadata[k]):
                    res = all(item in other.metadata[k] for item in v)
        return res

    def serializable(self, id=True):
        """Dict form of the sense; when *id* is True, sub-elements receive
        ids prefixed by this sense's id."""
        res = {}
        if id:
            prefix = self.get_id()
            Sense.reset_sub_counters()
        else:
            # BUGFIX: prefix was left undefined when id is False, raising
            # NameError below; sub-elements simply get no id.
            prefix = None
        if self.domain != None:
            res["Domain"] = self.domain
        if len(self.regions) > 0:
            res['Regions'] = list(self.regions)
        if len(self.definitions) > 0:
            res["Definitions"] = [d.serializable(prefix) for d in self.definitions]
        if len(self.metadata.keys()) > 0:
            res["Metadata"] = self.metadata
        if len(self.examples) > 0:
            res["Examples"] = [e.serializable(prefix) for e in self.examples]
        if len(self.translations) > 0:
            res["Translations"] = [t.serializable(prefix) for t in self.translations]
        if len(self.subsenses) > 0:
            res["Subsenses"] = {t.set_id(self.label): t.serializable(prefix) for t in self.subsenses}
        return res

    def __str__(self):
        return json.dumps(self.serializable())
class SubSense(Sense):
    """A nested sense whose label is dot-joined to its parent's label."""

    def set_id(self, prefix, force=False):
        # "<parent label>.<id>" instead of SubInfo's "_<prfx><id>" scheme.
        unset = self.label is None
        if prefix is not None and (unset or force):
            self.label = f"{prefix}.{self.id}"
        return self.label
class Entry:
    """A dictionary entry: a lemma in a language with one POS, its
    pronunciations, its senses and the wiktionary source(s) it came from."""

    #version_id: the unique id of the wiktionary page version
    #(pywikibot.Page.latest_revision_id)
    def __init__(self, lemma, lang, wiki_lang, version_id, wkskt_version):
        self.lemma = lemma
        self.lang = lang
        # Useful if data from several wiktionaries are ever mixed
        self.sources = []
        self.add_source(wiki_lang, version_id, wkskt_version)
        self.pronunciations = []
        self.pos = None
        self.senses = []
        # Restart sense numbering for this entry
        Sense.reset()

    def set_pos(self, pos):
        self.pos = pos

    def add_source(self, wiki_lang, version_id, wkskt_version):
        """Register a new source and make it the current one."""
        self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version})
        self.current_source = len(self.sources)-1

    def set_current_source(self, src):
        self.current_source = src

    def get_prefix(self, source_id=-1):
        """Label prefix "<lang>.<lemma>[.<pos>]-<source>" for sub-elements."""
        if self.pos != None:
            pos = "."+self.pos
        else:
            pos = ""
        if source_id == -1:
            source_id = self.current_source
        return f"{self.lang}.{self.lemma}{pos}-{source_id}"

    def get_id(self):
        return f"{self.lang}.{self.lemma}.{self.pos}"

    def set_pronunciations(self, pron):
        """Accept a single Pronunciation or a list of them."""
        if isinstance(pron, Pronunciation):
            self.add_pronunciation(pron)
        elif type(pron) == list:
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.add_pronunciation(p)
                else:
                    raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def add_pronunciation(self, p):
        if p not in self.pronunciations:
            p.set_id(self.get_prefix(), True) #pro often parsed without context
            self.pronunciations.append(p)

    def set_senses(self, senses):
        for s in senses:
            if isinstance(s, Sense):
                self.add_sense(s)
            else:
                # BUGFIX: the message referenced p, an undefined name here
                raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({s.__class__.__name__}).")

    def add_sense(self, s):
        if s not in self.senses:
            s.set_id(self.get_prefix())
            self.senses.append(s)

    def is_valid(self):
        """Usable with a lemma, a POS and at least one sense."""
        return self.lemma != None and self.pos != None and len(self.senses) > 0
        # and len(self.pronunciations) > 0 ← must work without pronounciations

    def same(self, other):
        """Same (lemma, lang, pos) — the merge criterion."""
        return isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos == other.pos

    def merge(self, other):
        """Merge another entry describing the same (lemma, lang, pos),
        remapping the source indices embedded in its sub-element ids.

        Raises TypeError for non-Entry arguments and ValueError when the
        entries do not describe the same word. Merging an equal entry is a
        no-op.
        """
        # BUGFIX: the original inverted the isinstance test (a valid Entry
        # raised TypeError, a non-Entry crashed on other.id) and referenced
        # the nonexistent attribute self.id in the ValueError message.
        if not isinstance(other, self.__class__):
            raise TypeError(f"Entry.merge() error : {other.__class__} object cannot be merged with Entry")
        if not self.same(other):
            raise ValueError(f"Entry.merge() error : {self.get_id()} cannot be merged with {other.get_id()}")
        if self != other:
            # Map each of other's source indices to an index in self.sources,
            # registering unknown sources along the way.
            i = 0
            src_map = []
            max_id = len(self.sources) - 1
            while i < len(other.sources):
                if other.sources[i] in self.sources:
                    src_map.append(self.sources.index(other.sources[i])) #should append at rank i
                else:
                    self.add_source(other.sources[i]["wiktionary_language"], other.sources[i]["permanentId"], other.sources[i]["wikstraktor_version"])
                    src_map.append(self.current_source) #should append at rank i
                i += 1
            for p in other.pronunciations:
                src = p.get_src_from_id()
                if src != None and src <= max_id and src < len(src_map) and src_map[src] != src:
                    # max_id: the same object can sit in several places and
                    # may already have been relabelled
                    p = p.replace_src_in_id(src, src_map[src], True)
                self.add_pronunciation(p)
            for s in other.senses:
                src = s.get_src_from_id()
                if src != None and src_map[src] != src:
                    s.replace_src_in_id(src, src_map[src])
                    for ss in s.subsenses:
                        ss.replace_src_in_id(src, src_map[src])
                        for d in ss.definitions:
                            d.replace_src_in_id(src, src_map[src])
                        for e in ss.examples:
                            e.replace_src_in_id(src, src_map[src])
                        for t in ss.translations:
                            t.replace_src_in_id(src, src_map[src])
                    for d in s.definitions:
                        d.replace_src_in_id(src, src_map[src])
                    for e in s.examples:
                        e.replace_src_in_id(src, src_map[src])
                    for t in s.translations:
                        t.replace_src_in_id(src, src_map[src])
                self.add_sense(s)

    def __eq__(self, other):
        # Same word, then pairwise-equal senses and pronunciations, in order.
        res = self.same(other) and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses)
        i = 0
        while res and i < len(self.senses):
            res = self.senses[i] == other.senses[i]
            i += 1
        i = 0
        while res and i < len(self.pronunciations):
            res = self.pronunciations[i] == other.pronunciations[i]
            i += 1
        return res

    def serializable(self, id=True):
        """Dict form of the entry; ids are included only when *id* is True."""
        res = {}
        res['sources'] = self.sources
        if id:
            prefix = self.get_prefix()
            res['id'] = self.get_id()
        else:
            # BUGFIX: was `prefix == None` (a no-op comparison) which left
            # prefix undefined and raised NameError below.
            prefix = None
        res[self.lemma] = {"pos":self.pos}
        res[self.lemma]["pronunciations"] = []
        for p in self.pronunciations:
            res[self.lemma]["pronunciations"].append(p.serializable(prefix))
        res[self.lemma]["senses"] = {}
        for s in self.senses:
            res[self.lemma]["senses"][s.get_id()] = s.serializable(id)
        return res

    def __str__(self):
        res = f"{self.lemma}_{self.lang} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res
class ParserContext:
    """Stack of wiki-section contexts built while walking a page.

    Each stack frame is a dict holding the wikitextparser section ("wiki")
    plus any parsed info attached to it ('pro', 'ety', or POS → senses).
    create_entries() folds the whole stack into Entry objects.
    """

    def __init__(self, entry, lang, wiki_lang, wversion_id, version_id):
        self.lemma = entry
        self.lang = lang
        self.wiki_lang = wiki_lang
        self.page_version_id = wversion_id
        self.wikstraktor_version = version_id
        self.context = []  # stack of {"wiki": section, ...info keys}
        self.entries = []  # Entry objects produced so far
        #reset counters
        Sense.reset()
        Sense.reset_sub_counters()
        Pronunciation.reset()

    def get_level(self):
        # Heading level of the top of the stack; -1 when the stack is empty.
        if len(self.context) == 0:
            res = -1
        else:
            res = self.context[-1]["wiki"].level
        return res

    def push(self, wiki_context):
        self.context.append({"wiki":wiki_context})

    def pop(self, testNewEntry = True):
        # Optionally harvest entries from the current stack before popping.
        if testNewEntry:
            self.create_entries()
        return self.context.pop()

    def flush(self):
        # Empty the stack, harvesting entries at every level.
        while len(self.context) > 0:
            self.pop(True)

    def set_top_wiki(self, wiki_context):
        # Replace the section of the top frame (or start the stack).
        if len(self.context) == 0:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        # Attach parsed info to the top frame; optionally try to build entries.
        if len(self.context) == 0:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        else:
            self.context[-1][key] = entry_context
            if testNewEntry:
                self.create_entries()

    def create_entries(self):
        """Build one Entry per POS found in the stack; returns how many were added."""
        #In the key dict there are traits that describe every thing (ety, pro) and different entities (POS:senses)
        tmp = {}
        res = 0
        pro = None
        for l in self.context:
            for k,v in l.items():
                if k == "pro":
                    pro = v
                elif k == "ety" or k == "wiki":
                    #wiki context is not necessary
                    pass #Ignore etymology for now
                else:
                    tmp[k]=v
        if len(tmp)>0 : #There can be no pronounciations
            for pos,senses in tmp.items():
                e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
                if pro != None:
                    e.set_pronunciations(pro)
                e.set_pos(pos)
                e.set_senses(senses)
                #an improvement would be to remove that sense from context, but we test not to add doubles
                if e.is_valid() and e not in self.entries:
                    res += 1
                    self.entries.append(e)
        return res

    def debug_top(self):
        # One-line description of the top frame, for debugging.
        res = "Context: "
        if len(self.context) == 0 :
            res += "0"
        else:
            info = ""
            for k,v in self.context[-1].items():
                if k != 'wiki':
                    if info != "":
                        info += "\n\t\t\t"
                    info += f"{k} → {str(v)}"
            res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
        return res

    def __str__(self):
        res = ""
        i=0
        for c in self.context:
            res += f"====={i}======\n"
            for k,v in c.items():
                if k!= "wiki":
                    res+=f" {k}→{v}\n"
                else:
                    res+=f" {k}→{len(v)}\n"
            i+=1
        return res+f"nb of entries: {len(self.entries)}"
class Wikstraktor:
    """Generic wiktionary extractor.

    Fetches a page with pywikibot, walks the section tree of the target
    language and builds Entry objects. Wiki/language-specific behaviour
    (section titles, sense-list patterns, POS/pronunciation/etymology
    parsing) is provided by subclasses loaded from the `parsers` package,
    which set self.constants, self.site, self.wiki_language and
    self.entry_language — presumably in their own __init__ (not visible
    here; confirm against the parser modules).
    """

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate parsers.<Wiki_lang>_straktor; returns None on failure."""
        try:
            m_name = f"{wiki_language}_{entry_language}".capitalize()
            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
            instance.version = the_version
            instance.log = Wikstraklog(the_version, entry_language, wiki_language)
        except ModuleNotFoundError:
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            instance = None
        return instance

    def __init__(self):
        self.entries = {}  # entry id → Entry
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Resolve a media file page to its direct URL (None if missing)."""
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def add_entry(self, e):
        """Store an entry, merging it into an existing one with the same id."""
        if e.get_id() in self.entries.keys():
            if e != self.entries[e.get_id()]:
                self.entries[e.get_id()].merge(e)
        else:
            self.entries[e.get_id()] = e

    #retrieves the content of a page and processes it (adding the entries to the list of entries)
    #returns the number of entries added
    def fetch(self, graphy):
        nb_entries_added = 0
        page = self.pwb.Page(self.site, graphy)
        if page.text != "":
            sections = self.wtp.parse(page.text).sections
            found = False
            i = 0
            ### find the section of the entry's language
            while i < len(sections) and not found:
                found = sections[i].title != None and sections[i].title.capitalize() == self.constants[self.entry_language]
                if not found:
                    i += 1
            if found:
                nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)
        else:
            self.log.add_log("Wikstraktor.fetch", f"“{graphy}” page not found")
        return nb_entries_added

    def parse(self, entry, v_id, sections):
        """Walk the subsections of the language section, maintaining a
        ParserContext stack; returns the number of entries produced."""
        self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id, self.version)
        self.log.set_context(entry, v_id)
        for s in sections:
            if s.title != None:
                #handle wiki context
                if self.parserContext.get_level() < s.level:
                    self.parserContext.push(s)
                else:
                    while self.parserContext.get_level() > s.level:
                        self.parserContext.pop(True)
                    self.parserContext.set_top_wiki(s)
                #get section title: plain text, or the first template argument
                stitle = self.wtp.parse(s.title).templates
                if stitle == []:
                    stitle = s.title
                else:
                    stitle = stitle[0].arguments[0].value
                if self.isPro(stitle):
                    self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                elif self.isEty(stitle):
                    self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
                else:
                    #Edit to process other types of sections
                    pos = self.process_POS(stitle)
                    if pos != None:
                        self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents)))
        self.parserContext.flush()
        res = len(self.parserContext.entries)
        if res > 0:
            for e in self.parserContext.entries:
                self.add_entry(e)
        return res

    def isPro(self, title):
        """Is *title* the pronunciation section title? ('pro' constant may be a str or a collection)."""
        if type(self.constants['pro']) == str:
            res = title == self.constants['pro']
        else:
            res = title in self.constants['pro']
        return res

    def isEty(self, title):
        """Is *title* the etymology section title?"""
        if type(self.constants['ety']) == str:
            res = title == self.constants['ety']
        else:
            res = title in self.constants['ety']
        return res

    #recognizes POS and returns None if it can't
    def process_POS(self, parsedwikitext):
        pass #in subclass

    def process_pronunciation(self, parsedwikitext):
        pass #in subclass

    def process_etymology(self, parsedwikitext):
        pass #in subclass

    def parse_alt_spell(self, templates):
        """Return a Definition referencing the main spelling when an
        alternate-spelling template is among *templates*."""
        the_def = None
        for t in templates:
            if t.normal_name() == self.constants['t_alt']:
                the_def = Definition(self.entry_language, f"Alternate spelling of “{t.arguments[self.constants['t_alt_param']].value}”")
                break
        return the_def

    #can be overloaded
    def process_example(self, example_wiki_text):
        """Extract an example's text: from a known example template if one is
        present, otherwise as the plain text of the line."""
        res = None
        # Parse once instead of re-parsing on every loop test.
        parsed = self.wtp.parse(example_wiki_text)
        templates = parsed.templates
        k = 0
        isEx = 0
        while k < len(templates) and isEx == 0:
            if templates[k].normal_name() in self.constants['t_ex']:
                # BUGFIX: used templates[0] instead of the matching k-th
                # template when extracting the example text.
                res = templates[k].arguments[-1].value
                isEx = 1
            k += 1
        if isEx == 0:
            res = parsed.plain_text().strip()
        return res

    #can be overloaded
    def parse_definition(self, definition_wikitext):
        """Plain-text form of a definition given as str or parsed WikiText."""
        if type(definition_wikitext) == str:
            res = self.wtp.parse(definition_wikitext).plain_text().strip()
        elif isinstance(definition_wikitext, wikitextparser.WikiText):
            res = definition_wikitext.plain_text().strip()
        return res

    #can be overloaded
    def get_sense_metadata(self, sense, definition_wikitext):
        pass

    #can be overloaded
    def process_definition(self, definition, sub_items, def_level = True):
        """Build a Sense (or SubSense when def_level is False) from a
        definition line and its sub-item lists (examples, subdefinitions).

        Raises ValueError when the definition text is empty.
        """
        #does not process wk_en quotations
        try:
            parsed_def = self.wtp.parse(definition)
            if def_level:
                newSense = Sense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language)
                self.get_sense_metadata(newSense, parsed_def)
                pattern_ex = self.constants['sense_pattern'][0]["ex"]
                pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"]
            else:
                newSense = SubSense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language)
                self.get_sense_metadata(newSense, parsed_def)
                pattern_subdef = None
                pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"]
            #Process examples
            for item_list in sub_items:
                if item_list.pattern == pattern_ex:
                    for item in item_list.items:
                        newSense.add_example(self.process_example(item))
                #To process other sub-items (e.g. translations), extend here
                if def_level and item_list.pattern == pattern_subdef:
                    b = 0
                    for item in item_list.items:
                        try:
                            sub_sub = item_list.sublists(b)
                        except IndexError as err:
                            sub_sub = []
                            # BUGFIX: the message referenced itm_list, an
                            # undefined name (NameError in the error path).
                            print(f"There is an error in the selection of subitems:\n\t{b}th item of\n\t{item_list.sublists()}\ntriggered {err}")
                        newSense.add_subsense(self.process_definition(item, sub_sub, False))
                        b += 1
        except ValueError as err:
            self.log.add_log("Wikstraktor.process_definition", f"“{definition}” processed as empty")
            raise ValueError(f"Wikstraktor.process_definition with empty definition\n{err}")
        return newSense

    def process_senses(self, sensesContent):
        """Parse the first sense list of a POS section into Sense objects."""
        l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"]))
        senses = []
        if len(l) > 1:
            self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================")
        elif len(l) == 0:
            self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nno sense")
            # BUGFIX: l[0] below raised IndexError when no list was found.
            return senses
        l = l[0] #l now contains a list of list items
        if l.pattern == self.constants['sense_pattern'][0]["def"]:
            # BUGFIX: the index was only incremented on success, so a skipped
            # (empty) definition desynchronised the sublists index for all
            # following items; enumerate keeps them aligned.
            for i, item in enumerate(l.items):
                try:
                    senses.append(self.process_definition(item, l.sublists(i)))
                except ValueError:
                    print("Skipped empty definition")
        return senses

    def __add__(self, other):
        """Merge another Wikstraktor's entries into this one (in place)."""
        if isinstance(other, Wikstraktor):
            for k,e in other.entries.items():
                if k in self.entries.keys():
                    self.entries[k].merge(e)
                else:
                    self.entries[k] = e
        else:
            raise TypeError(f"Wikstraktor '+' : {other.__class__} object cannot be added to {self.__class__}")
        return self

    def __str__(self):
        return self.export()

    def serialize(self, id=True):
        """List of serializable dicts, one per entry."""
        res = []
        for e in self.entries.values():
            res.append(e.serializable(id))
        return res

    def export(self, id=True, ascii=False, compact=False):
        """JSON dump of all entries (indented unless *compact*)."""
        if compact:
            return json.dumps(self.serialize(id), ensure_ascii=ascii)
        else:
            return json.dumps(self.serialize(id), ensure_ascii=ascii, indent=4)
if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for the help text formatting
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
\033[1m\033[32mex :\033[0m
‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -n -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    parser.add_argument("-n", "--no_id", help="json sans id", action="store_true")
    args = parser.parse_args()
    # "+"-separated values allow querying several wikis/languages at once.
    wiki_languages = args.wiki_language.split("+")
    languages = args.language.split("+")
    if args.mot != None:
        resp = None
        for w_l in wiki_languages:
            for l in languages:
                w = Wikstraktor.get_instance(w_l, l)
                # BUGFIX: get_instance returns None when no parser module
                # exists for the pair; guard before calling fetch.
                if w is not None and w.fetch(args.mot) > 0:
                    if resp == None:
                        resp = w
                    else:
                        resp += w
        print(resp != None)
        if resp != None:
            resp = resp.export(not args.no_id, args.force_ascii, args.compact)
            if args.destination_file != None:
                # BUGFIX: the file was never explicitly closed (`f.close`
                # without parentheses); a context manager handles it.
                with open(args.destination_file, "w") as f:
                    f.write(resp)
            else:
                print(resp)
    else:
        raise NameError("Pas de mot demandé")