diff --git a/blue.json b/blue.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/parsers/en_en.py b/parsers/en_en.py index 7344a36da194cf41dfb6721f66b022b501420ebe..1e1a5de3f2d6447ec2219d7b193d677b201c9c37 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from wikstraktor import Wikstraktor, Pronunciation, Sense +from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense from parsers.en_constants import string_values @@ -17,10 +17,9 @@ class En_en_straktor(Wikstraktor): # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux l = proContent.get_lists()[0] i = 0 - cpt = 0 pronunciations = [] while i < len(l.fullitems): - p = Pronunciation() + p = Pronunciation(len(pronunciations)) templates = self.wtp.parse(l.fullitems[i]).templates a = None for j, t in enumerate(templates): @@ -33,8 +32,6 @@ class En_en_straktor(Wikstraktor): p.add_sound(self.get_file_url(t.arguments[1].value), a) if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : if p.ipa != None or p.accent != None: - cpt += 1 - p.id= f"p_{cpt}" pronunciations.append(p) p = Pronunciation() i += 1 @@ -51,71 +48,54 @@ class En_en_straktor(Wikstraktor): pos = self.constants['POS'][parsedwikitext] return pos - def process_senses(self, entry, pos, sensesContent): - baseId = f"{entry}_{pos}_" + def process_senses(self, sensesContent): + print("process_senses") l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) i = 0 senses = [] - nombreDef = 0 while i < len(l): if l[i].pattern == '\\# ': - nombreDef += 1 -<<<<<<< HEAD - newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) + newSense = Sense(lang=self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) -======= - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}") - newSense.add_translation(f"t_{nombreDef}_0") ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + #newSence.add_translation() elif l[i].pattern == '\\#:': - cptEx=0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - cptEx +=1 - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}") + newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) isEx = 1 k += 1 if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}") + newSense.add_example(self.wtp.parse(j).plain_text().strip()) if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': senses.append(newSense) cnt = 0 - nombreSousDef = 0 while i < len(l) and l[i].level == 3 : cnt +=1 if l[i].pattern == '\\## ': - nombreSousDef += 1 -<<<<<<< HEAD - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) + newSense2 = SubSense(lang=self.entry_language) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) -======= - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}") - newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0") ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + #newSense2.add_translation() elif l[i].pattern == '\\##:': - cptex2 = 0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - cptex2 +=1 - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}") + newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) isEx = 1 k += 1 if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}") + newSense2.add_example(self.wtp.parse(j).plain_text().strip()) if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': newSense.add_subsense(newSense2) i += 1 if cnt > 0: i -= 1 i += 1 + print(f"process_senses done {senses[-1].serializable()}") return senses if __name__ == "__main__": diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 30cb0a9ead49ae0fa06d414d6dd1423fa614a291..a44ff5d04de1e640647b320130720056621fe31a 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -56,25 +56,15 @@ class Fr_en_straktor(Wikstraktor): print(pos) return pos - def process_senses(self, entry, pos, sensesContent): - baseId = f"{entry}_{pos}_" + def process_senses(self, sensesContent): l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) i = 0 senses = [] - nombreDef = 0 while i < len(l): if l[i].pattern == '\\# ': -<<<<<<< HEAD - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) + newSense = Sense(lang=self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) -======= - #A revoir ça, très douteux - for nbDef in l[i].items : - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip()) ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + #la version d'enzo ajoute +ieurs defs (for i in l[i].items) elif l[i].pattern == '\\#:': for j in l[i].items: k = 0 @@ -89,12 +79,10 @@ class Fr_en_straktor(Wikstraktor): if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': senses.append(newSense) cnt = 0 - nombreSousDef = 0 while i < len(l) and l[i].level == 3 : cnt +=1 if l[i].pattern == '\\## ': - nombreSousDef += 1 - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) + newSense2 = SubSense(lang=self.entry_language) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) elif l[i].pattern == '\\##:': for j in l[i].items: diff --git a/wikstraktor.py b/wikstraktor.py index d6f6c400d0d7ecc1bd7f1ee537c50b7cac88cdf3..23a81f4eda96e86a1645056232ab9a1c11619428 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -4,6 +4,37 @@ import wikitextparser import importlib import json + +class SubInfo: + next_id = 1 + prfx = "err" + + @classmethod + def inc_n_id(cls): + cls.next_id += 1 + + @classmethod + def reset(cls): + cls.next_id = 0 + + def __init__(self, prefix = None): + self.id = None + self.set_id(prefix) + + def set_id(self, prefix): + if self.id == None and prefix != None: + self.id = f"{prefix}_{self.__class__.prfx}{self.__class__.next_id}" + self.__class__.inc_n_id() + return self.id + + def serializable(self, prefix = None): + res = {} + if self.set_id(prefix) != None: + res["id"] = self.id + return res + + + ####### # Oral ####### @@ -19,15 +50,17 @@ class Sound: if self.accent == None: res = {"url":self.url} else: - res = { "accent":self.accent, "url":self.url} + res = {"accent":self.accent, "url":self.url} return res -class Pronunciation: - def __init__(self): +class Pronunciation(SubInfo): + prfx = "prn" + + def __init__(self, prefix = None): + super().__init__(prefix) self.ipa = None self.sounds = [] self.accent = None - self.id = None def set_transcription(self, tscpt): self.ipa = tscpt @@ -38,18 +71,19 @@ class Pronunciation: def add_sound(self, url, accent=None): self.sounds.append(Sound(url,accent)) - def serializable(self): + def serializable(self, prefix = None): snds = [] for s in self.sounds: snds.append(s.serializable()) - if self.accent == None: - res = {"ID":self.id, "transcript":self.ipa, "sounds":snds} - else: - res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds} + res = super().serializable(prefix) + res['transcript'] = self.ipa + if self.accent != None: + res['accent'] = self.accent + res['sounds'] = snds return res def __str__(self): - return f"{self.serializable()}" + return f"{self.serializable('')}" def __eq__(self, other): res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) @@ -70,127 +104,99 @@ class Pronunciation: # TODO: créer une classe Translations ####### -class Definition: -<<<<<<< HEAD - def __init__(self, lang, text, id=None): -======= - def __init__(self, lang, text, id): - self.id = id ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe +class Definition(SubInfo): + prfx = "def" + key = "definition" + + def __init__(self, lang, text, prefix=None): + super().__init__(prefix) self.lang = lang self.text = text - self.id = id - - def set_id(self, id): - self.id = id def __eq__(self, other): return self.lang == other.lang and self.text == other.text -<<<<<<< HEAD - def serializable(self, id = True): - res = {} - if id and self.id != None: - res["id"] = self.id + def serializable(self, prefix = None): + res = super().serializable(prefix) res["lang"] = self.lang - res["definition"] = self.text + res[self.__class__.key] = self.text return res class Translation(Definition): - def serializable(self, id = True): - res = {} - if id and self.id != None: - res["id"] = self.id - res["lang"] = self.lang - res["translation"] = self.text - return res + prfx = "trad" + key = "translation" -class Example: - def __init__(self, transcript, id=None, source=None, url=None): -======= - def serializable(self): - return {"ID":self.id, "lang":self.lang, "definition":self.text} +class Example(SubInfo): + prfx = "ex" -class Translation(): - def __init__(self, id, lang=None, text=None): - self.id = id - self.lang = lang - self.text = text - def serializable(self): - return {"ID:" : self.id, "lang":self.lang, "translation":self.text} - -class Example: - def __init__(self, transcript, id, source=None, url=None): ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + def __init__(self, transcript, source=None, url=None, prefix=None): + super().__init__(prefix) self.text = transcript self.source = source self.url = url - self.id = id def __eq__(self, other): return self.text==other.text and self.source==other.source and self.url==other.url -<<<<<<< HEAD - def set_id(self, id): - self.id = id - - def serializable(self, id = True): - if id: - res = {"id":self.id, "example":self.text} - else: - res = {"example":self.text} -======= - def serializable(self): - res = {"ID":self.id, "example":self.text} ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + def serializable(self, prefix = None): + res = super().serializable(prefix) + res["example"]=self.text if self.source != None: res["source"] = self.source if self.url != None: res["url"] = self.url return res -class Sense: - def __init__(self, label, lang): +class Sense(SubInfo): + prfx = "" + + def __init__(self, prefix=None, lang=None): self.lang = lang - self.label = lang+"."+label #l'identifiant du sens + self.label = None + self.set_id(prefix) + #On réinitialise les identifiants des sous-éléments + if not isinstance(self, SubSense): + Definition.reset() + Example.reset() + Translation.reset() + SubSense.reset() + self.definitions = [] #liste des définitions (elles auront une langue et un texte) self.subsenses = [] #liste des sous-définitions (récursif…) self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens + def set_id(self, prefix): + if prefix != None and self.label == None: + self.label = f"{self.lang}.{prefix}_{self.__class__.next_id}" #l'identifiant du sens + self.__class__.inc_n_id() + return self.label + def set_domain(self, d): self.domain = d -<<<<<<< HEAD def add_def(self, lang, definition): theDef = Definition(lang, definition) if theDef not in self.definitions: - theDef.set_id(f"{self.label}_def{len(self.definitions)}") + theDef.set_id(self.set_id()) self.definitions.append(theDef) -======= - def add_def(self, lang, definition, id): - theDef = Definition(lang, definition, id) - if self.definition == None: - self.definition = theDef - elif self.definition != theDef: - raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}") ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe - - def add_example(self, transcript, id, src=None, url=None): - theEx = Example(transcript, id, src, url) + def add_example(self, transcript, src=None, url=None, prefix=None): + theEx = Example(transcript, src, url, prefix) if theEx not in self.examples: - theEx.set_id(f"{self.label}_ex{len(self.examples)}") + theEx.set_id(self.set_id()) self.examples.append(theEx) - def add_translation(self, id, lang=None, translation=None): - theTranslation = Translation(id, lang, translation) + def add_translation(self, lang=None, translation=None): + theTranslation = Translation(lang, translation) if theTranslation not in self.translations: - theTranslation.set_id(f"{self.label}_trad{len(self.translations)}") + theTranslation.set_id(self.set_id()) self.translations.append(theTranslation) def add_subsense(self, subsense): + if self.label!=None: + subsense.set_id(self.set_id()) if subsense not in self.subsenses: self.subsenses.append(subsense) @@ -214,90 +220,88 @@ class Sense: i+=1 return res - def serializable(self, key = False): + def serializable(self, prefix = None): res = {} -<<<<<<< HEAD if self.domain != None: res["Domain"] = self.domain if len(self.definitions) > 0: res["Definitions"] = [] for d in self.definitions: - res["Definitions"].append(d.serializable(key)) + res["Definitions"].append(d.serializable(prefix)) if len(self.subsenses) > 0: res["Subsenses"] = {} for t in self.subsenses: - res["Subsenses"][t.label]= t.serializable(key) + res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix) if len(self.examples) > 0 : res["Examples"] = [] for e in self.examples: - res["Examples"].append(e.serializable(key)) + res["Examples"].append(e.serializable(prefix)) if len(self.translations) > 0: res["Translations"] = [] for t in self.translations: - res["Translations"].append(t.serializable(key)) -======= - if key: - res[self.label]={} - if self.domain != None: - res[self.label]["Domain"] = self.domain - res[self.label]["Definition"] = self.definition.serializable() - if len(self.subsenses) > 0: - res[self.label]["Subsenses"] = [] - for t in self.subsenses: - res[self.label]["Subsenses"].append(t.serializable()) - if len(self.examples) > 0 : - res[self.label]["Examples"] = [] - for e in self.examples: - res[self.label]["Examples"].append(e.serializable()) - #if len(self.translations) > 0: - res[self.label]["Translations"] = [] - for t in self.translations: - res[self.label]["Translations"].append(t.serializable()) - else: - if self.domain != None: - res["Domain"] = self.domain - res["Definition"] = self.definition.serializable() - if len(self.subsenses) > 0: - res["Subsenses"] = {} - for t in self.subsenses: - res["Subsenses"][t.label]= t.serializable(key) - if len(self.examples) > 0 : - res["Examples"] = [] - for e in self.examples: - res["Examples"].append(e.serializable()) - #if len(self.translations) > 0: - res["Translations"] = [] - for t in self.translations: - res["Translations"].append(t.serializable()) ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + res["Translations"].append(t.serializable(prefix)) return res +class SubSense(Sense): + def set_id(self, prefix): + if prefix != None and self.label == None: + self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens + self.__class__.inc_n_id() + return self.label class Entry: - def __init__(self, lemma, lang): + #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id) + def __init__(self, lemma, lang, wiki_lang, version_id): self.lemma = lemma self.lang = lang + #Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile + self.sources = [] + self.sources.append({wiki_lang:version_id}) + self.current_source = 0 self.pronunciations = [] self.pos = None self.senses = [] + #l'identifiant unique de la version de la page du wiktionnaire + Sense.reset() + + def set_pos(self, pos): + self.pos = pos + + def get_id(self, source_id=0): + if self.pos !=None: + pos = "_"+self.pos + else: + pos = "" + return f"{self.lang}-{source_id}.{self.lemma}{pos}" def set_pronunciations(self, pron): if isinstance(pron, Pronunciation): - self.pronunciations.append(pron) + self.add_pronunciation(pron) elif type(pron) == list: for p in pron: if isinstance(p, Pronunciation): - self.pronunciations.append(p) + self.add_pronunciation(p) else: - raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).") + raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).") else: - raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") + raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") - def set_pos(self, pos): - self.pos = pos + def add_pronunciation(self, p): + if p not in self.pronunciations: + p.set_id(self.get_id()) + self.pronunciations.append(p) def set_senses(self, senses): - self.senses = senses + for s in self.senses: + if isinstance(s, Sense): + self.add_sense(s) + else: + raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({p.__class__.__name__}).") + + def add_sense(self, s): + if s not in self.senses: + s.set_id(self.get_id()) + self.senses.append(s)##ICITE def is_valid(self): return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 @@ -314,16 +318,22 @@ class Entry: i += 1 return res - def serializable(self, id = False): + def serializable(self, id=True): res = {} - res['id'] = self.lemma+"_"+self.pos+"."+self.lang + res['sources'] = self.sources + if id: + id = self.get_id() + res['id'] = id + else: + id == None res[self.lemma] = {"pos":self.pos} res[self.lemma]["pronunciations"] = [] for p in self.pronunciations: - res[self.lemma]["pronunciations"].append(p.serializable()) + res[self.lemma]["pronunciations"].append(p.serializable(id)) res[self.lemma]["senses"] = {} for s in self.senses: - res[self.lemma]["senses"][s.label]=s.serializable(id) + res[self.lemma]["senses"][s.get_id()]=s.serializable(id) + print(f"Entry:{res}")## return res def __str__(self): @@ -335,9 +345,11 @@ class Entry: return res class ParserContext: - def __init__(self, entry, lang): + def __init__(self, entry, lang, wiki_lang, version_id): self.lemma = entry self.lang = lang + self.wiki_lang = wiki_lang + self.version_id = version_id self.context = [] self.entries = [] @@ -372,12 +384,8 @@ class ParserContext: #Pb là dedans def create_entry(self): -<<<<<<< HEAD #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS - res = Entry(self.lemma, self.lang) -======= - res = Entry(self.lemma) ->>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + res = Entry(self.lemma, self.lang, self.wiki_lang, self.version_id) for l in self.context: #print(l.keys()) if "pro" in l.keys(): @@ -389,6 +397,7 @@ class ParserContext: if "senses" in l.keys(): res.set_senses(l['senses']) # TODO: Ajouter les autres types + print(res) if res.is_valid() and res not in self.entries: self.entries.append(res) else: @@ -452,11 +461,11 @@ class Wikstraktor: if not found: i += 1 if found: - nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections) + nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)#self.wtp.parse(s.contents).sections) return nb_entries_added - def parse(self, entry, sections): - self.parserContext = ParserContext(entry, self.entry_language) + def parse(self, entry, v_id, sections): + self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id) for s in sections: if s.title != None : #handle wiki context @@ -480,7 +489,7 @@ class Wikstraktor: pos = self.process_POS(stitle) if pos != None : self.parserContext.set_top_entry_info('POS', pos, False) - self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob + self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob # self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus. res = len(self.parserContext.entries) if res > 0: @@ -512,7 +521,7 @@ class Wikstraktor: def process_etymology(self, parsedwikitext): pass#in subclass - def process_senses(self, entry, pos, parsedwikitext): + def process_senses(self, parsedwikitext): pass#in subclass def __str__(self): @@ -522,6 +531,7 @@ class Wikstraktor: res = [] for e in self.entries: res.append(e.serializable(id)) + print(f"Export{res}")## if compact: return json.dumps(res, ensure_ascii=ascii) else: @@ -548,6 +558,8 @@ if __name__ == "__main__": resp = None if w.fetch(args.mot) > 0: resp = w.export(not args.no_id, args.force_ascii, args.compact) + print(w)## + print(resp)## if args.destination_file != None: f = open(args.destination_file, "w") f.write(resp)