diff --git a/parsers/en_en.py b/parsers/en_en.py index a76ebe64f68d0153e1789360c64a264d4fba424c..7a9c53720f5a0fe7c87b41577d6114917aeb3b50 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -17,6 +17,7 @@ class En_en_straktor(Wikstraktor): # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux l = proContent.get_lists()[0] i = 0 + cpt = 0 pronunciations = [] while i < len(l.fullitems): p = Pronunciation() @@ -32,6 +33,8 @@ class En_en_straktor(Wikstraktor): p.add_sound(self.get_file_url(t.arguments[1].value), a) if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : if p.ipa != None or p.accent != None: + cpt += 1 + p.id= f"p_{cpt}" pronunciations.append(p) p = Pronunciation() i += 1 @@ -58,18 +61,21 @@ class En_en_straktor(Wikstraktor): if l[i].pattern == '\\# ': nombreDef += 1 newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}") + newSense.add_translation(f"t_{nombreDef}_0") elif l[i].pattern == '\\#:': + cptEx=0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) + cptEx +=1 + newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}") isEx = 1 k += 1 if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip()) + newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}") if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': senses.append(newSense) cnt = 0 @@ -79,18 +85,21 @@ class En_en_straktor(Wikstraktor): if l[i].pattern == '\\## ': nombreSousDef += 1 newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}") + newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0") elif l[i].pattern == '\\##:': + cptex2 = 0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) + cptex2 +=1 + newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}") isEx = 1 k += 1 if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip()) + newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}") if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': newSense.add_subsense(newSense2) i += 1 diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 1ca6b67db5fa722517371659ad7fc822c906716e..6dae8f0cf488effafd01c0538adbe1b3544fb7e0 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -53,7 +53,7 @@ class Fr_en_straktor(Wikstraktor): keys = list(self.constants['POS'].keys()) pos = keys[ik] ik += 1 -# print(pos) + print(pos) return pos def process_senses(self, entry, pos, sensesContent): @@ -64,9 +64,11 @@ class Fr_en_straktor(Wikstraktor): nombreDef = 0 while i < len(l): if l[i].pattern == '\\# ': - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + #A revoir ça, très douteux + for nbDef in l[i].items : + nombreDef += 1 + newSense = Sense(f"{baseId}{nombreDef}") + newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip()) elif l[i].pattern == '\\#:': for j in l[i].items: k = 0 diff --git a/test_wikstraktor.py b/test_wikstraktor.py index f601218dbfa53cf970abfb402d867593df2c978a..00d62280ee1bf3a8a0904ec8f91e8be141fb94b9 100644 --- a/test_wikstraktor.py +++ b/test_wikstraktor.py @@ -5,7 +5,7 @@ if __name__ == "__main__": # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) #e.fetch("water") - f.fetch("blue") + f.fetch("water") # print(e.fetch("test"), "entries added") #print(e) file_path = 'test.json' @@ -22,3 +22,5 @@ if __name__ == "__main__": # print(p.get_file_url()) #print(e) #Entry("test", wtp.parse(page.text))) + + # PRENDS PAS LE FICHIER AUDIO POUR "LIVE" EN_EN diff --git a/wikstraktor.py b/wikstraktor.py index 7f5620574d94f8d36abbc3c3510f478483a06fe6..1c5ce27fa39d2d8eb92fc5649d165cc85edf6add 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -19,7 +19,7 @@ class Sound: if self.accent == None: res = {"url":self.url} else: - res = {"accent":self.accent, "url":self.url} + res = { "accent":self.accent, "url":self.url} return res class Pronunciation: @@ -27,6 +27,7 @@ class Pronunciation: self.ipa = None self.sounds = [] self.accent = None + self.id = None def set_transcription(self, tscpt): self.ipa = tscpt @@ -42,9 +43,9 @@ class Pronunciation: for s in self.sounds: snds.append(s.serializable()) if self.accent == None: - res = {"transcript":self.ipa, "sounds":snds} + res = {"ID":self.id, "transcript":self.ipa, "sounds":snds} else: - res = {"accent":self.accent, "transcript":self.ipa, "sounds":snds} + res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds} return res def __str__(self): @@ -70,7 +71,8 @@ class Pronunciation: ####### class Definition: - def __init__(self, lang, text): + def __init__(self, lang, text, id): + self.id = id self.lang = lang self.text = text @@ -78,23 +80,28 @@ class Definition: return self.lang == other.lang and self.text == other.text def serializable(self): - return {"lang":self.lang, "definition":self.text} + return {"ID":self.id, "lang":self.lang, "definition":self.text} -class Translation(Definition): +class Translation(): + def __init__(self, id, lang=None, text=None): + self.id = id + self.lang = lang + self.text = text def serializable(self): - return {"lang":self.lang, "translation":self.text} + return {"ID:" : self.id, "lang":self.lang, "translation":self.text} class Example: - def __init__(self, transcript, source=None, url=None): + def __init__(self, transcript, id, source=None, url=None): self.text = transcript self.source = source self.url = url + self.id = id def __eq__(self, other): return self.text==other.text and self.source==other.source and self.url==other.url def serializable(self): - res = {"example":self.text} + res = {"ID":self.id, "example":self.text} if self.source != None: res["source"] = self.source if self.url != None: @@ -113,20 +120,20 @@ class Sense: def set_domain(self, d): self.domain = d - def add_def(self, lang, definition): - theDef = Definition(lang, definition) + def add_def(self, lang, definition, id): + theDef = Definition(lang, definition, id) if self.definition == None: self.definition = theDef elif self.definition != theDef: raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}") - def add_example(self, transcript, src=None, url=None): - theEx = Example(transcript, src, url) + def add_example(self, transcript, id, src=None, url=None): + theEx = Example(transcript, id, src, url) if theEx not in self.examples: self.examples.append(theEx) - def add_translation(self, lang, translation): - theTranslation = Translation(lang, translation) + def add_translation(self, id, lang=None, translation=None): + theTranslation = Translation(id, lang, translation) if theTranslation not in self.translations: self.translations.append(theTranslation) @@ -165,10 +172,10 @@ class Sense: res[self.label]["Examples"] = [] for e in self.examples: res[self.label]["Examples"].append(e.serializable()) - if len(self.translations) > 0: - res[self.label]["Translations"] = [] - for t in self.translations: - res[self.label]["Translations"].append(t.serializable()) + #if len(self.translations) > 0: + res[self.label]["Translations"] = [] + for t in self.translations: + res[self.label]["Translations"].append(t.serializable()) else: if self.domain != None: res["Domain"] = self.domain @@ -181,10 +188,10 @@ class Sense: res["Examples"] = [] for e in self.examples: res["Examples"].append(e.serializable()) - if len(self.translations) > 0: - res["Translations"] = [] - for t in self.translations: - res["Translations"].append(t.serializable()) + #if len(self.translations) > 0: + res["Translations"] = [] + for t in self.translations: + res["Translations"].append(t.serializable()) return res @@ -282,8 +289,8 @@ class ParserContext: if testNewEntry: self.create_entry() +#Pb là dedans def create_entry(self): - #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS res = Entry(self.lemma) for l in self.context: #print(l.keys()) @@ -387,7 +394,8 @@ class Wikstraktor: pos = self.process_POS(stitle) if pos != None : self.parserContext.set_top_entry_info('POS', pos, False) - self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) + self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob + # self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus. res = len(self.parserContext.entries) if res > 0: for e in self.parserContext.entries: