diff --git a/README.md b/README.md index d16ed75e693fe59401decd9f75cacafa7e45bba3..84975f020255eacdfc3c63e9e36ec364c510f1a9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ wikstraktor =========== -A python tool to query the [wiktionary](https://wiktionary.org) and extract structured lexical data. +A python tool to query the [wiktionary](https://wiktionary.org) and extract [structured lexical data](https://gitlab.liris.cnrs.fr/lex-game/wikstraktor/-/wikis/Entry-structure). ## Dependencies This project does depend on python packages. diff --git a/parsers/en_en.py b/parsers/en_en.py index e7622c7810829c1b22e985142dc80cee6595735f..7344a36da194cf41dfb6721f66b022b501420ebe 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -17,6 +17,7 @@ class En_en_straktor(Wikstraktor): # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux l = proContent.get_lists()[0] i = 0 + cpt = 0 pronunciations = [] while i < len(l.fullitems): p = Pronunciation() @@ -32,6 +33,8 @@ class En_en_straktor(Wikstraktor): p.add_sound(self.get_file_url(t.arguments[1].value), a) if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : if p.ipa != None or p.accent != None: + cpt += 1 + p.id= f"p_{cpt}" pronunciations.append(p) p = Pronunciation() i += 1 @@ -57,19 +60,27 @@ class En_en_straktor(Wikstraktor): while i < len(l): if l[i].pattern == '\\# ': nombreDef += 1 +<<<<<<< HEAD newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) +======= + newSense = Sense(f"{baseId}{nombreDef}") + newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}") + newSense.add_translation(f"t_{nombreDef}_0") +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe elif l[i].pattern == '\\#:': + cptEx=0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) + cptEx +=1 + newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}") isEx = 1 k += 1 if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip()) + newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}") if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': senses.append(newSense) cnt = 0 @@ -78,19 +89,27 @@ class En_en_straktor(Wikstraktor): cnt +=1 if l[i].pattern == '\\## ': nombreSousDef += 1 +<<<<<<< HEAD newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) +======= + newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") + newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}") + newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0") +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe elif l[i].pattern == '\\##:': + cptex2 = 0 for j in l[i].items: k = 0 isEx = 0 while k < len(self.wtp.parse(j).templates) and isEx == 0 : if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) + cptex2 +=1 + newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}") isEx = 1 k += 1 if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip()) + newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}") if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': newSense.add_subsense(newSense2) i += 1 diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 124c0fd138779f15f2904647201cf6d0d8a12d45..30cb0a9ead49ae0fa06d414d6dd1423fa614a291 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -53,7 +53,7 @@ class Fr_en_straktor(Wikstraktor): keys = list(self.constants['POS'].keys()) pos = keys[ik] ik += 1 -# print(pos) + print(pos) return pos def process_senses(self, entry, pos, sensesContent): @@ -64,9 +64,17 @@ class Fr_en_straktor(Wikstraktor): nombreDef = 0 while i < len(l): if l[i].pattern == '\\# ': +<<<<<<< HEAD nombreDef += 1 newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) +======= + #A revoir ça, très douteux + for nbDef in l[i].items : + nombreDef += 1 + newSense = Sense(f"{baseId}{nombreDef}") + newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip()) +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe elif l[i].pattern == '\\#:': for j in l[i].items: k = 0 diff --git a/test_wikstraktor.py b/test_wikstraktor.py index f601218dbfa53cf970abfb402d867593df2c978a..00d62280ee1bf3a8a0904ec8f91e8be141fb94b9 100644 --- a/test_wikstraktor.py +++ b/test_wikstraktor.py @@ -5,7 +5,7 @@ if __name__ == "__main__": # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) #e.fetch("water") - f.fetch("blue") + f.fetch("water") # print(e.fetch("test"), "entries added") #print(e) file_path = 'test.json' @@ -22,3 +22,5 @@ if __name__ == "__main__": # print(p.get_file_url()) #print(e) #Entry("test", wtp.parse(page.text))) + + # PRENDS PAS LE FICHIER AUDIO POUR "LIVE" EN_EN diff --git a/wikstraktor.py b/wikstraktor.py index ac30492226fbac298d7904eb3dfb0776ee10f22d..d6f6c400d0d7ecc1bd7f1ee537c50b7cac88cdf3 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -19,7 +19,7 @@ class Sound: if self.accent == None: res = {"url":self.url} else: - res = {"accent":self.accent, "url":self.url} + res = { "accent":self.accent, "url":self.url} return res class Pronunciation: @@ -27,6 +27,7 @@ class Pronunciation: self.ipa = None self.sounds = [] self.accent = None + self.id = None def set_transcription(self, tscpt): self.ipa = tscpt @@ -42,9 +43,9 @@ class Pronunciation: for s in self.sounds: snds.append(s.serializable()) if self.accent == None: - res = {"transcript":self.ipa, "sounds":snds} + res = {"ID":self.id, "transcript":self.ipa, "sounds":snds} else: - res = {"accent":self.accent, "transcript":self.ipa, "sounds":snds} + res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds} return res def __str__(self): @@ -70,7 +71,12 @@ class Pronunciation: ####### class Definition: +<<<<<<< HEAD def __init__(self, lang, text, id=None): +======= + def __init__(self, lang, text, id): + self.id = id +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe self.lang = lang self.text = text self.id = id @@ -81,6 +87,7 @@ class Definition: def __eq__(self, other): return self.lang == other.lang and self.text == other.text +<<<<<<< HEAD def serializable(self, id = True): res = {} if id and self.id != None: @@ -100,6 +107,21 @@ class Translation(Definition): class Example: def __init__(self, transcript, id=None, source=None, url=None): +======= + def serializable(self): + return {"ID":self.id, "lang":self.lang, "definition":self.text} + +class Translation(): + def __init__(self, id, lang=None, text=None): + self.id = id + self.lang = lang + self.text = text + def serializable(self): + return {"ID:" : self.id, "lang":self.lang, "translation":self.text} + +class Example: + def __init__(self, transcript, id, source=None, url=None): +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe self.text = transcript self.source = source self.url = url @@ -108,6 +130,7 @@ class Example: def __eq__(self, other): return self.text==other.text and self.source==other.source and self.url==other.url +<<<<<<< HEAD def set_id(self, id): self.id = id @@ -116,6 +139,10 @@ class Example: res = {"id":self.id, "example":self.text} else: res = {"example":self.text} +======= + def serializable(self): + res = {"ID":self.id, "example":self.text} +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe if self.source != None: res["source"] = self.source if self.url != None: @@ -135,21 +162,30 @@ class Sense: def set_domain(self, d): self.domain = d +<<<<<<< HEAD def add_def(self, lang, definition): theDef = Definition(lang, definition) if theDef not in self.definitions: theDef.set_id(f"{self.label}_def{len(self.definitions)}") self.definitions.append(theDef) - - def add_example(self, transcript, src=None, url=None): - theEx = Example(transcript, src, url) +======= + def add_def(self, lang, definition, id): + theDef = Definition(lang, definition, id) + if self.definition == None: + self.definition = theDef + elif self.definition != theDef: + raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}") +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe + + def add_example(self, transcript, id, src=None, url=None): + theEx = Example(transcript, id, src, url) if theEx not in self.examples: theEx.set_id(f"{self.label}_ex{len(self.examples)}") self.examples.append(theEx) - def add_translation(self, lang, translation): - theTranslation = Translation(lang, translation) + def add_translation(self, id, lang=None, translation=None): + theTranslation = Translation(id, lang, translation) if theTranslation not in self.translations: theTranslation.set_id(f"{self.label}_trad{len(self.translations)}") self.translations.append(theTranslation) @@ -180,6 +216,7 @@ class Sense: def serializable(self, key = False): res = {} +<<<<<<< HEAD if self.domain != None: res["Domain"] = self.domain if len(self.definitions) > 0: @@ -198,6 +235,41 @@ class Sense: res["Translations"] = [] for t in self.translations: res["Translations"].append(t.serializable(key)) +======= + if key: + res[self.label]={} + if self.domain != None: + res[self.label]["Domain"] = self.domain + res[self.label]["Definition"] = self.definition.serializable() + if len(self.subsenses) > 0: + res[self.label]["Subsenses"] = [] + for t in self.subsenses: + res[self.label]["Subsenses"].append(t.serializable()) + if len(self.examples) > 0 : + res[self.label]["Examples"] = [] + for e in self.examples: + res[self.label]["Examples"].append(e.serializable()) + #if len(self.translations) > 0: + res[self.label]["Translations"] = [] + for t in self.translations: + res[self.label]["Translations"].append(t.serializable()) + else: + if self.domain != None: + res["Domain"] = self.domain + res["Definition"] = self.definition.serializable() + if len(self.subsenses) > 0: + res["Subsenses"] = {} + for t in self.subsenses: + res["Subsenses"][t.label]= t.serializable(key) + if len(self.examples) > 0 : + res["Examples"] = [] + for e in self.examples: + res["Examples"].append(e.serializable()) + #if len(self.translations) > 0: + res["Translations"] = [] + for t in self.translations: + res["Translations"].append(t.serializable()) +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe return res @@ -298,9 +370,14 @@ class ParserContext: if testNewEntry: self.create_entry() +#Pb là dedans def create_entry(self): +<<<<<<< HEAD #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS res = Entry(self.lemma, self.lang) +======= + res = Entry(self.lemma) +>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe for l in self.context: #print(l.keys()) if "pro" in l.keys(): @@ -403,7 +480,8 @@ class Wikstraktor: pos = self.process_POS(stitle) if pos != None : self.parserContext.set_top_entry_info('POS', pos, False) - self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) + self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob + # self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus. res = len(self.parserContext.entries) if res > 0: for e in self.parserContext.entries: