diff --git a/parsers/en_en.py b/parsers/en_en.py index a76ebe64f68d0153e1789360c64a264d4fba424c..e7622c7810829c1b22e985142dc80cee6595735f 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -57,7 +57,7 @@ class En_en_straktor(Wikstraktor): while i < len(l): if l[i].pattern == '\\# ': nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") + newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) elif l[i].pattern == '\\#:': for j in l[i].items: @@ -78,7 +78,7 @@ class En_en_straktor(Wikstraktor): cnt +=1 if l[i].pattern == '\\## ': nombreSousDef += 1 - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") + newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) elif l[i].pattern == '\\##:': for j in l[i].items: diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 1ca6b67db5fa722517371659ad7fc822c906716e..124c0fd138779f15f2904647201cf6d0d8a12d45 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -65,7 +65,7 @@ class Fr_en_straktor(Wikstraktor): while i < len(l): if l[i].pattern == '\\# ': nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") + newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) elif l[i].pattern == '\\#:': for j in l[i].items: @@ -86,7 +86,7 @@ class Fr_en_straktor(Wikstraktor): cnt +=1 if l[i].pattern == '\\## ': nombreSousDef += 1 - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") + newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) elif l[i].pattern == '\\##:': for j in l[i].items: diff --git a/wikstraktor.py b/wikstraktor.py index 7f5620574d94f8d36abbc3c3510f478483a06fe6..ac30492226fbac298d7904eb3dfb0776ee10f22d 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -70,31 +70,52 @@ class Pronunciation: ####### class Definition: - def __init__(self, lang, text): + def __init__(self, lang, text, id=None): self.lang = lang self.text = text + self.id = id + + def set_id(self, id): + self.id = id def __eq__(self, other): return self.lang == other.lang and self.text == other.text - def serializable(self): - return {"lang":self.lang, "definition":self.text} + def serializable(self, id = True): + res = {} + if id and self.id != None: + res["id"] = self.id + res["lang"] = self.lang + res["definition"] = self.text + return res class Translation(Definition): - def serializable(self): - return {"lang":self.lang, "translation":self.text} + def serializable(self, id = True): + res = {} + if id and self.id != None: + res["id"] = self.id + res["lang"] = self.lang + res["translation"] = self.text + return res class Example: - def __init__(self, transcript, source=None, url=None): + def __init__(self, transcript, id=None, source=None, url=None): self.text = transcript self.source = source self.url = url + self.id = id def __eq__(self, other): return self.text==other.text and self.source==other.source and self.url==other.url - def serializable(self): - res = {"example":self.text} + def set_id(self, id): + self.id = id + + def serializable(self, id = True): + if id: + res = {"id":self.id, "example":self.text} + else: + res = {"example":self.text} if self.source != None: res["source"] = self.source if self.url != None: @@ -102,9 +123,10 @@ class Example: return res class Sense: - def __init__(self, label): - self.label = label #l'identifiant du sens - self.definition = None #liste des définitions (elles auront une langue et un texte) + def __init__(self, label, lang): + self.lang = lang + self.label = lang+"."+label #l'identifiant du sens + self.definitions = [] #liste des définitions (elles auront une langue et un texte) self.subsenses = [] #liste des sous-définitions (récursif…) self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues @@ -115,19 +137,21 @@ class Sense: def add_def(self, lang, definition): theDef = Definition(lang, definition) - if self.definition == None: - self.definition = theDef - elif self.definition != theDef: - raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}") + if theDef not in self.definitions: + theDef.set_id(f"{self.label}_def{len(self.definitions)}") + self.definitions.append(theDef) + def add_example(self, transcript, src=None, url=None): theEx = Example(transcript, src, url) if theEx not in self.examples: + theEx.set_id(f"{self.label}_ex{len(self.examples)}") self.examples.append(theEx) def add_translation(self, lang, translation): theTranslation = Translation(lang, translation) if theTranslation not in self.translations: + theTranslation.set_id(f"{self.label}_trad{len(self.translations)}") self.translations.append(theTranslation) def add_subsense(self, subsense): @@ -135,7 +159,7 @@ class Sense: self.subsenses.append(subsense) def __eq__(self, other): - res = self.label == other.label and self.definition == other.definition and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain + res = self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain i = 0 while res and i < len(self.examples): res = self.examples[i] in other.examples @@ -145,6 +169,10 @@ class Sense: res = self.translations[i] in other.translations i+=1 i = 0 + while res and i < len(self.definitions): + res = self.definitions[i] in other.definitions + i+=1 + i = 0 while res and i < len(self.subsenses): res = self.subsenses[i] in other.subsenses i+=1 @@ -152,45 +180,31 @@ class Sense: def serializable(self, key = False): res = {} - if key: - res[self.label]={} - if self.domain != None: - res[self.label]["Domain"] = self.domain - res[self.label]["Definition"] = self.definition.serializable() - if len(self.subsenses) > 0: - res[self.label]["Subsenses"] = [] - for t in self.subsenses: - res[self.label]["Subsenses"].append(t.serializable()) - if len(self.examples) > 0 : - res[self.label]["Examples"] = [] - for e in self.examples: - res[self.label]["Examples"].append(e.serializable()) - if len(self.translations) > 0: - res[self.label]["Translations"] = [] - for t in self.translations: - res[self.label]["Translations"].append(t.serializable()) - else: - if self.domain != None: - res["Domain"] = self.domain - res["Definition"] = self.definition.serializable() - if len(self.subsenses) > 0: - res["Subsenses"] = {} - for t in self.subsenses: - res["Subsenses"][t.label]= t.serializable(key) - if len(self.examples) > 0 : - res["Examples"] = [] - for e in self.examples: - res["Examples"].append(e.serializable()) - if len(self.translations) > 0: - res["Translations"] = [] - for t in self.translations: - res["Translations"].append(t.serializable()) + if self.domain != None: + res["Domain"] = self.domain + if len(self.definitions) > 0: + res["Definitions"] = [] + for d in self.definitions: + res["Definitions"].append(d.serializable(key)) + if len(self.subsenses) > 0: + res["Subsenses"] = {} + for t in self.subsenses: + res["Subsenses"][t.label]= t.serializable(key) + if len(self.examples) > 0 : + res["Examples"] = [] + for e in self.examples: + res["Examples"].append(e.serializable(key)) + if len(self.translations) > 0: + res["Translations"] = [] + for t in self.translations: + res["Translations"].append(t.serializable(key)) return res class Entry: - def __init__(self, lemma): + def __init__(self, lemma, lang): self.lemma = lemma + self.lang = lang self.pronunciations = [] self.pos = None self.senses = [] @@ -217,7 +231,7 @@ class Entry: return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 def __eq__(self, other): - res = self.lemma == other.lemma and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) + res = self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) i = 0 while res and i < len(self.senses): res = self.senses[i] == other.senses[i] @@ -228,19 +242,20 @@ class Entry: i += 1 return res - def serializable(self): + def serializable(self, id = False): res = {} + res['id'] = self.lemma+"_"+self.pos+"."+self.lang res[self.lemma] = {"pos":self.pos} res[self.lemma]["pronunciations"] = [] for p in self.pronunciations: res[self.lemma]["pronunciations"].append(p.serializable()) res[self.lemma]["senses"] = {} for s in self.senses: - res[self.lemma]["senses"][s.label]=s.serializable(False) + res[self.lemma]["senses"][s.label]=s.serializable(id) return res def __str__(self): - res = f"{self.lemma} ({self.pos})\n" + res = f"{self.lemma}_{self.lang} ({self.pos})\n" for p in self.pronunciations: res += f"{str(p)}\n" for s in self.senses: @@ -248,8 +263,9 @@ class Entry: return res class ParserContext: - def __init__(self, entry): + def __init__(self, entry, lang): self.lemma = entry + self.lang = lang self.context = [] self.entries = [] @@ -284,7 +300,7 @@ class ParserContext: def create_entry(self): #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS - res = Entry(self.lemma) + res = Entry(self.lemma, self.lang) for l in self.context: #print(l.keys()) if "pro" in l.keys(): @@ -363,7 +379,7 @@ class Wikstraktor: return nb_entries_added def parse(self, entry, sections): - self.parserContext = ParserContext(entry) + self.parserContext = ParserContext(entry, self.entry_language) for s in sections: if s.title != None : #handle wiki context @@ -424,10 +440,10 @@ class Wikstraktor: def __str__(self): return self.export() - def export(self, ascii=False, compact=False): + def export(self, id=True, ascii=False, compact=False): res = [] for e in self.entries: - res.append(e.serializable()) + res.append(e.serializable(id)) if compact: return json.dumps(res, ensure_ascii=ascii) else: @@ -440,19 +456,20 @@ if __name__ == "__main__": \033[1m\033[32mex :\033[0m ‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m ‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m - ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""") + ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -n -A -C\033[0m""") parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en") parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en") parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None) parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None) parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true") parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true") + parser.add_argument("-n", "--no_id", help="json sans id", action="store_true") args = parser.parse_args() if args.mot != None: w = Wikstraktor.get_instance(args.wiki_language, args.language) resp = None if w.fetch(args.mot) > 0: - resp = w.export(args.force_ascii, args.compact) + resp = w.export(not args.no_id, args.force_ascii, args.compact) if args.destination_file != None: f = open(args.destination_file, "w") f.write(resp)