diff --git a/parsers/Structure_json.json b/parsers/Structure_json.json index c38503c4148023192a7ae08469ed5bf33adee85b..bb0cd9b8062f6ed235cf06ffe1295b7c73a14031 100644 --- a/parsers/Structure_json.json +++ b/parsers/Structure_json.json @@ -15,8 +15,8 @@ "url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav" } ], - "Senses":[ - { + "Senses":{ + "v1":{ "Translations":[ "translation1", "...", @@ -26,16 +26,16 @@ "Stilles Mineralwasser.jpg", "..." ], - "Definition":"blabla", + "Definition":{"lang":"fr", "definition" : "blabla"}, "Examples":[ "blabla", "blabli", "blablou" ], - "subSense":[ + "SubSenses":[ { - "subdef":"blabla", - "subex":[ + "Definition":{"lang":"en", "definition" : "whatnot"}, + "Examples":[ "subexa", "subexb", "subexz" @@ -43,7 +43,7 @@ } ] } - ] + } } ] } @@ -61,4 +61,3 @@ \"Supplementary field for devs 5\" ... \"Supplementary field for devs 10\ */ - diff --git a/parsers/en_en.py b/parsers/en_en.py index b31790825d55566011c8c7ed01217b2b85123b81..a76ebe64f68d0153e1789360c64a264d4fba424c 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -41,7 +41,7 @@ class En_en_straktor(Wikstraktor): global debugEty debugEty += 1 return "Etymology" + str(debugEty) - + def process_POS(self,parsedwikitext): pos = None if parsedwikitext in self.constants['POS'].keys(): @@ -92,7 +92,7 @@ class En_en_straktor(Wikstraktor): if isEx == 0: newSense2.add_example(self.wtp.parse(j).plain_text().strip()) if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense2) + newSense.add_subsense(newSense2) i += 1 if cnt > 0: i -= 1 diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 175f22b6c6cc585ab887e1b3964cfb866d8a12b4..1ca6b67db5fa722517371659ad7fc822c906716e 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -43,7 +43,7 @@ class Fr_en_straktor(Wikstraktor): global debugEty debugEty += 1 return "Etymology" + str(debugEty) - + def process_POS(self,parsedwikitext): pos = None ik = 0 @@ -100,7 +100,7 @@ class Fr_en_straktor(Wikstraktor): if isEx == 0: newSense2.add_example(self.wtp.parse(j).plain_text().strip()) if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense2) + newSense.add_subsense(newSense2) i += 1 if cnt > 0: i -= 1 @@ -110,5 +110,3 @@ class Fr_en_straktor(Wikstraktor): if __name__ == "__main__": ensk = Fr_en_straktor() print(ensk.fetch("test"), "entries added") - - diff --git a/wikstraktor.py b/wikstraktor.py index b9780d8e58b3fb8ac2eea8b5a244c75a626375fd..7f5620574d94f8d36abbc3c3510f478483a06fe6 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -104,7 +104,8 @@ class Example: class Sense: def __init__(self, label): self.label = label #l'identifiant du sens - self.definitions = [] #liste des définitions (elles auront une langue et un texte) + self.definition = None #liste des définitions (elles auront une langue et un texte) + self.subsenses = [] #liste des sous-définitions (récursif…) self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens @@ -114,8 +115,10 @@ class Sense: def add_def(self, lang, definition): theDef = Definition(lang, definition) - if theDef not in self.definitions: - self.definitions.append(theDef) + if self.definition == None: + self.definition = theDef + elif self.definition != theDef: + raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}") def add_example(self, transcript, src=None, url=None): theEx = Example(transcript, src, url) @@ -127,8 +130,12 @@ class Sense: if theTranslation not in self.translations: self.translations.append(theTranslation) + def add_subsense(self, subsense): + if subsense not in self.subsenses: + self.subsenses.append(subsense) + def __eq__(self, other): - res = self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain + res = self.label == other.label and self.definition == other.definition and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain i = 0 while res and i < len(self.examples): res = self.examples[i] in other.examples @@ -138,25 +145,46 @@ class Sense: res = self.translations[i] in other.translations i+=1 i = 0 - while res and i < len(self.definitions): - res = self.definitions[i] in other.definitions + while res and i < len(self.subsenses): + res = self.subsenses[i] in other.subsenses i+=1 return res - def serializable(self): + def serializable(self, key = False): res = {} - res[self.label]={} - if self.domain != None: - res[self.label]["domain"] = self.domain - res[self.label]["defs"] = [] - for d in self.definitions: - res[self.label]["defs"].append(d.serializable()) - res[self.label]["exs"] = [] - for e in self.examples: - res[self.label]["exs"].append(e.serializable()) - res[self.label]["trad"] = [] - for t in self.translations: - res[self.label]["trad"].append(t.serializable()) + if key: + res[self.label]={} + if self.domain != None: + res[self.label]["Domain"] = self.domain + res[self.label]["Definition"] = self.definition.serializable() + if len(self.subsenses) > 0: + res[self.label]["Subsenses"] = [] + for t in self.subsenses: + res[self.label]["Subsenses"].append(t.serializable()) + if len(self.examples) > 0 : + res[self.label]["Examples"] = [] + for e in self.examples: + res[self.label]["Examples"].append(e.serializable()) + if len(self.translations) > 0: + res[self.label]["Translations"] = [] + for t in self.translations: + res[self.label]["Translations"].append(t.serializable()) + else: + if self.domain != None: + res["Domain"] = self.domain + res["Definition"] = self.definition.serializable() + if len(self.subsenses) > 0: + res["Subsenses"] = {} + for t in self.subsenses: + res["Subsenses"][t.label]= t.serializable(key) + if len(self.examples) > 0 : + res["Examples"] = [] + for e in self.examples: + res["Examples"].append(e.serializable()) + if len(self.translations) > 0: + res["Translations"] = [] + for t in self.translations: + res["Translations"].append(t.serializable()) return res @@ -206,9 +234,9 @@ class Entry: res[self.lemma]["pronunciations"] = [] for p in self.pronunciations: res[self.lemma]["pronunciations"].append(p.serializable()) - res[self.lemma]["senses"] = [] + res[self.lemma]["senses"] = {} for s in self.senses: - res[self.lemma]["senses"].append(s.serializable()) + res[self.lemma]["senses"][s.label]=s.serializable(False) return res def __str__(self):