From 2827aaf28c8886e20d78f7ce41573e5383d58764 Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Thu, 16 Mar 2023 09:50:57 +0100 Subject: [PATCH] Create Sense with definition --- .gitignore | 4 ++-- parsers/en_en.py | 9 ++++---- parsers/fr_en.py | 6 ++---- wikstraktor.py | 53 ++++++++++++++++++++++++++++++++---------------- 4 files changed, 44 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index a240499..69e9892 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ user-config.py user-password.py -pywikibot.lwp __pycache__ apicache-py3 logs @@ -8,5 +7,6 @@ throttle.ctrl user_list.py KNM.csv .~lock* -test.json +*.json +*.lwp wikstraktorenv diff --git a/parsers/en_en.py b/parsers/en_en.py index 8761a51..cf93078 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -54,8 +54,10 @@ class En_en_straktor(Wikstraktor): senses = [] while i < len(l): if l[i].pattern == '\\# ': - newSense = Sense(lang=self.entry_language) - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + theDef = self.wtp.parse(l[i].items[0]).plain_text().strip() + if theDef != "": + print(theDef)# DEBUG: + newSense = Sense(self.entry_language, theDef, self.wiki_language) #newSence.add_translation() elif l[i].pattern == '\\#:': for j in l[i].items: @@ -74,8 +76,7 @@ class En_en_straktor(Wikstraktor): while i < len(l) and l[i].level == 3 : cnt +=1 if l[i].pattern == '\\## ': - newSense2 = SubSense(lang=self.entry_language) - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language) #newSense2.add_translation() elif l[i].pattern == '\\##:': for j in l[i].items: diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 49f9896..fcaa931 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -61,8 +61,7 @@ class Fr_en_straktor(Wikstraktor): senses = [] while i < 
len(l): if l[i].pattern == '\\# ': - newSense = Sense(lang=self.entry_language) - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + newSense = Sense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language) #la version d'enzo ajoute +ieurs defs (for i in l[i].items) elif l[i].pattern == '\\#:': for j in l[i].items: @@ -81,8 +80,7 @@ class Fr_en_straktor(Wikstraktor): while i < len(l) and l[i].level == 3 : cnt +=1 if l[i].pattern == '\\## ': - newSense2 = SubSense(lang=self.entry_language) - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) + newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language) elif l[i].pattern == '\\##:': for j in l[i].items: k = 0 diff --git a/wikstraktor.py b/wikstraktor.py index 0a38474..c1e98fb 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -4,6 +4,8 @@ import wikitextparser import importlib import json +#ICITE : fr marche pas, en prend des trucs vides à virer (cf. 
yellow… def & example) + class SubInfo: next_id = 1 @@ -44,7 +46,7 @@ class Sound: self.accent = accent def __eq__(self, other): - return self.url == other.url and self.accent == other.accent + return isinstance(other, self.__class__) and self.url == other.url and self.accent == other.accent def serializable(self): if self.accent == None: @@ -86,7 +88,7 @@ class Pronunciation(SubInfo): return json.dumps(self.serializable('')) def __eq__(self, other): - res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) + res = isinstance(other, self.__class__) and self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) i = 0 while res and i<len(self.sounds): res = self.sounds[i] == other.sounds[i] @@ -110,11 +112,14 @@ class Definition(SubInfo): def __init__(self, lang, text, prefix=None): super().__init__(prefix) - self.lang = lang - self.text = text + if text != "": + self.lang = lang + self.text = text + else: + raise ValueError(f"Definition.__init__: “{text}” empty definition.") def __eq__(self, other): - return self.lang == other.lang and self.text == other.text + return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text def serializable(self, prefix = None): res = super().serializable(prefix) @@ -131,12 +136,16 @@ class Example(SubInfo): def __init__(self, transcript, source=None, url=None, prefix=None): super().__init__(prefix) - self.text = transcript - self.source = source - self.url = url + if transcript != "": + self.text = transcript + self.source = source + self.url = url + else: + raise ValueError(f"Example.__init__: “{transcript}” empty example.") + def __eq__(self, other): - return self.text==other.text and self.source==other.source and self.url==other.url + return isinstance(other, self.__class__) and self.text==other.text and self.source==other.source and self.url==other.url def serializable(self, prefix = None): res = 
super().serializable(prefix) @@ -150,7 +159,7 @@ class Example(SubInfo): class Sense(SubInfo): prfx = "" - def __init__(self, prefix=None, lang=None): + def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None): self.lang = lang self.label = None self.set_id(prefix) @@ -166,6 +175,11 @@ class Sense(SubInfo): self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens + if definition != None: + try: + self.add_def(wiki_lang, definition) + except ValueError as err: + raise ValueError(f"Sense.__init__() with empty definition\n{err}") def set_id(self, prefix=None): if prefix != None and self.label == None: @@ -181,19 +195,22 @@ class Sense(SubInfo): def add_def(self, lang, definition): theDef = Definition(lang, definition) - if theDef not in self.definitions: + if theDef != None and theDef not in self.definitions: theDef.set_id(self.set_id()) self.definitions.append(theDef) def add_example(self, transcript, src=None, url=None, prefix=None): - theEx = Example(transcript, src, url, prefix) - if theEx not in self.examples: - theEx.set_id(self.set_id()) - self.examples.append(theEx) + try: + theEx = Example(transcript, src, url, prefix) + if theEx != None and theEx not in self.examples: + theEx.set_id(self.set_id()) + self.examples.append(theEx) + except ValueError as e: + print(f"Skipped empty example") def add_translation(self, lang=None, translation=None): theTranslation = Translation(lang, translation) - if theTranslation not in self.translations: + if theTranslation != None and theTranslation not in self.translations: theTranslation.set_id(self.set_id()) self.translations.append(theTranslation) @@ -204,7 +221,7 @@ class Sense(SubInfo): self.subsenses.append(subsense) def __eq__(self, other): - res = self.label == other.label and len(self.definitions) == len(other.definitions) and 
len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain + res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain i = 0 while res and i < len(self.examples): res = self.examples[i] in other.examples @@ -314,7 +331,7 @@ class Entry: return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 def __eq__(self, other): - res = self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) + res = isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) i = 0 while res and i < len(self.senses): res = self.senses[i] == other.senses[i] -- GitLab