From 73c27df30a39e9a9962e6ef5e1f1fc9c20abd353 Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Mon, 3 Oct 2022 21:12:09 +0200 Subject: [PATCH] Correction bug doublon --- parsers/en_en.py | 7 +---- pronunciation.py | 11 ++++++++ wikstraktor.py | 68 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/parsers/en_en.py b/parsers/en_en.py index 785b470..2f86b7b 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -27,19 +27,15 @@ class En_en_straktor(Wikstraktor): templates.append(t) a = None for t in templates: - print(t.normal_name()) if t.normal_name() == self.constants['t_acc']: a = t.arguments[0].value elif t.normal_name() == self.constants['t_ipa']: p.set_transcription(t.arguments[1].value) p.set_accent(a) - print(t, t.arguments, t.arguments[0].value) elif t.normal_name() == self.constants['t_snd']: p.add_sound(self.get_file_url(t.arguments[1].value), a) - print(t, t.arguments, t.arguments[1].value) pronunciations.append(p) i += 1 - print(pronunciations[0], pronunciations[1]) return pronunciations def process_etymology(self, etyContent): @@ -48,8 +44,7 @@ class En_en_straktor(Wikstraktor): return "Etymology" + str(debugEty) def process_senses(self, sensesContent): - import random as r - return "Cool"+r.choice(['a', 'b', 'c', 'd', 'e', 'f', 'g']) + return ["Cool."+sensesContent.plain_text()[3:15]] if __name__ == "__main__": ensk = En_en_straktor() diff --git a/pronunciation.py b/pronunciation.py index 6cc9b67..9525292 100644 --- a/pronunciation.py +++ b/pronunciation.py @@ -4,6 +4,9 @@ class Sound: self.url = url self.accent = accent + def __eq__(self, other): + return self.url == other.url and self.accent == other.accent + def serializable(self): if self.accent == None: res = {"url":self.url} @@ -38,3 +41,11 @@ class Pronunciation: def __str__(self): return f"{self.serializable()}" + + def __eq__(self, other): + res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) + i = 0 + while res and i<len(self.sounds): + res = self.sounds[i] == other.sounds[i] + i += 1 + return res diff --git a/wikstraktor.py b/wikstraktor.py index 6303389..5be3681 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -8,26 +8,56 @@ from pronunciation import Pronunciation class Entry: def __init__(self, lemma): self.lemma = lemma + self.pronunciations = [] + self.pos = None + self.senses = [] def set_pronunciations(self, pron): if isinstance(pron, Pronunciation): - self.pronunciations = pron + self.pronunciations.append(pron) + elif type(pron) == list: + for p in pron: + if isinstance(p, Pronunciation): + self.pronunciations.append(p) + else: + raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).") else: raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") - def set_POS(self, pos): + def set_pos(self, pos): self.pos = pos + def set_senses(self, senses): + self.senses = senses + + def is_valid(self): + return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 + + def __eq__(self, other): + res = self.lemma == other.lemma and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) + i = 0 + while res and i < len(self.senses): + res = self.senses[i] == other.senses[i] + i += 1 + i = 0 + while res and i < len(self.pronunciations): + res = self.pronunciations[i] == other.pronunciations[i] + i += 1 + return res + def __str__(self): res = f"{self.lemma} ({self.pos})\n" for p in self.pronunciations: res += f"{str(p)}\n" + for s in self.senses: + res += f"{str(s)}\n" return res class ParserContext: def __init__(self, entry): self.lemma = entry self.context = [] + self.entries = [] def get_level(self): if len(self.context) == 0: @@ -39,7 +69,9 @@ class ParserContext: def push(self, wiki_context): self.context.append({"wiki":wiki_context}) - def pop(self): + def pop(self, testNewEntry = True): + if testNewEntry: + self.create_entry() return self.context.pop() def set_top_wiki(self, wiki_context): @@ -48,22 +80,30 @@ class ParserContext: else: self.context[-1]['wiki'] = wiki_context - def set_top_entry_info(self, key, entry_context): + def set_top_entry_info(self, key, entry_context, testNewEntry=True): if len(self.context) == 0: raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.") else: self.context[-1][key] = entry_context + if testNewEntry: + self.create_entry() def create_entry(self): res = Entry(self.lemma) for l in self.context: - if l['pro'] != None: - res.set_pronunciations(l['entry_info']) - if l['ety'] != None: + if "pro" in l.keys(): + res.set_pronunciations(l['pro']) + if "ety" in l.keys(): pass #On ignore l'étymologie pour le moment - if l['POS'] != None: + if "POS" in l.keys(): res.set_pos(l['POS']) + if "senses" in l.keys(): + res.set_senses(l['senses']) # TODO: Ajouter les autres types + if res.is_valid() and res not in self.entries: + self.entries.append(res) + else: + res = None return res def debug_top(self): @@ -142,15 +182,18 @@ class Wikstraktor: elif self.constants['ety'] in s.title: self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents))) elif s.title in self.constants['POS'].keys(): - self.parserContext.set_top_entry_info('POS', self.constants['POS'][s.title]) + self.parserContext.set_top_entry_info('POS', self.constants['POS'][s.title], False) self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) - print(self.parserContext.debug_top()) - print("ok") + res = len(self.parserContext.entries) + if res > 0: + for e in self.parserContext.entries: + self.entries.append(e) + return res def __str__(self): res = "" for e in self.entries: - res += f"{e}\n" + res += f"{str(e)}\n" return res @@ -159,6 +202,7 @@ if __name__ == "__main__": # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) print(e.fetch("test"), "entries added") + print(e) # site = pywikibot.Site(f'wiktionary:en') # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav") # print(p) -- GitLab