From 5bf2f6f7161bfd64b2de8d86c9020833678f4068 Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Thu, 16 Mar 2023 01:00:04 +0100 Subject: [PATCH] v1 avec identifiants --- parsers/en_en.py | 4 +--- parsers/fr_en.py | 1 - wikstraktor.py | 31 ++++++++++++++++--------------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/parsers/en_en.py b/parsers/en_en.py index 1e1a5de..8761a51 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -19,7 +19,7 @@ class En_en_straktor(Wikstraktor): i = 0 pronunciations = [] while i < len(l.fullitems): - p = Pronunciation(len(pronunciations)) + p = Pronunciation() templates = self.wtp.parse(l.fullitems[i]).templates a = None for j, t in enumerate(templates): @@ -49,7 +49,6 @@ class En_en_straktor(Wikstraktor): return pos def process_senses(self, sensesContent): - print("process_senses") l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) i = 0 senses = [] @@ -95,7 +94,6 @@ class En_en_straktor(Wikstraktor): if cnt > 0: i -= 1 i += 1 - print(f"process_senses done {senses[-1].serializable()}") return senses if __name__ == "__main__": diff --git a/parsers/fr_en.py b/parsers/fr_en.py index a44ff5d..49f9896 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -53,7 +53,6 @@ class Fr_en_straktor(Wikstraktor): keys = list(self.constants['POS'].keys()) pos = keys[ik] ik += 1 - print(pos) return pos def process_senses(self, sensesContent): diff --git a/wikstraktor.py b/wikstraktor.py index 23a81f4..0a38474 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -83,7 +83,7 @@ class Pronunciation(SubInfo): return res def __str__(self): - return f"{self.serializable('')}" + return json.dumps(self.serializable('')) def __eq__(self, other): res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) @@ -167,12 +167,15 @@ class Sense(SubInfo): self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens - def set_id(self, prefix): + def set_id(self, prefix=None): if prefix != None and self.label == None: - self.label = f"{self.lang}.{prefix}_{self.__class__.next_id}" #l'identifiant du sens + self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens self.__class__.inc_n_id() return self.label + def get_id(self): + return f"{self.lang}.{self.label}" + def set_domain(self, d): self.domain = d @@ -242,8 +245,11 @@ class Sense(SubInfo): res["Translations"].append(t.serializable(prefix)) return res + def __str__(self): + return json.dumps(self.serializable()) + class SubSense(Sense): - def set_id(self, prefix): + def set_id(self, prefix=None): if prefix != None and self.label == None: self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens self.__class__.inc_n_id() @@ -268,8 +274,9 @@ class Entry: self.pos = pos def get_id(self, source_id=0): - if self.pos !=None: - pos = "_"+self.pos + #TODO: remplacer un jour le source id par la bonne source + if self.pos != None: + pos = self.pos else: pos = "" return f"{self.lang}-{source_id}.{self.lemma}{pos}" @@ -292,7 +299,7 @@ class Entry: self.pronunciations.append(p) def set_senses(self, senses): - for s in self.senses: + for s in senses: if isinstance(s, Sense): self.add_sense(s) else: @@ -301,7 +308,7 @@ class Entry: def add_sense(self, s): if s not in self.senses: s.set_id(self.get_id()) - self.senses.append(s)##ICITE + self.senses.append(s) def is_valid(self): return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 @@ -333,7 +340,6 @@ class Entry: res[self.lemma]["senses"] = {} for s in self.senses: res[self.lemma]["senses"][s.get_id()]=s.serializable(id) - print(f"Entry:{res}")## return res def __str__(self): @@ -387,7 +393,6 @@ class ParserContext: #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS res = Entry(self.lemma, self.lang, self.wiki_lang, self.version_id) for l in self.context: - #print(l.keys()) if "pro" in l.keys(): res.set_pronunciations(l['pro']) if "ety" in l.keys(): @@ -397,7 +402,6 @@ class ParserContext: if "senses" in l.keys(): res.set_senses(l['senses']) # TODO: Ajouter les autres types - print(res) if res.is_valid() and res not in self.entries: self.entries.append(res) else: @@ -491,6 +495,7 @@ class Wikstraktor: self.parserContext.set_top_entry_info('POS', pos, False) self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob # self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus. + res = len(self.parserContext.entries) if res > 0: for e in self.parserContext.entries: @@ -502,7 +507,6 @@ class Wikstraktor: res = title == self.constants['pro'] else: res = title in self.constants['pro'] - #print(title, res) return res def isEty(self, title): @@ -531,7 +535,6 @@ class Wikstraktor: res = [] for e in self.entries: res.append(e.serializable(id)) - print(f"Export{res}")## if compact: return json.dumps(res, ensure_ascii=ascii) else: @@ -558,8 +561,6 @@ if __name__ == "__main__": resp = None if w.fetch(args.mot) > 0: resp = w.export(not args.no_id, args.force_ascii, args.compact) - print(w)## - print(resp)## if args.destination_file != None: f = open(args.destination_file, "w") f.write(resp) -- GitLab