diff --git a/wikstraktor.py b/wikstraktor.py
index 0444891dbb909bd56879b8f15669e0070cc78c09..095f285fe04f349d3294920aa80ff1aa23afbcd3 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -5,6 +5,7 @@ import importlib
 import json
 from wikstraktor_version import version as the_version
 from wikstraklog import Wikstraklog
+import re
 
 def get_list_string_level(wikitext):
     list_chars = {"*", "#", ":"}
@@ -35,10 +36,28 @@ class SubInfo:
             self.__class__.inc_n_id()
         return self.id
 
+    def replace_src_in_id(self, former_src, new_src):
+        """Swap source index former_src for new_src in self.id; return the id, or None if the id or an argument is None."""
+        #Warning: if ids are cleaned up so that sources appear everywhere, this will need to change
+        res = None
+        if self.id != None and former_src != None and new_src != None :
+            self.id = re.sub(r'^([\w\.]+)-' + re.escape(str(former_src)) + r'(?!\d)', r"\1-" + str(new_src), self.id)
+            res = self.id
+        return res
+
+    def get_src_from_id(self):
+        """Return the source index (int) found in self.id, or None if there is no id or it does not match."""
+        res = None
+        if self.id != None:
+            gp = re.match(r'^[\w\.]+-(\d{1,2})', self.id)
+            if gp:
+                res = int(gp.group(1))
+        return res
+
     def serializable(self, prefix = None):
         res = {}
-        if self.set_id(prefix) != None:
-            res["id"] = self.id
+        if prefix != None:
+            res["id"] = self.set_id(prefix)
         return res
 
 
@@ -83,10 +102,10 @@ class Pronunciation(SubInfo):
         self.sounds.append(Sound(url,accent))
 
     def serializable(self, prefix = None):
+        res = super().serializable(prefix)
         snds = []
         for s in self.sounds:
             snds.append(s.serializable())
-        res = super().serializable(prefix)
         res['transcript'] = self.ipa
         if self.has_accents():
             res['accents'] = list(self.accents)
@@ -166,12 +185,11 @@ class Example(SubInfo):
         return res
 
 class Sense(SubInfo):
-    prfx = ""
+    prfx = "s"
 
     def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None):
+        super().__init__(prefix)
         self.lang = lang
-        self.label = None
-        self.set_id(prefix)
         #On réinitialise les identifiants des sous-éléments
         if not isinstance(self, SubSense):
             Definition.reset()
@@ -217,14 +235,8 @@ class Sense(SubInfo):
     def metadata_exists(self, key):
         return key in self.metadata.keys()
 
-    def set_id(self, prefix=None):
-        if prefix != None and self.label == None:
-            self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens
-            self.__class__.inc_n_id()
-        return self.label
-
     def get_id(self):
-        return f"{self.lang}.{self.label}"
+        return self.id
 
     def set_domain(self, d):
         self.domain = d
@@ -235,14 +247,14 @@ class Sense(SubInfo):
         else:
             theDef = Definition(lang, definition)
         if theDef != None and theDef not in self.definitions:
-            theDef.set_id(self.set_id())
+            theDef.set_id(self.get_id())
             self.definitions.append(theDef)
 
     def add_example(self, transcript, src=None, url=None, prefix=None):
         try:
             theEx = Example(transcript, src, url, prefix)
             if theEx != None and theEx not in self.examples:
-                theEx.set_id(self.set_id())
+                theEx.set_id(self.get_id())
                 self.examples.append(theEx)
         except ValueError as e:
             print(f"Skipped empty example")
@@ -250,17 +262,17 @@ class Sense(SubInfo):
     def add_translation(self, lang=None, translation=None):
         theTranslation = Translation(lang, translation)
         if theTranslation != None and theTranslation not in self.translations:
-            theTranslation.set_id(self.set_id())
+            theTranslation.set_id(self.get_id())
             self.translations.append(theTranslation)
 
     def add_subsense(self, subsense):
-        if self.label!=None:
-            subsense.set_id(self.set_id())
+        if self.id!=None:
+            subsense.set_id(self.get_id())
         if subsense not in self.subsenses:
             self.subsenses.append(subsense)
 
     def __eq__(self, other):
-        res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions
+        res = isinstance(other, self.__class__) and self.id == other.id and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions
         i = 0
         while res and i < len(self.examples):
             res = self.examples[i] in other.examples
@@ -289,8 +301,10 @@ class Sense(SubInfo):
             i+=1
         return res
 
-    def serializable(self, prefix = None):
+    def serializable(self, id = True):
         res = {}
+        #prefix for the ids of sub-elements; None when ids are disabled
+        prefix = self.get_id() if id else None
         if self.domain != None:
             res["Domain"] = self.domain
         if len(self.regions) > 0:
@@ -312,7 +326,7 @@ class Sense(SubInfo):
         if len(self.subsenses) > 0:
             res["Subsenses"] = {}
             for t in self.subsenses:
-                res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
+                res["Subsenses"][t.set_id(self.id)]= t.serializable(prefix)
         return res
 
     def __str__(self):
@@ -320,10 +334,10 @@ class Sense(SubInfo):
 
 class SubSense(Sense):
     def set_id(self, prefix=None):
-        if prefix != None and self.label == None:
-            self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens
+        if prefix != None and self.id == None:
+            self.id = f"{prefix}.{self.__class__.next_id}" #the identifier of the sense
             self.__class__.inc_n_id()
-        return self.label
+        return self.id
 
 class Entry:
     #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id)
@@ -332,8 +346,7 @@ class Entry:
         self.lang = lang
         #Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile
         self.sources = []
-        self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version})
-        self.current_source = 0
+        self.add_source(wiki_lang, version_id, wkskt_version)
         self.pronunciations = []
         self.pos = None
         self.senses = []
@@ -343,13 +356,28 @@ class Entry:
     def set_pos(self, pos):
         self.pos = pos
 
-    def get_id(self, source_id=0):
-        #TODO: remplacer un jour le source id par la bonne source
+    def add_source(self, wiki_lang, version_id, wkskt_version):
+        """Append a source description to self.sources and make it the current source."""
+        self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version})
+        self.current_source = len(self.sources)-1
+
+    def set_current_source(self, src):
+        """Set the index of the current source (the default source of get_prefix)."""
+        self.current_source = src
+
+    def get_prefix(self, source_id=-1):
+        """Return the id prefix "lang.lemma[.pos]-source_id"; source_id -1 means the current source."""
         if self.pos != None:
-            pos = self.pos
+            pos = "."+self.pos
         else:
             pos = ""
-        return f"{self.lang}-{source_id}.{self.lemma}{pos}"
+        if source_id == -1:
+            source_id = self.current_source
+        return f"{self.lang}.{self.lemma}{pos}-{source_id}"
+
+    def get_id(self):
+        """Return the entry identifier "lang.lemma.pos" (it does not depend on the source)."""
+        return f"{self.lang}.{self.lemma}.{self.pos}"
 
     def set_pronunciations(self, pron):
         if isinstance(pron, Pronunciation):
@@ -365,7 +393,7 @@ class Entry:
 
     def add_pronunciation(self, p):
         if p not in self.pronunciations:
-            p.set_id(self.get_id())
+            p.set_id(self.get_prefix())
             self.pronunciations.append(p)
 
     def set_senses(self, senses):
@@ -377,15 +405,64 @@ class Entry:
 
     def add_sense(self, s):
         if s not in self.senses:
-            s.set_id(self.get_id())
+            s.set_id(self.get_prefix())
             self.senses.append(s)
 
     def is_valid(self):
         return self.lemma != None and self.pos != None and len(self.senses) > 0
         # and len(self.pronunciations) > 0 ↠must work without pronounciations
 
+    def same(self, other):
+        """Tell whether other is an Entry with the same lemma, language and part of speech."""
+        return isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos == other.pos
+
+    def merge(self, other):
+        """Add other's sources, pronunciations and senses to this entry, renumbering the source index in their ids."""
+        if self.same(other) and self != other:
+            i = 0
+            src_map = []
+            max_id = len(self.sources) - 1
+            while i < len(other.sources):
+                if other.sources[i] in self.sources:
+                    src_map.append(self.sources.index(other.sources[i])) #should append at rank i
+                else :
+                    self.add_source(other.sources[i]["wiktionary_language"],other.sources[i]["permanentId"],other.sources[i]["wikstraktor_version"])
+                    src_map.append(self.current_source) #should append at rank i
+                i += 1
+            for p in other.pronunciations:
+                src = p.get_src_from_id()
+                if src != None and src <= max_id and src < len(src_map) and src_map[src] != src:
+                    #max_id: the same object can be referenced from several
+                    #places and may already have been renumbered
+                    p.replace_src_in_id(src, src_map[src])
+                self.add_pronunciation(p)
+            for s in other.senses:
+                src = s.get_src_from_id()
+                if src != None and src < len(src_map) and src_map[src] != src:
+                    s.replace_src_in_id(src, src_map[src])
+                    for ss in s.subsenses:
+                        ss.replace_src_in_id(src, src_map[src])
+                        for d in ss.definitions:
+                            d.replace_src_in_id(src, src_map[src])
+                        for e in ss.examples:
+                            e.replace_src_in_id(src, src_map[src])
+                        for t in ss.translations:
+                            t.replace_src_in_id(src, src_map[src])
+                    for d in s.definitions:
+                        d.replace_src_in_id(src, src_map[src])
+                    for e in s.examples:
+                        e.replace_src_in_id(src, src_map[src])
+                    for t in s.translations:
+                        t.replace_src_in_id(src, src_map[src])
+                self.add_sense(s)
+        elif not isinstance(other, self.__class__):
+            raise TypeError(f"Entry.merge() error : {other.__class__} object cannot be merged with Entry")
+        elif not self.same(other):
+            raise ValueError(f"Entry.merge() error : {self.get_id()} cannot be merged with {other.get_id()}")
+        #else: both entries are equal, there is nothing to merge
+
     def __eq__(self, other):
-        res = isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses)
+        res = self.same(other) and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses)
         i = 0
         while res and i < len(self.senses):
             res = self.senses[i] == other.senses[i]
@@ -400,14 +477,14 @@ class Entry:
         res = {}
         res['sources'] = self.sources
         if id:
-            id = self.get_id()
-            res['id'] = id
+            prefix = self.get_prefix()
+            res['id'] = self.get_id()
         else:
-            id == None
+            prefix = None
         res[self.lemma] = {"pos":self.pos}
         res[self.lemma]["pronunciations"] = []
         for p in self.pronunciations:
-            res[self.lemma]["pronunciations"].append(p.serializable(id))
+            res[self.lemma]["pronunciations"].append(p.serializable(prefix))
         res[self.lemma]["senses"] = {}
         for s in self.senses:
             res[self.lemma]["senses"][s.get_id()]=s.serializable(id)
@@ -522,10 +599,10 @@ class ParserContext:
 
 class Wikstraktor:
     @classmethod
-    def get_instance(cls, wiki_language, entry_language, existing_entries=None):
+    def get_instance(cls, wiki_language, entry_language):
         try:
             m_name = f"{wiki_language}_{entry_language}".capitalize()
-            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")(existing_entries)
+            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
             instance.version = the_version
             instance.log = Wikstraklog(the_version, entry_language, wiki_language)
         except ModuleNotFoundError:
@@ -533,11 +610,8 @@ class Wikstraktor:
             instance = None
         return instance
 
-    def __init__(self, existing_entries=None):
-        if existing_entries = None:
-            self.entries = []
-        else:
-            self.entries = existing_entries
+    def __init__(self):
+        self.entries = {}
         self.pwb = pywikibot
         self.wtp = wikitextparser
         self.parserContext = None
@@ -551,6 +625,14 @@ class Wikstraktor:
             print(f"{file_page_name} does not exist in {self.site}.")
         return res
 
+    def add_entry(self, e):
+        """Store e under its id, merging it into the stored entry when that id is already present."""
+        if e.get_id() in self.entries.keys():
+            if e != self.entries[e.get_id()]:
+                self.entries[e.get_id()].merge(e)
+        else:
+            self.entries[e.get_id()] = e
+
     #retrieves the content of a page and processes it (adding the entries to the list of entries)
     #returns the number of entries added
     def fetch(self, graphy):
@@ -603,7 +685,7 @@ class Wikstraktor:
         res = len(self.parserContext.entries)
         if res > 0:
             for e in self.parserContext.entries:
-                self.entries.append(e)
+                self.add_entry(e)
         return res
 
     def isPro(self, title):
@@ -722,12 +804,24 @@ class Wikstraktor:
                 print("Skipped empty definition")
         return senses
 
+    def __add__(self, other):
+        """Merge the entries of another Wikstraktor into this one (in place) and return self."""
+        if isinstance(other, Wikstraktor):
+            for k,e in other.entries.items():
+                if k in self.entries.keys():
+                    self.entries[k].merge(e)
+                else:
+                    self.entries[k] = e
+        else:
+            raise TypeError(f"Wikstraktor '+' : {other.__class__} object cannot be added to {self.__class__}")
+        return self
+
     def __str__(self):
         return self.export()
 
     def serialize(self, id=True):
         res = []
-        for e in self.entries:
+        for e in self.entries.values():
             res.append(e.serializable(id))
         return res
 
@@ -737,12 +831,6 @@ class Wikstraktor:
         else:
             return json.dumps(self.serialize(id), ensure_ascii=ascii, indent=4)
 
-def export_multi_wikt(serialized, ascii=False, compact=False):
-    if compact:
-        return json.dumps(serialized, ensure_ascii=ascii)
-    else:
-        return json.dumps(serialized, ensure_ascii=ascii, indent=4)
-
 if __name__ == "__main__":
     import argparse
     from argparse import RawTextHelpFormatter #pour le formattage de l'aide
@@ -762,14 +850,17 @@ if __name__ == "__main__":
     wiki_languages = args.wiki_language.split("+")
     languages = args.language.split("+")
     if args.mot != None:
-        resp = []
+        resp = None
         for w_l in wiki_languages:
             for l in languages :
                 w = Wikstraktor.get_instance(w_l, l)
                 if w.fetch(args.mot) > 0:
-                    resp += w.serialize(not args.no_id)
-        if len(resp) > 0 :
-            resp = export_multi_wikt(resp, args.force_ascii, args.compact)
+                    if resp == None:
+                        resp = w
+                    else:
+                        resp += w
+        if resp != None :
+            resp = resp.export(not args.no_id, args.force_ascii, args.compact)
             if args.destination_file != None:
                 f = open(args.destination_file, "w")
                 f.write(resp)