#!/usr/bin/env python3
"""Wikstraktor: extract structured lexical entries (pronunciations, senses,
definitions, examples, translations) from Wiktionary pages.

Language-pair-specific parsing is delegated to subclasses living in the
``parsers`` package (see ``Wikstraktor.get_instance``).
"""
import pywikibot
import wikitextparser
import importlib
import json
from wikstraktor_version import version as the_version
from wikstraklog import Wikstraklog

# TODO: fr does not work; en picks up empty items that must be removed
# (cf. yellow… def & example)


class SubInfo:
    """Base class for identified sub-elements of an entry.

    Each concrete subclass carries its own ``next_id`` counter and ``prfx``
    prefix, used to build ids such as ``<prefix>_def1``.
    """
    # NOTE(review): reset() sets the counter back to 0, not 1, so ids start
    # at 0 after a reset but at 1 on a fresh class — kept as-is because
    # changing it would alter every emitted id; confirm intent with authors.
    next_id = 1
    prfx = "err"  # overridden by subclasses ("def", "ex", "trad", "prn", …)

    @classmethod
    def inc_n_id(cls):
        """Advance the per-class id counter."""
        cls.next_id += 1

    @classmethod
    def reset(cls):
        """Restart the per-class id counter (done for each new Sense/Entry)."""
        cls.next_id = 0

    def __init__(self, prefix=None):
        self.id = None
        self.set_id(prefix)

    def set_id(self, prefix):
        """Assign the id lazily, at most once.

        The id is only built when a non-None ``prefix`` is supplied and no id
        was assigned before.  Returns the current id (possibly still None).
        """
        if self.id is None and prefix is not None:
            self.id = f"{prefix}_{self.__class__.prfx}{self.__class__.next_id}"
            self.__class__.inc_n_id()
        return self.id

    def serializable(self, prefix=None):
        """Return a JSON-ready dict; contains "id" only when an id exists."""
        res = {}
        if self.set_id(prefix) is not None:
            res["id"] = self.id
        return res


#######
# Oral
#######
class Sound:
    """An audio file URL, optionally tagged with an accent label."""

    def __init__(self, url, accent):
        self.url = url
        self.accent = accent

    def __eq__(self, other):
        return isinstance(other, self.__class__) \
            and self.url == other.url and self.accent == other.accent

    def serializable(self):
        if self.accent is None:
            return {"url": self.url}
        return {"accent": self.accent, "url": self.url}


class Pronunciation(SubInfo):
    """An IPA transcription plus its associated sound files."""
    prfx = "prn"

    def __init__(self, prefix=None):
        super().__init__(prefix)
        self.ipa = None     # IPA transcription (str) or None
        self.sounds = []    # list of Sound
        self.accent = None  # accent label or None

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res['transcript'] = self.ipa
        if self.accent is not None:
            res['accent'] = self.accent
        res['sounds'] = [s.serializable() for s in self.sounds]
        return res

    def __str__(self):
        return json.dumps(self.serializable(''))

    def __eq__(self, other):
        # Same transcription, same accent, and same sounds in the same order.
        res = isinstance(other, self.__class__) and self.ipa == other.ipa \
            and self.accent == other.accent \
            and len(self.sounds) == len(other.sounds)
        i = 0
        while res and i < len(self.sounds):
            res = self.sounds[i] == other.sounds[i]
            i += 1
        return res


#######
# Metadata
## TODO:
# * POS: create a POS class with dependent features (e.g. masc in fr)
#######

#######
# Senses
# TODO: create a Translations class
#######
class Definition(SubInfo):
    """A definition text in a given language."""
    prfx = "def"
    key = "definition"  # JSON key used by serializable(); Translation overrides

    def __init__(self, lang, text, prefix=None):
        """Store language and text; raises ValueError on an empty text."""
        super().__init__(prefix)
        if text == "":
            raise ValueError(f"Definition.__init__: “{text}” empty definition.")
        self.lang = lang
        self.text = text

    def __eq__(self, other):
        return isinstance(other, self.__class__) \
            and self.lang == other.lang and self.text == other.text

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res["lang"] = self.lang
        res[self.__class__.key] = self.text
        return res


class Translation(Definition):
    """Same payload as Definition, serialized under the "translation" key."""
    prfx = "trad"
    key = "translation"


class Example(SubInfo):
    """A usage example with optional source and URL."""
    prfx = "ex"

    def __init__(self, transcript, source=None, url=None, prefix=None):
        """Store the example; raises ValueError on an empty transcript."""
        super().__init__(prefix)
        if transcript == "":
            raise ValueError(f"Example.__init__: “{transcript}” empty example.")
        self.text = transcript
        self.source = source
        self.url = url

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.text == other.text \
            and self.source == other.source and self.url == other.url

    def serializable(self, prefix=None):
        res = super().serializable(prefix)
        res["example"] = self.text
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res


class Sense(SubInfo):
    """A word sense: definitions, examples, translations and sub-senses."""
    prfx = ""

    def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None):
        self.lang = lang
        self.label = None
        self.set_id(prefix)
        # Reset the id counters of sub-elements for a fresh top-level sense.
        if not isinstance(self, SubSense):
            Definition.reset()
            Example.reset()
            Translation.reset()
            SubSense.reset()
        self.definitions = []   # Definition list (each has a language and a text)
        self.subsenses = []     # SubSense list (recursive)
        self.examples = []      # Example list (text mandatory, source/url optional)
        self.translations = []  # Translation list (other languages)
        self.domain = None      # usage domain of the word in this sense
        if definition is not None:
            try:
                self.add_def(wiki_lang, definition)
            except ValueError as err:
                raise ValueError(f"Sense.__init__() with empty definition\n{err}")

    def set_id(self, prefix=None):
        """Build the sense label once (``<prefix>_<n>``); returns the label."""
        if prefix is not None and self.label is None:
            self.label = f"{prefix}_{self.__class__.next_id}"
            self.__class__.inc_n_id()
        return self.label

    def get_id(self):
        return f"{self.lang}.{self.label}"

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        """Add a Definition, ignoring duplicates; raises ValueError if empty."""
        theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            theDef.set_id(self.set_id())
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None, prefix=None):
        """Add an Example, ignoring duplicates; empty examples are skipped."""
        try:
            theEx = Example(transcript, src, url, prefix)
            if theEx not in self.examples:
                theEx.set_id(self.set_id())
                self.examples.append(theEx)
        except ValueError:
            print("Skipped empty example")

    def add_translation(self, lang=None, translation=None):
        """Add a Translation, ignoring duplicates."""
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            theTranslation.set_id(self.set_id())
            self.translations.append(theTranslation)

    def add_subsense(self, subsense):
        if self.label is not None:
            subsense.set_id(self.set_id())
        if subsense not in self.subsenses:
            self.subsenses.append(subsense)

    def __eq__(self, other):
        # Same label, domain and component counts, and every component
        # present in the other sense (order-insensitive membership tests).
        res = isinstance(other, self.__class__) and self.label == other.label \
            and len(self.definitions) == len(other.definitions) \
            and len(self.examples) == len(other.examples) \
            and len(self.translations) == len(other.translations) \
            and self.domain == other.domain
        i = 0
        while res and i < len(self.examples):
            res = self.examples[i] in other.examples
            i += 1
        i = 0
        while res and i < len(self.translations):
            res = self.translations[i] in other.translations
            i += 1
        i = 0
        while res and i < len(self.definitions):
            res = self.definitions[i] in other.definitions
            i += 1
        i = 0
        while res and i < len(self.subsenses):
            res = self.subsenses[i] in other.subsenses
            i += 1
        return res

    def serializable(self, prefix=None):
        res = {}
        if self.domain is not None:
            res["Domain"] = self.domain
        if len(self.definitions) > 0:
            res["Definitions"] = [d.serializable(prefix) for d in self.definitions]
        if len(self.subsenses) > 0:
            res["Subsenses"] = {}
            for t in self.subsenses:
                res["Subsenses"][t.set_id(self.label)] = t.serializable(prefix)
        if len(self.examples) > 0:
            res["Examples"] = [e.serializable(prefix) for e in self.examples]
        if len(self.translations) > 0:
            res["Translations"] = [t.serializable(prefix) for t in self.translations]
        return res

    def __str__(self):
        return json.dumps(self.serializable())


class SubSense(Sense):
    """A nested sense; labels use a dot separator (``<parent>.<n>``)."""

    def set_id(self, prefix=None):
        if prefix is not None and self.label is None:
            self.label = f"{prefix}.{self.__class__.next_id}"
            self.__class__.inc_n_id()
        return self.label


class Entry:
    """A dictionary entry: one lemma + one POS, with pronunciations and senses.

    ``version_id`` is the unique id of the wiktionary page revision
    (pywikibot.Page.latest_revision_id).
    """

    def __init__(self, lemma, lang, wiki_lang, version_id, wkskt_version):
        self.lemma = lemma
        self.lang = lang
        # A list of sources will be useful if data from several wiktionaries
        # is ever mixed.
        self.sources = [{
            "wiktionary_language": wiki_lang,
            "permanentId": version_id,
            "wikstraktor_version": wkskt_version,
        }]
        self.current_source = 0
        self.pronunciations = []
        self.pos = None
        self.senses = []
        Sense.reset()

    def set_pos(self, pos):
        self.pos = pos

    def get_id(self, source_id=0):
        # TODO: some day, replace source_id with the right source
        pos = self.pos if self.pos is not None else ""
        return f"{self.lang}-{source_id}.{self.lemma}{pos}"

    def set_pronunciations(self, pron):
        """Accept a single Pronunciation or a list of them; raise otherwise."""
        if isinstance(pron, Pronunciation):
            self.add_pronunciation(pron)
        elif type(pron) == list:
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.add_pronunciation(p)
                else:
                    raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def add_pronunciation(self, p):
        if p not in self.pronunciations:
            p.set_id(self.get_id())
            self.pronunciations.append(p)

    def set_senses(self, senses):
        for s in senses:
            if isinstance(s, Sense):
                self.add_sense(s)
            else:
                # BUGFIX: the message previously referenced the undefined
                # name ``p`` (NameError when raised); it now reports ``s``.
                raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({s.__class__.__name__}).")

    def add_sense(self, s):
        if s not in self.senses:
            s.set_id(self.get_id())
            self.senses.append(s)

    def is_valid(self):
        """An entry is exportable when it has a lemma, a POS, at least one
        pronunciation and at least one sense."""
        return self.lemma is not None and len(self.pronunciations) > 0 \
            and self.pos is not None and len(self.senses) > 0

    def __eq__(self, other):
        res = isinstance(other, self.__class__) and self.lemma == other.lemma \
            and self.lang == other.lang and self.pos == other.pos \
            and len(self.pronunciations) == len(other.pronunciations) \
            and len(self.senses) == len(other.senses)
        i = 0
        while res and i < len(self.senses):
            res = self.senses[i] == other.senses[i]
            i += 1
        i = 0
        while res and i < len(self.pronunciations):
            res = self.pronunciations[i] == other.pronunciations[i]
            i += 1
        return res

    def serializable(self, id=True):
        """Serialize the entry; with ``id`` falsy, no ids are generated."""
        res = {}
        res['sources'] = self.sources
        if id:
            id = self.get_id()
            res['id'] = id
        else:
            # BUGFIX: was ``id == None`` (a no-op comparison).  A falsy
            # ``id`` must become None, otherwise SubInfo.set_id() treats it
            # as a valid prefix and builds ids like "False_def0".
            id = None
        res[self.lemma] = {"pos": self.pos}
        res[self.lemma]["pronunciations"] = [p.serializable(id) for p in self.pronunciations]
        res[self.lemma]["senses"] = {}
        for s in self.senses:
            res[self.lemma]["senses"][s.get_id()] = s.serializable(id)
        return res

    def __str__(self):
        res = f"{self.lemma}_{self.lang} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res


class ParserContext:
    """Stack of wiki section contexts used while parsing one page.

    Each stack item is a dict with a "wiki" key (the wikitextparser section)
    plus optional extracted info ("pro", "ety", or a POS key mapped to its
    senses).  Entries are assembled from the accumulated stack content.
    """

    def __init__(self, entry, lang, wiki_lang, wversion_id, version_id):
        self.lemma = entry
        self.lang = lang
        self.wiki_lang = wiki_lang
        self.page_version_id = wversion_id
        self.wikstraktor_version = version_id
        self.context = []
        self.entries = []

    def get_level(self):
        """Wiki heading level of the top context (-1 when the stack is empty)."""
        if len(self.context) == 0:
            return -1
        return self.context[-1]["wiki"].level

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        """Pop the top context, optionally building entries from it first."""
        if testNewEntry:
            self.create_entries()
        return self.context.pop()

    def flush(self):
        """Empty the stack, building entries from everything left on it."""
        while len(self.context) > 0:
            self.pop(True)

    def set_top_wiki(self, wiki_context):
        if len(self.context) == 0:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        if len(self.context) == 0:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        self.context[-1][key] = entry_context
        if testNewEntry:
            self.create_entries()

    def create_entries(self):
        """Build Entry objects from the accumulated context.

        The context dicts hold traits that describe everything (ety, pro)
        and distinct entities (POS: senses).  Returns the number of new
        valid entries added.
        """
        tmp = {}
        res = 0
        pro = None
        for layer in self.context:
            for k, v in layer.items():
                if k == "pro":
                    pro = v
                elif k == "ety" or k == "wiki":
                    pass  # etymology ignored for now; wiki context not needed
                else:
                    tmp[k] = v
        if pro is not None and len(tmp) > 0:
            for pos, senses in tmp.items():
                e = Entry(self.lemma, self.lang, self.wiki_lang,
                          self.page_version_id, self.wikstraktor_version)
                e.set_pronunciations(pro)
                e.set_pos(pos)
                e.set_senses(senses)
                # An improvement would be to remove that sense from the
                # context; instead we just test not to add doubles.
                if e.is_valid() and e not in self.entries:
                    res += 1
                    self.entries.append(e)
        return res

    def debug_top(self):
        """One-line human-readable summary of the top of the stack."""
        res = "Context: "
        if len(self.context) == 0:
            res += "0"
        else:
            info = ""
            for k, v in self.context[-1].items():
                if k != 'wiki':
                    if info != "":
                        info += "\n\t\t\t"
                    info += f"{k} → {str(v)}"
            res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
        return res

    def __str__(self):
        res = ""
        i = 0
        for c in self.context:
            res += f"====={i}======\n"
            for k, v in c.items():
                if k != "wiki":
                    res += f" {k}→{v}\n"
                else:
                    res += f" {k}→{len(v)}\n"
            i += 1
        return res + f"nb of entries: {len(self.entries)}"


class Wikstraktor:
    """Generic extractor; concrete language-pair parsers subclass this and
    implement the ``process_*`` hooks."""

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate ``parsers.<wiki>_<entry>.<Wiki_entry>_straktor`` for
        the given language pair; returns None when it cannot be loaded."""
        m_name = f"{wiki_language}_{entry_language}".capitalize()
        try:
            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"),
                               f"{m_name}_straktor")()
            instance.version = the_version
            instance.log = Wikstraklog(the_version, entry_language, wiki_language)
        except (ModuleNotFoundError, AttributeError):
            # BUGFIX: AttributeError (class missing from the module) is now
            # caught too, matching what the message below always claimed.
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            instance = None
        return instance

    def __init__(self):
        self.entries = []
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Resolve a wiki file page name to its file URL (None if absent)."""
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def fetch(self, graphy):
        """Retrieve the page for ``graphy`` and process it, adding the
        entries found to ``self.entries``; returns how many were added."""
        nb_entries_added = 0
        page = self.pwb.Page(self.site, graphy)
        if page.text != "":
            sections = self.wtp.parse(page.text).sections
            found = False
            i = 0
            # find the section for the entry language
            while i < len(sections) and not found:
                found = sections[i].title is not None \
                    and sections[i].title.capitalize() == self.constants[self.entry_language]
                if not found:
                    i += 1
            if found:
                nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)
        return nb_entries_added

    def parse(self, entry, v_id, sections):
        """Walk the wiki sections of one entry while maintaining a
        ParserContext stack; returns the number of entries found."""
        self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id, self.version)
        self.log.set_context(entry, v_id)
        for s in sections:
            if s.title is not None:
                # keep the context stack in sync with the heading level
                if self.parserContext.get_level() < s.level:
                    self.parserContext.push(s)
                else:
                    while self.parserContext.get_level() > s.level:
                        self.parserContext.pop(True)
                    self.parserContext.set_top_wiki(s)
                # section title: a templated title keeps its first argument
                stitle = self.wtp.parse(s.title).templates
                if stitle == []:
                    stitle = s.title
                else:
                    stitle = stitle[0].arguments[0].value
                if self.isPro(stitle):
                    self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                elif self.isEty(stitle):
                    self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
                else:
                    # Edit to process other types of sections
                    pos = self.process_POS(stitle)
                    if pos is not None:
                        self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents)))
        self.parserContext.flush()
        res = len(self.parserContext.entries)
        if res > 0:
            for e in self.parserContext.entries:
                self.entries.append(e)
        return res

    def isPro(self, title):
        """True when ``title`` names a pronunciation section (the constant
        may be a single string or a collection of strings)."""
        if type(self.constants['pro']) == str:
            return title == self.constants['pro']
        return title in self.constants['pro']

    def isEty(self, title):
        """True when ``title`` names an etymology section."""
        if type(self.constants['ety']) == str:
            return title == self.constants['ety']
        return title in self.constants['ety']

    # --- hooks implemented in language-pair subclasses -------------------
    def process_POS(self, parsedwikitext):
        """Recognize a POS title; returns None when it cannot (subclass)."""
        pass  # in subclass

    def process_pronunciation(self, parsedwikitext):
        pass  # in subclass

    def process_etymology(self, parsedwikitext):
        pass  # in subclass

    def process_example(self, example_wiki_text):
        pass  # in subclass

    def process_definition(self, definition, sub_items, def_level=True):
        pass  # in subclass

    def process_senses(self, parsedwikitext):
        pass  # in subclass

    def __str__(self):
        return self.export()

    def export(self, id=True, ascii=False, compact=False):
        """Serialize all collected entries to a JSON string."""
        res = [e.serializable(id) for e in self.entries]
        if compact:
            return json.dumps(res, ensure_ascii=ascii)
        return json.dumps(res, ensure_ascii=ascii, indent=4)


if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for help formatting

    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
\033[1m\033[32mex :\033[0m
‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -n -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default="en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default="en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    parser.add_argument("-n", "--no_id", help="json sans id", action="store_true")
    args = parser.parse_args()

    if args.mot is not None:
        w = Wikstraktor.get_instance(args.wiki_language, args.language)
        resp = None
        if w.fetch(args.mot) > 0:
            resp = w.export(not args.no_id, args.force_ascii, args.compact)
        if args.destination_file is not None:
            # BUGFIX: the file handle was never closed (``f.close`` without
            # parentheses); a context manager guarantees the close.
            with open(args.destination_file, "w") as f:
                f.write(resp)
        else:
            print(resp)
    else:
        raise NameError("Pas de mot demandé")