#!/usr/bin/env python3
# Provenance: patch 0eb66f3f403574be5fffc589964d9d373f1b9bc2
# "old wikstraktor" — Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>, 2023-09-26
# (new file wikstraktor_old_pierre.py, 435 lines)
"""Old wikstraktor prototype: a Wiktionary extractor.

Provides the data model (Sound, Pronunciation, Definition, Translation,
Example, Sense, Entry), a ParserContext that mirrors the wiki section
stack while parsing, and the Wikstraktor driver which fetches a page via
pywikibot and delegates language-specific parsing to subclasses found in
the ``parsers`` package.
"""

# The wiki libraries are only needed to fetch and parse live pages; guard
# the imports so the plain data-model classes stay usable without them.
try:
    import pywikibot
    import wikitextparser
except ImportError:  # offline use of the data model only
    pywikibot = None
    wikitextparser = None
import importlib
import json


#######
# Oral
#######
class Sound:
    """An audio file URL, optionally tagged with the accent it illustrates."""

    def __init__(self, url, accent):
        self.url = url        # direct URL of the audio file
        self.accent = accent  # accent label, or None when unknown

    def __eq__(self, other):
        return self.url == other.url and self.accent == other.accent

    def serializable(self):
        """Return a plain dict for JSON export (a None accent is omitted)."""
        if self.accent is None:
            res = {"url": self.url}
        else:
            res = {"accent": self.accent, "url": self.url}
        return res


class Pronunciation:
    """An IPA transcription plus the audio files that illustrate it."""

    def __init__(self):
        self.ipa = None     # IPA transcription string, or None
        self.sounds = []    # list of Sound objects
        self.accent = None  # accent shared by the whole pronunciation, or None

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self):
        """Return a plain dict for JSON export (a None accent is omitted)."""
        snds = [s.serializable() for s in self.sounds]
        if self.accent is None:
            res = {"transcript": self.ipa, "sounds": snds}
        else:
            res = {"accent": self.accent, "transcript": self.ipa, "sounds": snds}
        return res

    def __str__(self):
        return f"{self.serializable()}"

    def __eq__(self, other):
        # Sounds must match pairwise, in the same order.
        return (
            self.ipa == other.ipa
            and self.accent == other.accent
            and len(self.sounds) == len(other.sounds)
            and all(a == b for a, b in zip(self.sounds, other.sounds))
        )


#######
# Metadata
## TODO:
# * POS: create a POS class carrying dependent features (e.g. masc in fr)
#######

#######
# Senses
# TODO: create a Translations class
#######

class Definition:
    """A definition text in a given language."""

    def __init__(self, lang, text):
        self.lang = lang
        self.text = text

    def __eq__(self, other):
        return self.lang == other.lang and self.text == other.text

    def serializable(self):
        return {"lang": self.lang, "definition": self.text}


class Translation(Definition):
    """Same data as Definition, serialized under the "translation" key."""

    def serializable(self):
        return {"lang": self.lang, "translation": self.text}


class Example:
    """A usage example; source and url are optional."""

    def __init__(self, transcript, source=None, url=None):
        self.text = transcript
        self.source = source
        self.url = url

    def __eq__(self, other):
        return self.text == other.text and self.source == other.source and self.url == other.url

    def serializable(self):
        """Return a plain dict for JSON export (None fields are omitted)."""
        res = {"example": self.text}
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res


class Sense:
    """One sense of a word: definitions, examples and translations.

    Duplicate definitions/examples/translations are silently ignored by the
    ``add_*`` methods.
    """

    def __init__(self, label):
        self.label = label       # identifier of the sense
        self.definitions = []    # Definition objects (language + text)
        self.examples = []       # Example objects (text required; source/url optional)
        self.translations = []   # Translation objects into other languages
        self.domain = None       # usage domain of the word in this sense

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None):
        theEx = Example(transcript, src, url)
        if theEx not in self.examples:
            self.examples.append(theEx)

    def add_translation(self, lang, translation):
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            self.translations.append(theTranslation)

    def __eq__(self, other):
        # Same label/domain and same multiset sizes; membership (not order)
        # is what matters for the three item lists.
        return (
            self.label == other.label
            and len(self.definitions) == len(other.definitions)
            and len(self.examples) == len(other.examples)
            and len(self.translations) == len(other.translations)
            and self.domain == other.domain
            and all(e in other.examples for e in self.examples)
            and all(t in other.translations for t in self.translations)
            and all(d in other.definitions for d in self.definitions)
        )

    def serializable(self):
        """Return {label: {...}} for JSON export ("domain" only when set)."""
        body = {}
        if self.domain is not None:
            body["domain"] = self.domain
        body["defs"] = [d.serializable() for d in self.definitions]
        body["exs"] = [e.serializable() for e in self.examples]
        body["trad"] = [t.serializable() for t in self.translations]
        return {self.label: body}


class Entry:
    """A dictionary entry: lemma + part of speech + pronunciations + senses."""

    def __init__(self, lemma):
        self.lemma = lemma
        self.pronunciations = []
        self.pos = None
        self.senses = []

    def set_pronunciations(self, pron):
        """Append a Pronunciation or a list of Pronunciations.

        Raises ValueError when given anything else.
        """
        if isinstance(pron, Pronunciation):
            self.pronunciations.append(pron)
        elif isinstance(pron, list):
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.pronunciations.append(p)
                else:
                    raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def set_pos(self, pos):
        self.pos = pos

    def set_senses(self, senses):
        self.senses = senses

    def is_valid(self):
        """An entry is exportable once it has a lemma, a POS, and at least
        one pronunciation and one sense."""
        return self.lemma is not None and len(self.pronunciations) > 0 and self.pos is not None and len(self.senses) > 0

    def __eq__(self, other):
        # Senses and pronunciations must match pairwise, in order.
        return (
            self.lemma == other.lemma
            and self.pos == other.pos
            and len(self.pronunciations) == len(other.pronunciations)
            and len(self.senses) == len(other.senses)
            and all(a == b for a, b in zip(self.senses, other.senses))
            and all(a == b for a, b in zip(self.pronunciations, other.pronunciations))
        )

    def serializable(self):
        """Return {lemma: {"pos", "pronunciations", "senses"}} for JSON export."""
        body = {"pos": self.pos}
        body["pronunciations"] = [p.serializable() for p in self.pronunciations]
        body["senses"] = [s.serializable() for s in self.senses]
        return {self.lemma: body}

    def __str__(self):
        res = f"{self.lemma} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res


class ParserContext:
    """Stack mirroring the wiki section hierarchy while parsing one page.

    Each stack frame is a dict holding the wikitextparser section under the
    'wiki' key plus any entry information ('pro', 'ety', 'POS', 'senses')
    harvested at that level. Completed entries accumulate in ``entries``.
    """

    def __init__(self, entry):
        self.lemma = entry   # the page title / lemma being parsed
        self.context = []    # stack of {'wiki': section, ...} frames
        self.entries = []    # valid Entry objects built so far

    def get_level(self):
        """Wiki heading level of the top frame, or -1 for an empty stack."""
        if len(self.context) == 0:
            res = -1
        else:
            res = self.context[-1]["wiki"].level
        return res

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        """Pop the top frame, optionally trying to flush an entry first."""
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        """Replace the wiki section of the top frame (push if stack empty)."""
        if len(self.context) == 0:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        """Attach entry info to the top frame; optionally try to flush an entry."""
        if len(self.context) == 0:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        else:
            self.context[-1][key] = entry_context
            if testNewEntry:
                self.create_entry()

    def create_entry(self):
        """Assemble an Entry from all stacked frames.

        Returns the Entry when it is valid and new, else None.
        """
        res = Entry(self.lemma)
        for frame in self.context:
            if "pro" in frame:
                res.set_pronunciations(frame['pro'])
            if "ety" in frame:
                pass  # etymology is ignored for now
            if "POS" in frame:
                res.set_pos(frame['POS'])
            if "senses" in frame:
                res.set_senses(frame['senses'])
            # TODO: handle the other info types
        if res.is_valid() and res not in self.entries:
            self.entries.append(res)
        else:
            res = None
        return res

    def debug_top(self):
        """Human-readable dump of the top frame, for debugging."""
        res = "Context: "
        if len(self.context) == 0:
            res += "0"
        else:
            info = ""
            for k, v in self.context[-1].items():
                if k != 'wiki':
                    if info != "":
                        info += "\n\t\t\t"
                    info += f"{k} → {str(v)}"
            res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
        return res


class Wikstraktor:
    """Driver that fetches a wiktionary page and extracts its entries.

    Language-specific behavior (``self.site``, ``self.constants``,
    ``self.entry_language`` and the ``process_*`` methods) lives in
    subclasses under the ``parsers`` package.
    """

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate the parser subclass for a (wiki, entry) language pair.

        Looks up class ``{Wiki_entry}_straktor`` in module
        ``parsers.{wiki_entry}``; returns None (after printing a message)
        when either is missing.
        """
        m_name = f"{wiki_language}_{entry_language}".capitalize()
        try:
            # FIX: also catch AttributeError — the message below already
            # claims to cover a missing class, but only a missing module
            # was actually handled.
            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
        except (ModuleNotFoundError, AttributeError):
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            instance = None
        return instance

    def __init__(self):
        self.entries = []
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Resolve a wiki file page to its direct URL, or None if absent."""
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def fetch(self, graphy):
        """Retrieve the page for *graphy* and process it.

        Adds the extracted entries to ``self.entries`` and returns the
        number of entries added.
        """
        nb_entries_added = 0
        page = self.pwb.Page(self.site, graphy)
        if page.text != "":
            sections = self.wtp.parse(page.text).sections
            # Locate the section of the target language before parsing.
            found = False
            i = 0
            while i < len(sections) and not found:
                found = sections[i].title is not None and sections[i].title.capitalize() == self.constants[self.entry_language]
                if not found:
                    i += 1
            if found:
                nb_entries_added = self.parse(page.title(), sections[i].sections)
        return nb_entries_added

    def parse(self, entry, sections):
        """Walk the language section's subsections and build entries.

        Returns the number of entries collected for this page.
        """
        self.parserContext = ParserContext(entry)
        for s in sections:
            if s.title is not None:
                # Keep the context stack in sync with the heading levels.
                if self.parserContext.get_level() < s.level:
                    self.parserContext.push(s)
                else:
                    while self.parserContext.get_level() > s.level:
                        self.parserContext.pop()
                    self.parserContext.set_top_wiki(s)
                # Section titles may be wrapped in a template; unwrap the
                # first template argument in that case.
                stitle = self.wtp.parse(s.title).templates
                if stitle == []:
                    stitle = s.title
                else:
                    stitle = stitle[0].arguments[0].value
                if self.isPro(stitle):
                    self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                elif self.isEty(stitle):
                    self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
#                elif stitle in self.constants['POS'].keys():
                else:
                    pos = self.process_POS(stitle)
                    if pos is not None:
                        self.parserContext.set_top_entry_info('POS', pos, False)
                        self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos + str(len(self.parserContext.entries)), self.wtp.parse(s.contents)))
        res = len(self.parserContext.entries)
        if res > 0:
            for e in self.parserContext.entries:
                self.entries.append(e)
        return res

    def isPro(self, title):
        """True when *title* names the pronunciation section (str or collection)."""
        if isinstance(self.constants['pro'], str):
            res = title == self.constants['pro']
        else:
            res = title in self.constants['pro']
        return res

    def isEty(self, title):
        """True when *title* names the etymology section (str or collection)."""
        if isinstance(self.constants['ety'], str):
            res = title == self.constants['ety']
        else:
            res = title in self.constants['ety']
        return res

    def process_POS(self, parsedwikitext):
        pass  # in subclass

    def process_pronunciation(self, parsedwikitext):
        pass  # in subclass

    def process_etymology(self, parsedwikitext):
        pass  # in subclass

    def process_senses(self, entry, pos, parsedwikitext):
        pass  # in subclass

    def __str__(self):
        return self.export()

    def export(self, ascii=False, compact=False):
        """Serialize all entries to a JSON string.

        ``ascii`` (kept for interface compatibility despite shadowing the
        builtin) forces ASCII-only output; ``compact`` drops indentation.
        """
        res = [e.serializable() for e in self.entries]
        if compact:
            return json.dumps(res, ensure_ascii=ascii)
        return json.dumps(res, ensure_ascii=ascii, indent=4)


if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for help-text formatting
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
	\033[1m\033[32mex :\033[0m
	‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
	‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
	‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default="en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default="en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    args = parser.parse_args()
    if args.mot is not None:
        w = Wikstraktor.get_instance(args.wiki_language, args.language)
        resp = None
        if w.fetch(args.mot) > 0:
            resp = w.export(args.force_ascii, args.compact)
        # NOTE(review): resp stays None when fetch found nothing; writing it
        # would raise TypeError — confirm whether that path should be guarded.
        if args.destination_file is not None:
            # FIX: original did `f.close` (attribute access, never called),
            # so the file was never explicitly closed; use a context manager.
            with open(args.destination_file, "w") as f:
                f.write(resp)
        else:
            print(resp)
    else:
        raise NameError("Pas de mot demandé")