From da06f1d5e3e7b92d744ffb38e1d4b30e44c17e4f Mon Sep 17 00:00:00 2001
From: pfleu <fleutotp@gmail.com>
Date: Tue, 10 Jan 2023 17:51:57 +0100
Subject: [PATCH] =?UTF-8?q?Mise=20=C3=A0=20jour=20version=20en=20dur=20de?=
 =?UTF-8?q?=20wikstraktor=20et=20modif=20commande=20du=20manager=20wiktion?=
 =?UTF-8?q?naire=20pour=20utiliser=20le=20param=20force=20ASCII=20de=20la?=
 =?UTF-8?q?=20commande=20python?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   1 +
 src/Manager/WiktionaryManager.php             |   2 +-
 src/Wikstraktor/parsers/Structure_json.json   |  64 ++++++++++
 .../parsers/Structure_minimale.json           |   8 ++
 src/Wikstraktor/parsers/en_constants.py       |  34 ++++++
 src/Wikstraktor/parsers/en_en.py              | 104 ++++++++++++++++
 src/Wikstraktor/parsers/fr_constants.py       |  81 +++++++++++++
 src/Wikstraktor/parsers/fr_en.py              | 114 ++++++++++++++++++
 src/Wikstraktor/wikstraktor.py                |  59 +++++----
 9 files changed, 445 insertions(+), 22 deletions(-)
 create mode 100644 src/Wikstraktor/parsers/Structure_json.json
 create mode 100644 src/Wikstraktor/parsers/Structure_minimale.json
 create mode 100644 src/Wikstraktor/parsers/en_constants.py
 create mode 100644 src/Wikstraktor/parsers/en_en.py
 create mode 100644 src/Wikstraktor/parsers/fr_constants.py
 create mode 100644 src/Wikstraktor/parsers/fr_en.py

diff --git a/.gitignore b/.gitignore
index e832733..9dfe465 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ src/Wikstraktor/throttle.ctrl
 src/Wikstraktor/apicache*
 src/Wikstraktor/__pycache*
 src/Wikstraktor/test.json
+src/Wikstraktor/parsers/__pycache*
diff --git a/src/Manager/WiktionaryManager.php b/src/Manager/WiktionaryManager.php
index 44b7054..4a741e8 100644
--- a/src/Manager/WiktionaryManager.php
+++ b/src/Manager/WiktionaryManager.php
@@ -43,7 +43,7 @@ class WiktionaryManager
         if ($language != 'en') {
             return [];
         }
-        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' ' . $word . ' 2>&1');
+        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' -m ' . escapeshellarg($word) . ' -A -C 2>&1'); // quoted so the shell never parses $word; NOTE(review): escapeshellarg() may need a UTF-8 LC_CTYPE to keep non-ASCII letters - confirm
         //var_dump($result);die();
         $dataArray = json_decode($result, true);
 
diff --git a/src/Wikstraktor/parsers/Structure_json.json b/src/Wikstraktor/parsers/Structure_json.json
new file mode 100644
index 0000000..c38503c
--- /dev/null
+++ b/src/Wikstraktor/parsers/Structure_json.json
@@ -0,0 +1,64 @@
+{
+    "Headword":"sleep",
+    "Items":[
+        {
+            "PartOfSpeech":"verb",
+            "Pronunciation":[
+                {
+                    "api":"\\ˈsliːp\\ ",
+                    "hiragana":"",
+                    "katakana":"",
+                    "bopomofo":"",
+                    "pinyin":"",
+                    "romaji":"",
+                    "accent1":"RP",
+                    "url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav"
+                }
+            ],
+            "Senses":[
+                {
+                    "Translations":[
+                        "translation1",
+                        "...",
+                        "translationn"
+                    ],
+                    "Image":[
+                        "Stilles Mineralwasser.jpg",
+                        "..."
+                    ],
+                    "Definition":"blabla",
+                    "Examples":[
+                        "blabla",
+                        "blabli",
+                        "blablou"
+                    ],
+                    "subSense":[
+                        {
+                            "subdef":"blabla",
+                            "subex":[
+                                "subexa",
+                                "subexb",
+                                "subexz"
+                            ]
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
+
+/*adapter à la généricité des données ?
+    \"User-defined fields 1\"
+    \"User-defined fields 2\"
+    \"User-defined fields 3\"
+    \"User-defined fields 4\"
+    \"User-defined fields 5\"
+    \"Supplementary field for devs 1\"
+    \"Supplementary field for devs 2\"
+    \"Supplementary field for devs 3\"
+    \"Supplementary field for devs 4\"
+    \"Supplementary field for devs 5\"
+    ...
+    \"Supplementary field for devs 10\ */
+
diff --git a/src/Wikstraktor/parsers/Structure_minimale.json b/src/Wikstraktor/parsers/Structure_minimale.json
new file mode 100644
index 0000000..f990a19
--- /dev/null
+++ b/src/Wikstraktor/parsers/Structure_minimale.json
@@ -0,0 +1,8 @@
+"lemma"
+"partOfSpeech" :
+{
+    "sense" :
+    {
+        "definition"
+    }
+}
diff --git a/src/Wikstraktor/parsers/en_constants.py b/src/Wikstraktor/parsers/en_constants.py
new file mode 100644
index 0000000..c933d6a
--- /dev/null
+++ b/src/Wikstraktor/parsers/en_constants.py
@@ -0,0 +1,34 @@
+string_values = {
+    "ety":"Etymology",
+    "pro":"Pronunciation",
+    "en":"English",
+    "fr":"French",
+    "t_ipa":"IPA", #template for transcription
+    "t_snd":"audio", #template for audio
+    "t_acc":"a", #template for accents
+    "t_deflabel":"lb",
+    "t_ex":["ux", "usex"],
+    "POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS
+        "Adjective":"Adj",
+        "Adverb":"Adv",
+        "Ambiposition":"Ambip",
+        "Article":"Art",
+        "Circumposition":"Circump",
+        "Classifier":"Class",
+        "Conjunction":"Conj",
+        "Contraction":"Cont",
+        "Counter":"Count",
+        "Determiner":"Det",
+        "Ideophone":"Ideophone",
+        "Interjection":"Interj",
+        "Noun":"N",
+        "Numeral":"Num",
+        "Participle":"Part",
+        "Particle":"Particle",
+        "Postposition":"Postp",
+        "Preposition":"Prep",
+        "Pronoun":"Pro",
+        "Proper noun":"NP",
+        "Verb":"V" # TODO: complete the list
+    }
+}
diff --git a/src/Wikstraktor/parsers/en_en.py b/src/Wikstraktor/parsers/en_en.py
new file mode 100644
index 0000000..b317908
--- /dev/null
+++ b/src/Wikstraktor/parsers/en_en.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+
+from parsers.en_constants import string_values
+
+debugEty = 0
+
+class En_en_straktor(Wikstraktor):
+    """Parser for "en" entries of the "en" Wiktionary."""
+
+    def __init__(self):
+        super().__init__()
+        self.wiki_language = "en"
+        self.entry_language = "en"
+        self.constants = string_values
+        self.site = self.pwb.Site('wiktionary:en')
+
+    def process_pronunciation(self, proContent):
+        """Return the Pronunciation objects read from the first list of proContent.
+
+        Templates are scanned left to right: an accent template sets the accent
+        applied to the IPA and audio templates that follow it.
+        """
+        # TODO: only handles 2-level lists, see "water" for a 3-level one
+        lists = proContent.get_lists()
+        if not lists:
+            return []
+        pronunciations = []
+        for fullitem in lists[0].fullitems:
+            p = Pronunciation()
+            templates = self.wtp.parse(fullitem).templates
+            a = None
+            for j, t in enumerate(templates):
+                name = t.normal_name()
+                # Bounds-checked look-ahead: indexing templates[j+1] directly
+                # raised IndexError when an accent template came last.
+                next_is_acc = (j + 1 < len(templates)
+                    and templates[j+1].normal_name() == self.constants['t_acc'])
+                if name == self.constants['t_acc'] and not next_is_acc:
+                    a = t.arguments[0].value
+                elif name == self.constants['t_ipa']:
+                    p.set_transcription(t.arguments[1].value)
+                    p.set_accent(a)
+                elif name == self.constants['t_snd']:
+                    p.add_sound(self.get_file_url(t.arguments[1].value), a)
+                if j == len(templates) - 1 or next_is_acc:
+                    if p.ipa is not None or p.accent is not None:
+                        pronunciations.append(p)
+                        p = Pronunciation()
+        return pronunciations
+
+    def process_etymology(self, etyContent):
+        """Return a placeholder label numbered by a module-level call counter."""
+        global debugEty
+        debugEty += 1
+        return "Etymology" + str(debugEty)
+
+    def process_POS(self, parsedwikitext):
+        """Return the tag mapped to a part-of-speech title, or None when unknown."""
+        return self.constants['POS'].get(parsedwikitext)
+
+    def _add_examples(self, sense, items):
+        """Attach each item of an example list to sense."""
+        ex_names = self.constants['t_ex']
+        if isinstance(ex_names, str):
+            # "in" on a bare string would be a substring test
+            ex_names = [ex_names]
+        for item in items:
+            parsed = self.wtp.parse(item)
+            for t in parsed.templates:
+                if t.normal_name() in ex_names:
+                    # Read the matching template; the previous code always read
+                    # templates[0], wrong when another template comes first.
+                    sense.add_example(t.arguments[-1].value)
+                    break
+            else:
+                # no example template: keep the plain text of the item
+                sense.add_example(parsed.plain_text().strip())
+
+    def process_senses(self, entry, pos, sensesContent):
+        """Return the Sense objects (senses and sub-senses) read from sensesContent.
+
+        Ids are "<entry>_<pos>_<n>" for senses and "<entry>_<pos>_<n>_<m>" for sub-senses.
+        """
+        baseId = f"{entry}_{pos}_"
+        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+        i = 0
+        senses = []
+        nombreDef = 0
+        while i < len(l):
+            if l[i].pattern == '\\# ':
+                nombreDef += 1
+                newSense = Sense(f"{baseId}{nombreDef}")
+                newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+            elif l[i].pattern == '\\#:':
+                self._add_examples(newSense, l[i].items)
+            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                senses.append(newSense)
+            cnt = 0
+            nombreSousDef = 0
+            # consume the run of level-3 lists: '## ' sub-definitions and '##:' examples
+            while i < len(l) and l[i].level == 3 :
+                cnt +=1
+                if l[i].pattern == '\\## ':
+                    nombreSousDef += 1
+                    newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+                    newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+                elif l[i].pattern == '\\##:':
+                    self._add_examples(newSense2, l[i].items)
+                if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                    senses.append(newSense2)
+                i += 1
+            if cnt > 0:
+                # the inner loop stepped one list too far for the "i += 1" below
+                i -= 1
+            i += 1
+        return senses
+
+if __name__ == "__main__":
+    ensk = En_en_straktor()
+    print(ensk.fetch("test"), "entries added")
diff --git a/src/Wikstraktor/parsers/fr_constants.py b/src/Wikstraktor/parsers/fr_constants.py
new file mode 100644
index 0000000..cbf8212
--- /dev/null
+++ b/src/Wikstraktor/parsers/fr_constants.py
@@ -0,0 +1,81 @@
+string_values = {
+"ety":"étymologie",
+"pro":["prononciation", "Prononciation"],
+"en":" {{langue|en}} ",
+"fr":"Français",
+"t_deflabel":["lexique", "info lex"],
+"t_ex":"exemple",
+    #Non-existent
+"t_ipa":"pron", #template for transcription
+"t_snd":"écouter", #template for audio
+"t_acc":["US", "UK"], #template for accents
+
+"POS":{
+    "adjectif":["adjectif","adjectif qualificatif","adj"],
+    "adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
+    "adjectif exclamatif":["adjectif exclamatif","adj-excl","adjectif exc"],
+    "adjectif indéfini":["adjectif indéfini","adjectif ind","adj-indéf"],
+    "adjectif interrogatif":["adjectif interrogatif","adj-int","adjectif int"],
+    "adjectif numéral":["adjectif numéral","adjectif num","adj-num"],
+    "adjectif possessif":["adjectif possessif","adjectif pos","adj-pos"],
+    "adjectif relatif":["adjectif relatif","adjectif rel","adj-rel"],
+    "adverbe":["Adverbe","adverbe","adv"],
+    "adverbe indéfini":["adverbe indéfini","adv-ind","adverbe ind"],
+    "adverbe interrogatif":["adverbe interrogatif","adverbe int","adv-int"],
+    "adverbe pronominal":["adverbe pronominal","adv-pron","adverbe pro"],
+    "adverbe relatif":["adverbe relatif","adv-rel","adverbe rel"],
+    "affixe":["affixe","aff"],
+    "article":["article","art"],
+    "article défini":["article défini","article déf","art-déf"],
+    "article indéfini":["article indéfini","art-indéf","article ind"],
+    "article partitif":["article partitif","art-part","article par"],
+    "circonfixe":["circonfixe","circon","circonf"],
+    "classificateur":["classificateur","class","classif"],
+    "conjonction":["conjonction","conj"],
+    "conjonction de coordination":["conjonction de coordination","conj-coord","conjonction coo"],
+    "copule":["copule"],
+    "déterminant":["déterminant","dét"],
+    "enclitique":["enclitique","encl"],
+    "gismu":["gismu"],
+    "infixe":["infixe","inf"],
+    "interfixe":["interfixe","interf"],
+    "interjection":["interjection","interj"],
+    "lettre":["lettre"],
+    "locution":["locution","loc"],
+    "locution-phrase":["locution-phrase","loc-phr","phrase locution","phrase","locution-phrase"],
+    "nom commun":["nom","nom commun","substantif"],
+    "nom de famille":["nom de famille","nom-fam"],
+    "nom propre":["nom propre","nom-pr"],
+    "nom scientifique":["nom scientifique","nom-sciences","nom scient","nom science"],
+    "numéral":["numéral","num","numér"],
+    "onomatopée":["onomatopée","onoma","onom"],
+    "particule":["particule","part"],
+    "particule numérale":["particule numérale","part-num","particule num"],
+    "patronyme":["patronyme"],
+    "postposition":["postposition","postpos","post"],
+    "pré-nom":["pré-nom"],
+    "pré-verbe":["pré-verbe"],
+    "préfixe":["préfixe","préf"],
+    "prénom":["prénom"],
+    "préposition":["préposition","prép"],
+    "proclitique":["proclitique","procl"],
+    "pronom":["pronom"],
+    "pronom démonstratif":["pronom démonstratif","pronom dém","pronom-dém"],
+    "pronom indéfini":["pronom indéfini","pronom ind","pronom-indéf"],
+    "pronom interrogatif":["pronom interrogatif","pronom int","pronom-int"],
+    "pronom personnel":["pronom personnel","pronom réf","pronom-per","pronom réfléchi","pronom-réfl","pronom-pers"],
+    "pronom possessif":["pronom possessif","pronom pos","pronom-pos"],
+    "pronom relatif":["pronom relatif","pronom-rel","pronom rel"],
+    "pronom-adjectif":["pronom-adjectif"],
+    "proverbe":["proverbe","prov"],
+    "quantificateur":["quantificateur","quantif"],
+    "radical":["radical","rad"],
+    "rafsi":["rafsi"],
+    "sinogramme":["sinogramme","sinog","sino"],
+    "suffixe":["suffixe","suf","suff"],
+    "symbole":["symbole","symb"],
+    "variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
+    "verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
+    "verbe":["verbe","verb"]
+    }
+}
diff --git a/src/Wikstraktor/parsers/fr_en.py b/src/Wikstraktor/parsers/fr_en.py
new file mode 100644
index 0000000..175f22b
--- /dev/null
+++ b/src/Wikstraktor/parsers/fr_en.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+
+from parsers.fr_constants import string_values
+
+debugEty = 0
+
+class Fr_en_straktor(Wikstraktor):
+    """Parser for "en" entries of the "fr" Wiktionary."""
+
+    def __init__(self):
+        super().__init__()
+        self.wiki_language = "fr"
+        self.entry_language = "en"
+        self.constants = string_values
+        self.site = self.pwb.Site('wiktionary:fr')
+
+    def process_pronunciation(self, proContent):
+        """Return one Pronunciation per audio template of the first list of proContent."""
+        # TODO: only handles 2-level lists, see "water" for a 3-level one
+        lists = proContent.get_lists()
+        if not lists:
+            return []
+        pronunciations = []
+        for fullitem in lists[0].fullitems:
+            p = Pronunciation()
+            for t in self.wtp.parse(fullitem).templates:
+                if t.normal_name() != self.constants['t_snd']:
+                    continue
+                p.add_sound(self.get_file_url(t.arguments[-1].value))
+                # NOTE(review): a template without positional argument 1 still raises
+                # AttributeError here, as before; confirm whether that case exists.
+                accent = t.get_arg("1").value
+                accent_templates = self.wtp.parse(accent).templates
+                # an accent written as exactly one template is reduced to that template's name
+                if len(accent_templates) == 1:
+                    accent = accent_templates[0].normal_name()
+                p.set_accent(accent)
+                if t.get_arg("2") is not None:
+                    p.set_transcription(t.get_arg("2").value)
+                if p.accent is not None and p.sounds != []:
+                    pronunciations.append(p)
+                    p = Pronunciation()
+        return pronunciations
+
+    def process_etymology(self, etyContent):
+        """Return a placeholder label numbered by a module-level call counter."""
+        global debugEty
+        debugEty += 1
+        return "Etymology" + str(debugEty)
+
+    def process_POS(self, parsedwikitext):
+        """Return the first POS key whose alias list contains parsedwikitext, or None."""
+        for pos, aliases in self.constants['POS'].items():
+            if parsedwikitext in aliases:
+                return pos
+        return None
+
+    def _add_examples(self, sense, items):
+        """Attach each item of an example list to sense."""
+        ex_names = self.constants['t_ex']
+        if isinstance(ex_names, str):
+            # "in" on a bare string is a substring test: "ex" would match "exemple"
+            ex_names = [ex_names]
+        for item in items:
+            parsed = self.wtp.parse(item)
+            for t in parsed.templates:
+                if t.normal_name() in ex_names:
+                    # Read the matching template; the previous code always read
+                    # templates[0], wrong when another template comes first.
+                    sense.add_example(t.arguments[-1].value)
+                    break
+            else:
+                # no example template: keep the plain text of the item
+                sense.add_example(parsed.plain_text().strip())
+
+    def process_senses(self, entry, pos, sensesContent):
+        """Return the Sense objects (senses and sub-senses) read from sensesContent.
+
+        Ids are "<entry>_<pos>_<n>" for senses and "<entry>_<pos>_<n>_<m>" for sub-senses.
+        """
+        baseId = f"{entry}_{pos}_"
+        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+        i = 0
+        senses = []
+        nombreDef = 0
+        while i < len(l):
+            if l[i].pattern == '\\# ':
+                nombreDef += 1
+                newSense = Sense(f"{baseId}{nombreDef}")
+                newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+            elif l[i].pattern == '\\#:':
+                self._add_examples(newSense, l[i].items)
+            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                senses.append(newSense)
+            cnt = 0
+            nombreSousDef = 0
+            # consume the run of level-3 lists: '## ' sub-definitions and '##:' examples
+            while i < len(l) and l[i].level == 3 :
+                cnt +=1
+                if l[i].pattern == '\\## ':
+                    nombreSousDef += 1
+                    newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+                    newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+                elif l[i].pattern == '\\##:':
+                    self._add_examples(newSense2, l[i].items)
+                if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                    senses.append(newSense2)
+                i += 1
+            if cnt > 0:
+                # the inner loop stepped one list too far for the "i += 1" below
+                i -= 1
+            i += 1
+        return senses
+
+if __name__ == "__main__":
+    ensk = Fr_en_straktor()
+    print(ensk.fetch("test"), "entries added")
diff --git a/src/Wikstraktor/wikstraktor.py b/src/Wikstraktor/wikstraktor.py
index fd478bd..b9780d8 100644
--- a/src/Wikstraktor/wikstraktor.py
+++ b/src/Wikstraktor/wikstraktor.py
@@ -3,7 +3,6 @@ import pywikibot
 import wikitextparser
 import importlib
 import json
-import sys
 
 #######
 # Oral
@@ -295,9 +294,9 @@ class Wikstraktor:
     def get_instance(cls, wiki_language, entry_language):
         try:
             m_name = f"{wiki_language}_{entry_language}".capitalize()
-            instance = getattr(importlib.import_module(f"{m_name.lower()}"), f"{m_name}_straktor")()
+            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
         except ModuleNotFoundError:
-            print(f"{m_name.lower()} module not found or {m_name}_straktor not found in module")
+            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
             instance = None
         return instance
 
@@ -374,14 +373,14 @@ class Wikstraktor:
             res = title in self.constants['pro']
         #print(title, res)
         return res
-    
+
     def isEty(self, title):
         if type(self.constants['ety']) == str:
             res = title == self.constants['ety']
         else:
             res = title in self.constants['ety']
         return res
-    
+
     def process_POS(self, parsedwikitext):
         pass#in subclass
 
@@ -395,24 +394,42 @@ class Wikstraktor:
pass#in subclass def __str__(self): + return self.export() + + def export(self, ascii=False, compact=False): res = [] for e in self.entries: res.append(e.serializable()) - return json.dumps(res) - + if compact: + return json.dumps(res, ensure_ascii=ascii) + else: + return json.dumps(res, ensure_ascii=ascii, indent=4) if __name__ == "__main__": - #e = Wikstraktor.get_instance('en', "en") - f = Wikstraktor.get_instance('en', 'en') - # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) - # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) - #e.fetch("water") - #f.fetch('blue') - #file_path = 'test.json' - #fichier = open(file_path, "w", encoding='utf-8') - #fichier.write(str(f)) - #fichier.close() - #fichier.write(str(f)) - word = sys.argv[1] - f.fetch(word) - print(str(f)) + import argparse + from argparse import RawTextHelpFormatter #pour le formattage de l'aide + parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire + \033[1m\033[32mex :\033[0m + ‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m + ‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m + ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""") + parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en") + parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en") + parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None) + parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None) + parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true") + parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true") + args = parser.parse_args() + if args.mot != None: + w = Wikstraktor.get_instance(args.wiki_language, args.language) + resp = 
None + if w.fetch(args.mot) > 0: + resp = w.export(args.force_ascii, args.compact) + if args.destination_file != None: + f = open(args.destination_file, "w") + f.write(resp) + f.close + else: + print(resp) + else: + raise NameError("Pas de mot demandé") -- GitLab