# Review notes. Text before the first "diff --git" header is skipped by git-apply / patch.
# - The source of this patch had its newlines and indentation flattened. Line structure is
#   restored here; indentation and Python nesting are rebuilt with 4 spaces and must be
#   checked against the target files (use `git apply --ignore-whitespace` for the hunks
#   on pre-existing files, then re-indent added lines to the file's own convention).
# - "index" blob ids are kept from the original and are stale for the files edited below.
# - WiktionaryManager.php: the word is now passed through escapeshellarg().
# - en_en.py: process_pronunciation no longer reads templates[j+1] past the last template.
# - en_en.py / fr_en.py: process_senses reads the example from the matched template [k].
# - fr_constants.py: "t_ex" is a list (a str made `in` a substring test); alias typos
#   "dverbe int" / "cnclitique" fixed; lowercase "adverbe" added; duplicate alias removed.
# - wikstraktor.py __main__: output file closed via `with`, written as UTF-8, and only
#   written when there is a result; a missing parser (get_instance -> None) is tolerated.
# - Structure_json.json: IPA length mark repaired in the sample value.
diff --git a/.gitignore b/.gitignore
index e832733f39bd8f95a61cbca7bab709bd787924eb..9dfe465633983baeb4da96f09d1ba691ad28c757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ src/Wikstraktor/throttle.ctrl
 src/Wikstraktor/apicache*
 src/Wikstraktor/__pycache*
 src/Wikstraktor/test.json
+src/Wikstraktor/parsers/__pycache*
diff --git a/src/Manager/WiktionaryManager.php b/src/Manager/WiktionaryManager.php
index 44b7054b2382ee17bbc89eef5e8f62f24240780e..4a741e82c94470f45d62cc0be514bd89a36aa572 100644
--- a/src/Manager/WiktionaryManager.php
+++ b/src/Manager/WiktionaryManager.php
@@ -43,7 +43,7 @@ class WiktionaryManager
         if ($language != 'en') {
             return [];
         }
-        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' ' . $word . ' 2>&1');
+        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' -m ' . escapeshellarg($word) . ' -A -C 2>&1');
         //var_dump($result);die();
         $dataArray = json_decode($result, true);
 
diff --git a/src/Wikstraktor/parsers/Structure_json.json b/src/Wikstraktor/parsers/Structure_json.json
new file mode 100644
index 0000000000000000000000000000000000000000..c38503c4148023192a7ae08469ed5bf33adee85b
--- /dev/null
+++ b/src/Wikstraktor/parsers/Structure_json.json
@@ -0,0 +1,64 @@
+{
+  "Headword":"sleep",
+  "Items":[
+    {
+      "PartOfSpeech":"verb",
+      "Pronunciation":[
+        {
+          "api":"\\ˈsliːp\\ ",
+          "hiragana":"",
+          "katakana":"",
+          "bopomofo":"",
+          "pinyin":"",
+          "romaji":"",
+          "accent1":"RP",
+          "url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav"
+        }
+      ],
+      "Senses":[
+        {
+          "Translations":[
+            "translation1",
+            "...",
+            "translationn"
+          ],
+          "Image":[
+            "Stilles Mineralwasser.jpg",
+            "..."
+          ],
+          "Definition":"blabla",
+          "Examples":[
+            "blabla",
+            "blabli",
+            "blablou"
+          ],
+          "subSense":[
+            {
+              "subdef":"blabla",
+              "subex":[
+                "subexa",
+                "subexb",
+                "subexz"
+              ]
+            }
+          ]
+        }
+      ]
+    }
+  ]
+}
+
+/*adapter à la généricité des données ? 
+    \"User-defined fields 1\"
+    \"User-defined fields 2\"
+    \"User-defined fields 3\"
+    \"User-defined fields 4\"
+    \"User-defined fields 5\"
+    \"Supplementary field for devs 1\"
+    \"Supplementary field for devs 2\"
+    \"Supplementary field for devs 3\"
+    \"Supplementary field for devs 4\"
+    \"Supplementary field for devs 5\"
+    ...
+    \"Supplementary field for devs 10\ */
+
diff --git a/src/Wikstraktor/parsers/Structure_minimale.json b/src/Wikstraktor/parsers/Structure_minimale.json
new file mode 100644
index 0000000000000000000000000000000000000000..f990a1905b471b13e55979b1e5fb70aece2c5bba
--- /dev/null
+++ b/src/Wikstraktor/parsers/Structure_minimale.json
@@ -0,0 +1,8 @@
+"lemma"
+"partOfSpeech" :
+{
+  "sense" :
+  {
+    "definition"
+  }
+}
diff --git a/src/Wikstraktor/parsers/en_constants.py b/src/Wikstraktor/parsers/en_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..c933d6ab9b796b86e53f6b4ad1c9b9e96ea8eeb7
--- /dev/null
+++ b/src/Wikstraktor/parsers/en_constants.py
@@ -0,0 +1,34 @@
+string_values = {
+    "ety":"Etymology",
+    "pro":"Pronunciation",
+    "en":"English",
+    "fr":"French",
+    "t_ipa":"IPA", #template for transcription
+    "t_snd":"audio", #template for audio
+    "t_acc":"a", #template for accents
+    "t_deflabel":"lb",
+    "t_ex":["ux", "usex"],
+    "POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS
+        "Adjective":"Adj",
+        "Adverb":"Adv",
+        "Ambiposition":"Ambip",
+        "Article":"Art",
+        "Circumposition":"Circump",
+        "Classifier":"Class",
+        "Conjunction":"Conj",
+        "Contraction":"Cont",
+        "Counter":"Count",
+        "Determiner":"Det",
+        "Ideophone":"Ideophone",
+        "Interjection":"Interj",
+        "Noun":"N",
+        "Numeral":"Num",
+        "Participle":"Part",
+        "Particle":"Particle",
+        "Postposition":"Postp",
+        "Preposition":"Prep",
+        "Pronoun":"Pro",
+        "Proper noun":"NP",
+        "Verb":"V" # TODO: complete this list
+    }
+}
diff --git a/src/Wikstraktor/parsers/en_en.py b/src/Wikstraktor/parsers/en_en.py
new file mode 100644
index 0000000000000000000000000000000000000000..b31790825d55566011c8c7ed01217b2b85123b81
--- /dev/null
+++ b/src/Wikstraktor/parsers/en_en.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+
+from parsers.en_constants import string_values
+
+debugEty = 0
+
+class En_en_straktor(Wikstraktor):
+    """Extractor for English entries of the English Wiktionary."""
+    def __init__(self):
+        super().__init__()
+        self.wiki_language = "en"
+        self.entry_language = "en"
+        self.constants = string_values
+        self.site = self.pwb.Site(f'wiktionary:en')
+
+    def process_pronunciation(self, proContent):
+        """Build Pronunciation objects from the first list found in proContent."""
+        # TODO: only works for 2-level lists, see "water" for a 3-level list
+        l = proContent.get_lists()[0]
+        i = 0
+        pronunciations = []
+        while i < len(l.fullitems):
+            p = Pronunciation()
+            templates = self.wtp.parse(l.fullitems[i]).templates
+            a = None
+            for j, t in enumerate(templates):
+                # j is tested first so that templates[j+1] is never read past the last template
+                if (t.normal_name() == self.constants['t_acc'] and (j == len(templates)-1 or templates[j+1].normal_name()!= self.constants['t_acc'])):
+                    a = t.arguments[0].value
+                elif t.normal_name() == self.constants['t_ipa']:
+                    p.set_transcription(t.arguments[1].value)
+                    p.set_accent(a)
+                elif t.normal_name() == self.constants['t_snd']:
+                    p.add_sound(self.get_file_url(t.arguments[1].value), a)
+                if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] :
+                    if p.ipa != None or p.accent != None:
+                        pronunciations.append(p)
+                        p = Pronunciation()
+            i += 1
+        return pronunciations
+
+    def process_etymology(self, etyContent):
+        """Placeholder: return "Etymology" followed by a module-wide call counter."""
+        global debugEty
+        debugEty += 1
+        return "Etymology" + str(debugEty)
+
+    def process_POS(self,parsedwikitext):
+        """Return the POS abbreviation mapped to parsedwikitext, or None if it is not a known POS."""
+        pos = None
+        if parsedwikitext in self.constants['POS'].keys():
+            pos = self.constants['POS'][parsedwikitext]
+        return pos
+
+    def process_senses(self, entry, pos, sensesContent):
+        """Return Sense objects for the '#' and '##' definitions with their '#:' and '##:' examples."""
+        baseId = f"{entry}_{pos}_"
+        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+        i = 0
+        senses = []
+        nombreDef = 0
+        while i < len(l):
+            if l[i].pattern == '\\# ':
+                nombreDef += 1
+                newSense = Sense(f"{baseId}{nombreDef}")
+                newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+            elif l[i].pattern == '\\#:':
+                for j in l[i].items:
+                    k = 0
+                    isEx = 0
+                    while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+                        if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+                            newSense.add_example(self.wtp.parse(j).templates[k].arguments[-1].value) # last argument of the matched template
+                            isEx = 1
+                        k += 1
+                    if isEx == 0:
+                        newSense.add_example(self.wtp.parse(j).plain_text().strip())
+            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                senses.append(newSense)
+            cnt = 0
+            nombreSousDef = 0
+            while i < len(l) and l[i].level == 3 :
+                cnt +=1
+                if l[i].pattern == '\\## ':
+                    nombreSousDef += 1
+                    newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+                    newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+                elif l[i].pattern == '\\##:':
+                    for j in l[i].items:
+                        k = 0
+                        isEx = 0
+                        while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+                            if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+                                newSense2.add_example(self.wtp.parse(j).templates[k].arguments[-1].value) # last argument of the matched template
+                                isEx = 1
+                            k += 1
+                        if isEx == 0:
+                            newSense2.add_example(self.wtp.parse(j).plain_text().strip())
+                if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                    senses.append(newSense2)
+                i += 1
+            if cnt > 0:
+                i -= 1
+            i += 1
+        return senses
+
+if __name__ == "__main__":
+    ensk = En_en_straktor()
+    print(ensk.fetch("test"), "entries added")
diff --git a/src/Wikstraktor/parsers/fr_constants.py b/src/Wikstraktor/parsers/fr_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf821283a8791bf4adcc9efbed37903fec2ccae
--- /dev/null
+++ b/src/Wikstraktor/parsers/fr_constants.py
@@ -0,0 +1,81 @@
+string_values = {
+"ety":"étymologie",
+"pro":["prononciation", "Prononciation"],
+"en":" {{langue|en}} ",
+"fr":"Français",
+"t_deflabel":["lexique", "info lex"],
+"t_ex":["exemple"], #a list, so that `name in t_ex` tests membership and not substrings
+    #Non-existent (translated from "Inexistants"; what it refers to is not stated)
+"t_ipa":"pron", #template for transcription
+"t_snd":"écouter", #template for audio
+"t_acc":["US", "UK"], #template for accents
+
+"POS":{
+    "adjectif":["adjectif","adjectif qualificatif","adj"],
+    "adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
+    "adjectif exclamatif":["adjectif exclamatif","adj-excl","adjectif exc"],
+    "adjectif indéfini":["adjectif indéfini","adjectif ind","adj-indéf"],
+    "adjectif interrogatif":["adjectif interrogatif","adj-int","adjectif int"],
+    "adjectif numéral":["adjectif numéral","adjectif num","adj-num"],
+    "adjectif possessif":["adjectif possessif","adjectif pos","adj-pos"],
+    "adjectif relatif":["adjectif relatif","adjectif rel","adj-rel"],
+    "adverbe":["adverbe","Adverbe","adv"],
+    "adverbe indéfini":["adverbe indéfini","adv-ind","adverbe ind"],
+    "adverbe interrogatif":["adverbe interrogatif","adverbe int","adv-int"],
+    "adverbe pronominal":["adverbe pronominal","adv-pron","adverbe pro"],
+    "adverbe relatif":["adverbe relatif","adv-rel","adverbe rel"],
+    "affixe":["affixe","aff"],
+    "article":["article","art"],
+    "article défini":["article défini","article déf","art-déf"],
+    "article indéfini":["article indéfini","art-indéf","article ind"],
+    "article partitif":["article partitif","art-part","article par"],
+    "circonfixe":["circonfixe","circon","circonf"],
+    "classificateur":["classificateur","class","classif"],
+    "conjonction":["conjonction","conj"],
+    "conjonction de coordination":["conjonction de coordination","conj-coord","conjonction coo"],
+    "copule":["copule"],
+    "déterminant":["déterminant","dét"],
+    "enclitique":["enclitique","encl"],
+    "gismu":["gismu"],
+    "infixe":["infixe","inf"],
+    "interfixe":["interfixe","interf"],
+    "interjection":["interjection","interj"],
+    "lettre":["lettre"],
+    "locution":["locution","loc"],
+    "locution-phrase":["locution-phrase","loc-phr","phrase locution","phrase"],
+    "nom commun":["nom","nom commun","substantif"],
+    "nom de famille":["nom de famille","nom-fam"],
+    "nom propre":["nom propre","nom-pr"],
+    "nom scientifique":["nom scientifique","nom-sciences","nom scient","nom science"],
+    "numéral":["numéral","num","numér"],
+    "onomatopée":["onomatopée","onoma","onom"],
+    "particule":["particule","part"],
+    "particule numérale":["particule numérale","part-num","particule num"],
+    "patronyme":["patronyme"],
+    "postposition":["postposition","postpos","post"],
+    "pré-nom":["pré-nom"],
+    "pré-verbe":["pré-verbe"],
+    "préfixe":["préfixe","préf"],
+    "prénom":["prénom"],
+    "préposition":["préposition","prép"],
+    "proclitique":["proclitique","procl"],
+    "pronom":["pronom"],
+    "pronom démonstratif":["pronom démonstratif","pronom dém","pronom-dém"],
+    "pronom indéfini":["pronom indéfini","pronom ind","pronom-indéf"],
+    "pronom interrogatif":["pronom interrogatif","pronom int","pronom-int"],
+    "pronom personnel":["pronom personnel","pronom réf","pronom-per","pronom réfléchi","pronom-réfl","pronom-pers"],
+    "pronom possessif":["pronom possessif","pronom pos","pronom-pos"],
+    "pronom relatif":["pronom relatif","pronom-rel","pronom rel"],
+    "pronom-adjectif":["pronom-adjectif"],
+    "proverbe":["proverbe","prov"],
+    "quantificateur":["quantificateur","quantif"],
+    "radical":["radical","rad"],
+    "rafsi":["rafsi"],
+    "sinogramme":["sinogramme","sinog","sino"],
+    "suffixe":["suffixe","suf","suff"],
+    "symbole":["symbole","symb"],
+    "variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
+    "verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
+    "verbe":["verbe","verb"]
+    }
+}
diff --git a/src/Wikstraktor/parsers/fr_en.py b/src/Wikstraktor/parsers/fr_en.py
new file mode 100644
index 0000000000000000000000000000000000000000..175f22b6c6cc585ab887e1b3964cfb866d8a12b4
--- /dev/null
+++ b/src/Wikstraktor/parsers/fr_en.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+
+from parsers.fr_constants import string_values
+
+debugEty = 0
+
+class Fr_en_straktor(Wikstraktor):
+    """Extractor for English entries of the French Wiktionary."""
+    def __init__(self):
+        super().__init__()
+        self.wiki_language = "fr"
+        self.entry_language = "en"
+        self.constants = string_values
+        self.site = self.pwb.Site(f'wiktionary:fr')
+
+    def process_pronunciation(self, proContent):
+        """Build Pronunciation objects from the audio templates of the first list in proContent."""
+        # TODO: only works for 2-level lists, see "water" for a 3-level list
+        l = proContent.get_lists()[0]
+        i = 0
+        pronunciations = []
+        while i < len(l.fullitems):
+            p = Pronunciation()
+            templates = self.wtp.parse(l.fullitems[i]).templates
+            a = None
+            for j, t in enumerate(templates):
+                #if t.normal_name() == self.constants['t_acc']:
+                #    p.set_transcription(t.arguments[i+1].value)
+                if t.normal_name() == self.constants['t_snd']:
+                    p.add_sound(self.get_file_url(t.arguments[-1].value))
+                    if len(self.wtp.parse(t.get_arg("1").value).templates) != 1:
+                        p.set_accent(t.get_arg("1").value)
+                    else:
+                        p.set_accent(self.wtp.parse(t.get_arg("1").value).templates[0].normal_name())
+                    if t.get_arg("2") != None:
+                        p.set_transcription(t.get_arg("2").value)
+                if p.accent != None and p.sounds != []:
+                    pronunciations.append(p)
+                    p = Pronunciation()
+            i += 1
+        return pronunciations
+
+    def process_etymology(self, etyContent):
+        """Placeholder: return "Etymology" followed by a module-wide call counter."""
+        global debugEty
+        debugEty += 1
+        return "Etymology" + str(debugEty)
+
+    def process_POS(self,parsedwikitext):
+        """Return the first POS name whose alias list contains parsedwikitext, or None."""
+        pos = None
+        ik = 0
+        values = list(self.constants['POS'].values())
+        while pos == None and ik < len(self.constants['POS'].keys()):
+            if parsedwikitext in values[ik]:
+                keys = list(self.constants['POS'].keys())
+                pos = keys[ik]
+            ik += 1
+#        print(pos)
+        return pos
+
+    def process_senses(self, entry, pos, sensesContent):
+        """Return Sense objects for the '#' and '##' definitions with their '#:' and '##:' examples."""
+        baseId = f"{entry}_{pos}_"
+        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+        i = 0
+        senses = []
+        nombreDef = 0
+        while i < len(l):
+            if l[i].pattern == '\\# ':
+                nombreDef += 1
+                newSense = Sense(f"{baseId}{nombreDef}")
+                newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+            elif l[i].pattern == '\\#:':
+                for j in l[i].items:
+                    k = 0
+                    isEx = 0
+                    while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+                        if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+                            newSense.add_example(self.wtp.parse(j).templates[k].arguments[-1].value) # last argument of the matched template
+                            isEx = 1
+                        k += 1
+                    if isEx == 0:
+                        newSense.add_example(self.wtp.parse(j).plain_text().strip())
+            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                senses.append(newSense)
+            cnt = 0
+            nombreSousDef = 0
+            while i < len(l) and l[i].level == 3 :
+                cnt +=1
+                if l[i].pattern == '\\## ':
+                    nombreSousDef += 1
+                    newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+                    newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+                elif l[i].pattern == '\\##:':
+                    for j in l[i].items:
+                        k = 0
+                        isEx = 0
+                        while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+                            if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+                                newSense2.add_example(self.wtp.parse(j).templates[k].arguments[-1].value) # last argument of the matched template
+                                isEx = 1
+                            k += 1
+                        if isEx == 0:
+                            newSense2.add_example(self.wtp.parse(j).plain_text().strip())
+                if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+                    senses.append(newSense2)
+                i += 1
+            if cnt > 0:
+                i -= 1
+            i += 1
+        return senses
+
+if __name__ == "__main__":
+    ensk = Fr_en_straktor()
+    print(ensk.fetch("test"), "entries added")
+
+
diff --git a/src/Wikstraktor/wikstraktor.py b/src/Wikstraktor/wikstraktor.py
index fd478bdb5780b055b5b3547500f434709f9423be..b9780d8e58b3fb8ac2eea8b5a244c75a626375fd 100644
--- a/src/Wikstraktor/wikstraktor.py
+++ b/src/Wikstraktor/wikstraktor.py
@@ -3,7 +3,6 @@ import pywikibot
 import wikitextparser
 import importlib
 import json
-import sys
 
 #######
 # Oral
@@ -295,9 +294,9 @@ class Wikstraktor:
     def get_instance(cls, wiki_language, entry_language):
         try:
             m_name = f"{wiki_language}_{entry_language}".capitalize()
-            instance = getattr(importlib.import_module(f"{m_name.lower()}"), f"{m_name}_straktor")()
+            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
         except ModuleNotFoundError:
-            print(f"{m_name.lower()} module not found or {m_name}_straktor not found in module")
+            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
             instance = None
         return instance
 
@@ -374,14 +373,14 @@ class Wikstraktor:
             res = title in self.constants['pro']
         #print(title, res)
         return res
-    
+
     def isEty(self, title):
         if type(self.constants['ety']) == str:
             res = title == self.constants['ety']
         else:
             res = title in self.constants['ety']
         return res
-    
+
     def process_POS(self, parsedwikitext):
         pass#in subclass
 
@@ -395,24 +394,45 @@ class Wikstraktor:
         pass#in subclass
 
     def __str__(self):
+        return self.export()
+
+    def export(self, ascii=False, compact=False):
+        """Return the entries as JSON: ASCII-escaped if ascii, indented by 4 unless compact."""
         res = []
         for e in self.entries:
             res.append(e.serializable())
-        return json.dumps(res)
-    
+        if compact:
+            return json.dumps(res, ensure_ascii=ascii)
+        else:
+            return json.dumps(res, ensure_ascii=ascii, indent=4)
 
 if __name__ == "__main__":
-    #e = Wikstraktor.get_instance('en', "en")
-    f = Wikstraktor.get_instance('en', 'en')
-    # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
-    # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
-    #e.fetch("water")
-    #f.fetch('blue')
-    #file_path = 'test.json'
-    #fichier = open(file_path, "w", encoding='utf-8')
-    #fichier.write(str(f))
-    #fichier.close()
-    #fichier.write(str(f))
-    word = sys.argv[1]
-    f.fetch(word)
-    print(str(f))
+    import argparse
+    from argparse import RawTextHelpFormatter # for the formatting of the help text
+    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
+    \033[1m\033[32mex :\033[0m
+    ‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
+    ‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
+    ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
+    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en")
+    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en")
+    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
+    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
+    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
+    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
+    args = parser.parse_args()
+    if args.mot is not None:
+        w = Wikstraktor.get_instance(args.wiki_language, args.language)
+        resp = None
+        # get_instance returns None when the parser module for the language pair is not found
+        if w is not None and w.fetch(args.mot) > 0:
+            resp = w.export(args.force_ascii, args.compact)
+        if args.destination_file is not None:
+            # resp stays None when nothing was extracted; f.write(None) would raise TypeError
+            if resp is not None:
+                with open(args.destination_file, "w", encoding="utf-8") as f:
+                    f.write(resp)
+        else:
+            print(resp)
+    else:
+        raise NameError("Pas de mot demandé")