Skip to content
Snippets Groups Projects
Commit da06f1d5 authored by Pierre Fleutot's avatar Pierre Fleutot
Browse files

Mise à jour version en dur de wikstraktor et modif commande du manager...

Mise à jour version en dur de wikstraktor et modif commande du manager wiktionnaire pour utiliser le param force ASCII de la commande python
parent dbbc051b
No related branches found
No related tags found
No related merge requests found
...@@ -36,3 +36,4 @@ src/Wikstraktor/throttle.ctrl ...@@ -36,3 +36,4 @@ src/Wikstraktor/throttle.ctrl
src/Wikstraktor/apicache* src/Wikstraktor/apicache*
src/Wikstraktor/__pycache* src/Wikstraktor/__pycache*
src/Wikstraktor/test.json src/Wikstraktor/test.json
src/Wikstraktor/parsers/__pycache*
...@@ -43,7 +43,7 @@ class WiktionaryManager ...@@ -43,7 +43,7 @@ class WiktionaryManager
if ($language != 'en') { if ($language != 'en') {
return []; return [];
} }
$result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' ' . $word . ' 2>&1'); $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' -m ' . $word . ' -A -C 2>&1');
//var_dump($result);die(); //var_dump($result);die();
$dataArray = json_decode($result, true); $dataArray = json_decode($result, true);
......
{
"Headword":"sleep",
"Items":[
{
"PartOfSpeech":"verb",
"Pronunciation":[
{
"api":"\\ˈsliːp\\ ",
"hiragana":"",
"katakana":"",
"bopomofo":"",
"pinyin":"",
"romaji":"",
"accent1":"RP",
"url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav"
}
],
"Senses":[
{
"Translations":[
"translation1",
"...",
"translationn"
],
"Image":[
"Stilles Mineralwasser.jpg",
"..."
],
"Definition":"blabla",
"Examples":[
"blabla",
"blabli",
"blablou"
],
"subSense":[
{
"subdef":"blabla",
"subex":[
"subexa",
"subexb",
"subexz"
]
}
]
}
]
}
]
}
/*adapter à la généricité des données ?
\"User-defined fields 1\"
\"User-defined fields 2\"
\"User-defined fields 3\"
\"User-defined fields 4\"
\"User-defined fields 5\"
\"Supplementary field for devs 1\"
\"Supplementary field for devs 2\"
\"Supplementary field for devs 3\"
\"Supplementary field for devs 4\"
\"Supplementary field for devs 5\"
...
\"Supplementary field for devs 10\" */
"lemma"
"partOfSpeech" :
{
"sense" :
{
"definition"
}
}
# Constants used to parse the English Wiktionary: section titles, template
# names and the mapping from part-of-speech headings to short tags.
string_values = {
    # Section titles and language names
    "ety": "Etymology",
    "pro": "Pronunciation",
    "en": "English",
    "fr": "French",

    # Template names
    "t_ipa": "IPA",  # template for transcription
    "t_snd": "audio",  # template for audio
    "t_acc": "a",  # template for accents
    "t_deflabel": "lb",
    "t_ex": ["ux", "usex"],

    # Part-of-speech heading -> abbreviation
    # https://en.wiktionary.org/wiki/Wiktionary:POS
    "POS": {
        "Adjective": "Adj",
        "Adverb": "Adv",
        "Ambiposition": "Ambip",
        "Article": "Art",
        "Circumposition": "Circump",
        "Classifier": "Class",
        "Conjunction": "Conj",
        "Contraction": "Cont",
        "Counter": "Count",
        "Determiner": "Det",
        "Ideophone": "Ideophone",
        "Interjection": "Interj",
        "Noun": "N",
        "Numeral": "Num",
        "Participle": "Part",
        "Particle": "Particle",
        "Postposition": "Postp",
        "Preposition": "Prep",
        "Pronoun": "Pro",
        "Proper noun": "NP",
        "Verb": "V",  # TODO: complete the list
    },
}
#!/usr/bin/env python3
from wikstraktor import Wikstraktor, Pronunciation, Sense
from parsers.en_constants import string_values
# Module-level counter: process_etymology increments it and appends it to the
# placeholder string it returns ("Etymology1", "Etymology2", ...).
debugEty = 0
class En_en_straktor(Wikstraktor):
    """Extractor for English entries read from the English Wiktionary.

    Supplies the language-specific hooks (pronunciation, etymology, part of
    speech, senses) on top of the generic ``Wikstraktor`` base class.
    """

    def __init__(self):
        super().__init__()
        self.wiki_language = "en"
        self.entry_language = "en"
        self.constants = string_values
        self.site = self.pwb.Site('wiktionary:en')

    def process_pronunciation(self, proContent):
        """Build the list of ``Pronunciation`` objects of a pronunciation section.

        Only the first list of the section is read.  Inside each list item
        the templates are scanned in order: an accent template sets the
        current accent, an IPA template sets the transcription, an audio
        template adds a sound file.  The pronunciation being built is stored
        when the item ends or when the next template starts a new accent.

        Returns an empty list when the section contains no list.
        """
        # TODO: only handles two-level lists; see "water" for a three-level one
        lists = proContent.get_lists()
        if not lists:
            return []
        accent_name = self.constants['t_acc']
        pronunciations = []
        for fullitem in lists[0].fullitems:
            p = Pronunciation()
            templates = self.wtp.parse(fullitem).templates
            last = len(templates) - 1
            a = None
            for j, t in enumerate(templates):
                name = t.normal_name()
                # Look ahead only when a following template exists; an accent
                # template in last position used to raise IndexError here.
                next_is_accent = (
                    j < last
                    and templates[j + 1].normal_name() == accent_name
                )
                if name == accent_name and not next_is_accent:
                    a = t.arguments[0].value
                elif name == self.constants['t_ipa']:
                    p.set_transcription(t.arguments[1].value)
                    p.set_accent(a)
                elif name == self.constants['t_snd']:
                    p.add_sound(self.get_file_url(t.arguments[1].value), a)
                if j == last or next_is_accent:
                    if p.ipa is not None or p.accent is not None:
                        pronunciations.append(p)
                        p = Pronunciation()
        return pronunciations

    def process_etymology(self, etyContent):
        """Return a numbered placeholder string; the content is not parsed yet."""
        global debugEty
        debugEty += 1
        return "Etymology" + str(debugEty)

    def process_POS(self, parsedwikitext):
        """Return the tag mapped to a part-of-speech heading, or None if unknown."""
        return self.constants['POS'].get(parsedwikitext)

    def _plain_text(self, wikitext):
        """Return ``wikitext`` converted to stripped plain text."""
        return self.wtp.parse(wikitext).plain_text().strip()

    def _example_text(self, item):
        """Return the example carried by one example list item.

        The value of the last argument of the first example template found in
        the item is used; without such a template the plain text of the item
        is returned.
        """
        parsed = self.wtp.parse(item)  # parse once instead of once per test
        example_names = self.constants['t_ex']
        if isinstance(example_names, str):
            # A bare string would turn ``in`` into a substring test.
            example_names = (example_names,)
        for template in parsed.templates:
            if template.normal_name() in example_names and template.arguments:
                # Read the template that matched, not the first of the item.
                # NOTE(review): the last argument may be a named one —
                # confirm it is always the example text.
                return template.arguments[-1].value
        return parsed.plain_text().strip()

    @staticmethod
    def _store_once(senses, sense):
        """Append ``sense`` to ``senses`` unless it is None or already stored."""
        if sense is not None and not any(s is sense for s in senses):
            senses.append(sense)

    def process_senses(self, entry, pos, sensesContent):
        """Return the senses (and sub-senses) found in a part-of-speech section.

        ``# `` lists hold definitions and ``#:`` lists their examples; lists
        whose level is 3 (``## `` / ``##:``) hold sub-definitions and their
        examples.  Sense ids are ``{entry}_{pos}_{n}`` and sub-sense ids
        ``{entry}_{pos}_{n}_{m}``.  Sub-senses are returned in the same flat
        list, right after their parent.
        """
        baseId = f"{entry}_{pos}_"
        lists = sensesContent.get_lists(('\\# ', '\\#:', '\\## ', '\\##:'))
        senses = []
        sense = None  # definition currently being filled
        sub_sense = None  # sub-definition currently being filled
        def_count = 0
        sub_count = 0
        for idx, wl in enumerate(lists):
            next_pattern = lists[idx + 1].pattern if idx + 1 < len(lists) else None
            if wl.level == 3:
                if wl.pattern == '\\## ':
                    # One list can hold several consecutive sub-definitions.
                    for item in wl.items:
                        self._store_once(senses, sub_sense)
                        sub_count += 1
                        sub_sense = Sense(f"{baseId}{def_count}_{sub_count}")
                        sub_sense.add_def(self.wiki_language, self._plain_text(item))
                elif wl.pattern == '\\##:' and sub_sense is not None:
                    for item in wl.items:
                        sub_sense.add_example(self._example_text(item))
                # The sub-sense is complete once no example list follows it.
                if next_pattern != '\\##:':
                    self._store_once(senses, sub_sense)
            else:
                if wl.pattern == '\\# ':
                    # One list can hold several consecutive definitions:
                    # every item becomes its own sense.
                    for item in wl.items:
                        self._store_once(senses, sense)
                        def_count += 1
                        sub_count = 0
                        sub_sense = None
                        sense = Sense(f"{baseId}{def_count}")
                        sense.add_def(self.wiki_language, self._plain_text(item))
                elif wl.pattern == '\\#:' and sense is not None:
                    for item in wl.items:
                        sense.add_example(self._example_text(item))
                # Stored before its sub-senses; _store_once keeps it unique if
                # further examples reach it later.
                if next_pattern != '\\#:':
                    self._store_once(senses, sense)
        return senses
if __name__ == "__main__":
    # Manual check: fetch the entry "test" and report how many entries were added.
    extractor = En_en_straktor()
    added = extractor.fetch("test")
    print(added, "entries added")
# Constants used to parse the French Wiktionary: section titles, template
# names and, for every part of speech, the aliases accepted as section name.
string_values = {
    "ety": "étymologie",
    "pro": ["prononciation", "Prononciation"],
    "en": " {{langue|en}} ",
    "fr": "Français",
    "t_deflabel": ["lexique", "info lex"],
    "t_ex": "exemple",
    # Flagged as nonexistent ("inexistants") by the author — presumably no
    # one-to-one French equivalent; TODO confirm
    "t_ipa": "pron",  # template for transcription
    "t_snd": "écouter",  # template for audio
    "t_acc": ["US", "UK"],  # template for accents
    # Canonical part of speech -> accepted section names
    "POS": {
        "adjectif": ["adjectif", "adjectif qualificatif", "adj"],
        "adjectif démonstratif": ["adjectif démonstratif", "adj-dém", "adjectif dém"],
        "adjectif exclamatif": ["adjectif exclamatif", "adj-excl", "adjectif exc"],
        "adjectif indéfini": ["adjectif indéfini", "adjectif ind", "adj-indéf"],
        "adjectif interrogatif": ["adjectif interrogatif", "adj-int", "adjectif int"],
        "adjectif numéral": ["adjectif numéral", "adjectif num", "adj-num"],
        "adjectif possessif": ["adjectif possessif", "adjectif pos", "adj-pos"],
        "adjectif relatif": ["adjectif relatif", "adjectif rel", "adj-rel"],
        # Lower-case form added; the capitalised one is kept for compatibility.
        "adverbe": ["adverbe", "Adverbe", "adv"],
        "adverbe indéfini": ["adverbe indéfini", "adv-ind", "adverbe ind"],
        "adverbe interrogatif": ["adverbe interrogatif", "adverbe int", "adv-int"],
        "adverbe pronominal": ["adverbe pronominal", "adv-pron", "adverbe pro"],
        "adverbe relatif": ["adverbe relatif", "adv-rel", "adverbe rel"],
        "affixe": ["affixe", "aff"],
        "article": ["article", "art"],
        "article défini": ["article défini", "article déf", "art-déf"],
        "article indéfini": ["article indéfini", "art-indéf", "article ind"],
        "article partitif": ["article partitif", "art-part", "article par"],
        "circonfixe": ["circonfixe", "circon", "circonf"],
        "classificateur": ["classificateur", "class", "classif"],
        "conjonction": ["conjonction", "conj"],
        "conjonction de coordination": ["conjonction de coordination", "conj-coord", "conjonction coo"],
        "copule": ["copule"],
        "déterminant": ["déterminant", "dét"],
        "enclitique": ["enclitique", "encl"],
        "gismu": ["gismu"],
        "infixe": ["infixe", "inf"],
        "interfixe": ["interfixe", "interf"],
        "interjection": ["interjection", "interj"],
        "lettre": ["lettre"],
        "locution": ["locution", "loc"],
        "locution-phrase": ["locution-phrase", "loc-phr", "phrase locution", "phrase"],
        "nom commun": ["nom", "nom commun", "substantif"],
        "nom de famille": ["nom de famille", "nom-fam"],
        "nom propre": ["nom propre", "nom-pr"],
        "nom scientifique": ["nom scientifique", "nom-sciences", "nom scient", "nom science"],
        "numéral": ["numéral", "num", "numér"],
        "onomatopée": ["onomatopée", "onoma", "onom"],
        "particule": ["particule", "part"],
        "particule numérale": ["particule numérale", "part-num", "particule num"],
        "patronyme": ["patronyme"],
        "postposition": ["postposition", "postpos", "post"],
        "pré-nom": ["pré-nom"],
        "pré-verbe": ["pré-verbe"],
        "préfixe": ["préfixe", "préf"],
        "prénom": ["prénom"],
        "préposition": ["préposition", "prép"],
        "proclitique": ["proclitique", "procl"],
        "pronom": ["pronom"],
        "pronom démonstratif": ["pronom démonstratif", "pronom dém", "pronom-dém"],
        "pronom indéfini": ["pronom indéfini", "pronom ind", "pronom-indéf"],
        "pronom interrogatif": ["pronom interrogatif", "pronom int", "pronom-int"],
        "pronom personnel": ["pronom personnel", "pronom réf", "pronom-per", "pronom réfléchi", "pronom-réfl", "pronom-pers"],
        "pronom possessif": ["pronom possessif", "pronom pos", "pronom-pos"],
        "pronom relatif": ["pronom relatif", "pronom-rel", "pronom rel"],
        "pronom-adjectif": ["pronom-adjectif"],
        "proverbe": ["proverbe", "prov"],
        "quantificateur": ["quantificateur", "quantif"],
        "radical": ["radical", "rad"],
        "rafsi": ["rafsi"],
        "sinogramme": ["sinogramme", "sinog", "sino"],
        "suffixe": ["suffixe", "suf", "suff"],
        "symbole": ["symbole", "symb"],
        "variante par contrainte typographique": ["variante typographique", "variante typo", "variante par contrainte typographique", "var-typo"],
        "verbe pronominal": ["verbe pronominal", "verb-pr", "verbe pr"],
        "verbe": ["verbe", "verb"],
    },
}
#!/usr/bin/env python3
from wikstraktor import Wikstraktor, Pronunciation, Sense
from parsers.fr_constants import string_values
# Module-level counter: process_etymology increments it and appends it to the
# placeholder string it returns ("Etymology1", "Etymology2", ...).
debugEty = 0
class Fr_en_straktor(Wikstraktor):
    """Extractor for English entries read from the French Wiktionary.

    Supplies the language-specific hooks (pronunciation, etymology, part of
    speech, senses) on top of the generic ``Wikstraktor`` base class.
    """

    def __init__(self):
        super().__init__()
        self.wiki_language = "fr"
        self.entry_language = "en"
        self.constants = string_values
        self.site = self.pwb.Site('wiktionary:fr')

    def process_pronunciation(self, proContent):
        """Build the list of ``Pronunciation`` objects of a pronunciation section.

        Only the first list of the section is read.  Each audio template
        yields one pronunciation: its last argument is the sound file, its
        positional argument 1 the accent and, when present, its positional
        argument 2 the transcription.  Audio templates without argument 1 are
        skipped.

        Returns an empty list when the section contains no list.
        """
        # TODO: only handles two-level lists; see "water" for a three-level one
        lists = proContent.get_lists()
        if not lists:
            return []
        pronunciations = []
        for fullitem in lists[0].fullitems:
            p = Pronunciation()
            for t in self.wtp.parse(fullitem).templates:
                if t.normal_name() == self.constants['t_snd']:
                    region = t.get_arg("1")
                    if region is None:
                        # No accent given: reading ``.value`` would raise.
                        continue
                    p.add_sound(self.get_file_url(t.arguments[-1].value))
                    region_templates = self.wtp.parse(region.value).templates
                    if len(region_templates) != 1:
                        p.set_accent(region.value)
                    else:
                        # The accent is itself written as a single template:
                        # keep its name.
                        p.set_accent(region_templates[0].normal_name())
                    transcription = t.get_arg("2")
                    if transcription is not None:
                        p.set_transcription(transcription.value)
                # NOTE(review): stored per template, so every audio template
                # gets its own Pronunciation — confirm this is intended.
                if p.accent is not None and p.sounds != []:
                    pronunciations.append(p)
                    p = Pronunciation()
        return pronunciations

    def process_etymology(self, etyContent):
        """Return a numbered placeholder string; the content is not parsed yet."""
        global debugEty
        debugEty += 1
        return "Etymology" + str(debugEty)

    def process_POS(self, parsedwikitext):
        """Return the first part of speech listing this section name, or None."""
        for pos, aliases in self.constants['POS'].items():
            if parsedwikitext in aliases:
                return pos
        return None

    def _plain_text(self, wikitext):
        """Return ``wikitext`` converted to stripped plain text."""
        return self.wtp.parse(wikitext).plain_text().strip()

    def _example_text(self, item):
        """Return the example carried by one example list item.

        The value of the last argument of the first example template found in
        the item is used; without such a template the plain text of the item
        is returned.
        """
        parsed = self.wtp.parse(item)  # parse once instead of once per test
        example_names = self.constants['t_ex']
        if isinstance(example_names, str):
            # A bare string would turn ``in`` into a substring test.
            example_names = (example_names,)
        for template in parsed.templates:
            if template.normal_name() in example_names and template.arguments:
                # Read the template that matched, not the first of the item.
                # NOTE(review): the last argument may be a named one —
                # confirm it is always the example text.
                return template.arguments[-1].value
        return parsed.plain_text().strip()

    @staticmethod
    def _store_once(senses, sense):
        """Append ``sense`` to ``senses`` unless it is None or already stored."""
        if sense is not None and not any(s is sense for s in senses):
            senses.append(sense)

    def process_senses(self, entry, pos, sensesContent):
        """Return the senses (and sub-senses) found in a part-of-speech section.

        ``# `` lists hold definitions and ``#:`` lists their examples; lists
        whose level is 3 (``## `` / ``##:``) hold sub-definitions and their
        examples.  Sense ids are ``{entry}_{pos}_{n}`` and sub-sense ids
        ``{entry}_{pos}_{n}_{m}``.  Sub-senses are returned in the same flat
        list, right after their parent.
        """
        baseId = f"{entry}_{pos}_"
        lists = sensesContent.get_lists(('\\# ', '\\#:', '\\## ', '\\##:'))
        senses = []
        sense = None  # definition currently being filled
        sub_sense = None  # sub-definition currently being filled
        def_count = 0
        sub_count = 0
        for idx, wl in enumerate(lists):
            next_pattern = lists[idx + 1].pattern if idx + 1 < len(lists) else None
            if wl.level == 3:
                if wl.pattern == '\\## ':
                    # One list can hold several consecutive sub-definitions.
                    for item in wl.items:
                        self._store_once(senses, sub_sense)
                        sub_count += 1
                        sub_sense = Sense(f"{baseId}{def_count}_{sub_count}")
                        sub_sense.add_def(self.wiki_language, self._plain_text(item))
                elif wl.pattern == '\\##:' and sub_sense is not None:
                    for item in wl.items:
                        sub_sense.add_example(self._example_text(item))
                # The sub-sense is complete once no example list follows it.
                if next_pattern != '\\##:':
                    self._store_once(senses, sub_sense)
            else:
                if wl.pattern == '\\# ':
                    # One list can hold several consecutive definitions:
                    # every item becomes its own sense.
                    for item in wl.items:
                        self._store_once(senses, sense)
                        def_count += 1
                        sub_count = 0
                        sub_sense = None
                        sense = Sense(f"{baseId}{def_count}")
                        sense.add_def(self.wiki_language, self._plain_text(item))
                elif wl.pattern == '\\#:' and sense is not None:
                    for item in wl.items:
                        sense.add_example(self._example_text(item))
                # Stored before its sub-senses; _store_once keeps it unique if
                # further examples reach it later.
                if next_pattern != '\\#:':
                    self._store_once(senses, sense)
        return senses
if __name__ == "__main__":
    # Manual check: fetch the entry "test" and report how many entries were added.
    extractor = Fr_en_straktor()
    added = extractor.fetch("test")
    print(added, "entries added")
...@@ -3,7 +3,6 @@ import pywikibot ...@@ -3,7 +3,6 @@ import pywikibot
import wikitextparser import wikitextparser
import importlib import importlib
import json import json
import sys
####### #######
# Oral # Oral
...@@ -295,9 +294,9 @@ class Wikstraktor: ...@@ -295,9 +294,9 @@ class Wikstraktor:
def get_instance(cls, wiki_language, entry_language): def get_instance(cls, wiki_language, entry_language):
try: try:
m_name = f"{wiki_language}_{entry_language}".capitalize() m_name = f"{wiki_language}_{entry_language}".capitalize()
instance = getattr(importlib.import_module(f"{m_name.lower()}"), f"{m_name}_straktor")() instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
except ModuleNotFoundError: except ModuleNotFoundError:
print(f"{m_name.lower()} module not found or {m_name}_straktor not found in module") print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
instance = None instance = None
return instance return instance
...@@ -374,14 +373,14 @@ class Wikstraktor: ...@@ -374,14 +373,14 @@ class Wikstraktor:
res = title in self.constants['pro'] res = title in self.constants['pro']
#print(title, res) #print(title, res)
return res return res
def isEty(self, title): def isEty(self, title):
if type(self.constants['ety']) == str: if type(self.constants['ety']) == str:
res = title == self.constants['ety'] res = title == self.constants['ety']
else: else:
res = title in self.constants['ety'] res = title in self.constants['ety']
return res return res
def process_POS(self, parsedwikitext): def process_POS(self, parsedwikitext):
pass#in subclass pass#in subclass
...@@ -395,24 +394,42 @@ class Wikstraktor: ...@@ -395,24 +394,42 @@ class Wikstraktor:
pass#in subclass pass#in subclass
def __str__(self): def __str__(self):
return self.export()
def export(self, ascii=False, compact=False):
res = [] res = []
for e in self.entries: for e in self.entries:
res.append(e.serializable()) res.append(e.serializable())
return json.dumps(res) if compact:
return json.dumps(res, ensure_ascii=ascii)
else:
return json.dumps(res, ensure_ascii=ascii, indent=4)
if __name__ == "__main__": if __name__ == "__main__":
#e = Wikstraktor.get_instance('en', "en") import argparse
f = Wikstraktor.get_instance('en', 'en') from argparse import RawTextHelpFormatter #pour le formattage de l'aide
# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) \033[1m\033[32mex :\033[0m
#e.fetch("water") \033[0m\033[32m./wikstraktor.py -m blue\033[0m
#f.fetch('blue') \033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
#file_path = 'test.json' \033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
#fichier = open(file_path, "w", encoding='utf-8') parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en")
#fichier.write(str(f)) parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en")
#fichier.close() parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
#fichier.write(str(f)) parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
word = sys.argv[1] parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
f.fetch(word) parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
print(str(f)) args = parser.parse_args()
# Fetch the requested word, then write the JSON export to the destination
# file when one was given, or print it otherwise.
if args.mot is not None:
    w = Wikstraktor.get_instance(args.wiki_language, args.language)
    resp = None
    if w.fetch(args.mot) > 0:
        resp = w.export(args.force_ascii, args.compact)
    if args.destination_file is None:
        print(resp)
    elif resp is not None:
        # The context manager closes the file (``f.close`` without
        # parentheses never did); UTF-8 keeps non-ASCII JSON writable
        # whatever the locale is.
        with open(args.destination_file, "w", encoding="utf-8") as f:
            f.write(resp)
    # With a destination file but no fetched entry nothing is written:
    # ``f.write(None)`` would raise TypeError.
else:
    raise NameError("Pas de mot demandé")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment