Mise à jour version en dur de wikstraktor et modif commande du manager...

Mise à jour version en dur de wikstraktor et modif commande du manager wiktionnaire pour utiliser le param force ASCII de la commande python

Mise à jour version en dur de wikstraktor et modif commande du manager...
Mise à jour version en dur de wikstraktor et modif commande du manager wiktionnaire pour utiliser le param force ASCII de la commande python
da06f1d5 · Pierre Fleutot · dbbc051b · da06f1d5 · da06f1d5 · da06f1d5
Commit da06f1d5 authored 2 years ago by Pierre Fleutot
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ src/Wikstraktor/throttle.ctrl
 src/Wikstraktor/apicache*
 src/Wikstraktor/__pycache*
 src/Wikstraktor/test.json
+src/Wikstraktor/parsers/__pycache*
--- a/src/Manager/WiktionaryManager.php
+++ b/src/Manager/WiktionaryManager.php
@@ -43,7 +43,7 @@ class WiktionaryManager
        if ($language != 'en') {
            return [];
        }
-        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' ' . $word . ' 2>&1');
+        $result = exec($_ENV['WIKSTRAKTOR_COMMAND'] . ' -m ' . $word . ' -A -C 2>&1');
 //var_dump($result);die();
        $dataArray = json_decode($result, true);

--- a/src/Wikstraktor/parsers/Structure_json.json
+++ b/src/Wikstraktor/parsers/Structure_json.json
+{
+    "Headword":"sleep",
+    "Items":[
+        {
+            "PartOfSpeech":"verb",
+            "Pronunciation":[
+                {
+                    "api":"\\ˈsliːp\\ ",
+                    "hiragana":"",
+                    "katakana":"",
+                    "bopomofo":"",
+                    "pinyin":"",
+                    "romaji":"",
+                    "accent1":"RP",
+                    "url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav"
+                }
+            ],
+            "Senses":[
+                {
+                    "Translations":[
+                        "translation1",
+                        "...",
+                        "translationn"
+                    ],
+                    "Image":[
+                        "Stilles Mineralwasser.jpg",
+                        "..."
+                    ],
+                    "Definition":"blabla",
+                    "Examples":[
+                        "blabla",
+                        "blabli",
+                        "blablou"
+                    ],
+                    "subSense":[
+                        {
+                            "subdef":"blabla",
+                            "subex":[
+                                "subexa",
+                                "subexb",
+                                "subexz"
+                            ]
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
+/*adapter à la généricité des données ?
+		\"User-defined fields 1\"
+		\"User-defined fields 2\"
+		\"User-defined fields 3\"
+		\"User-defined fields 4\"
+		\"User-defined fields 5\"
+		\"Supplementary field for devs 1\"
+		\"Supplementary field for devs 2\"
+		\"Supplementary field for devs 3\"
+		\"Supplementary field for devs 4\"
+		\"Supplementary field for devs 5\"
+		...
+		\"Supplementary field for devs 10\" */
--- a/src/Wikstraktor/parsers/Structure_minimale.json
+++ b/src/Wikstraktor/parsers/Structure_minimale.json
+"lemma" 
+"partOfSpeech"  : 
+{
+	"sense" :
+	{
+		"definition"
+	}
+}
--- a/src/Wikstraktor/parsers/en_constants.py
+++ b/src/Wikstraktor/parsers/en_constants.py
+# String constants describing the English Wiktionary: section titles,
+# template names and the mapping from part-of-speech titles to short labels.
+string_values = {
+	"ety":"Etymology", #title compared against by isEty
+	"pro":"Pronunciation", #title compared against by isPro
+	"en":"English",
+	"fr":"French",
+	"t_ipa":"IPA", #template for transcription
+	"t_snd":"audio", #template for audio
+	"t_acc":"a", #template for accents
+	"t_deflabel":"lb",
+	"t_ex":["ux", "usex"], #templates recognised as examples in process_senses
+	"POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS
+		"Adjective":"Adj",
+		"Adverb":"Adv",
+		"Ambiposition":"Ambip",
+		"Article":"Art",
+		"Circumposition":"Circump",
+		"Classifier":"Class",
+		"Conjunction":"Conj",
+		"Contraction":"Cont",
+		"Counter":"Count",
+		"Determiner":"Det",
+		"Ideophone":"Ideophone",
+		"Interjection":"Interj",
+		"Noun":"N",
+		"Numeral":"Num",
+		"Participle":"Part",
+		"Particle":"Particle",
+		"Postposition":"Postp",
+		"Preposition":"Prep",
+		"Pronoun":"Pro",
+		"Proper noun":"NP",
+		"Verb":"V" # TODO: complete the list
+	}
+}
--- a/src/Wikstraktor/parsers/en_en.py
+++ b/src/Wikstraktor/parsers/en_en.py
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+from parsers.en_constants import string_values
+debugEty = 0
+class En_en_straktor(Wikstraktor):
+	def __init__(self):
+		"""Configure the extractor for English entries read from the English Wiktionary."""
+		super().__init__()
+		self.constants = string_values
+		self.entry_language = "en"
+		self.wiki_language = "en"
+		self.site = self.pwb.Site('wiktionary:en')
+	def process_pronunciation(self, proContent):
+		"""Extract the pronunciations of a pronunciation section.
+
+		Reads the first wiki list of ``proContent``. Inside each list item, an
+		accent template (constants['t_acc']) gives the accent applied to the
+		following transcription template (constants['t_ipa']) and audio
+		templates (constants['t_snd']); of several consecutive accent
+		templates only the last one is kept.
+
+		Returns a list of Pronunciation objects, empty when the section
+		contains no list.
+		"""
+		# TODO: only works for two-level lists, see "water" for three levels
+		lists = proContent.get_lists()
+		if not lists:
+			return []
+		accent_name = self.constants['t_acc']
+		pronunciations = []
+		for item in lists[0].fullitems:
+			p = Pronunciation()
+			templates = self.wtp.parse(item).templates
+			last = len(templates) - 1
+			a = None
+			for j, t in enumerate(templates):
+				name = t.normal_name()
+				# j < last guards templates[j+1]: an accent template closing the item has no successor
+				accent_follows = j < last and templates[j+1].normal_name() == accent_name
+				if name == accent_name and not accent_follows:
+					a = t.arguments[0].value
+				elif name == self.constants['t_ipa']:
+					p.set_transcription(t.arguments[1].value)
+					p.set_accent(a)
+				elif name == self.constants['t_snd']:
+					p.add_sound(self.get_file_url(t.arguments[1].value), a)
+				# a pronunciation is complete at the end of the item or right before the next accent template
+				if j == last or accent_follows:
+					if p.ipa is not None or p.accent is not None:
+						pronunciations.append(p)
+						p = Pronunciation()
+		return pronunciations
+	def process_etymology(self, etyContent):
+		"""Placeholder: ignores etyContent, bumps the module-level debugEty counter and returns "Etymology" followed by its new value."""
+		global debugEty
+		debugEty = debugEty + 1
+		return f"Etymology{debugEty}"
+	def process_POS(self,parsedwikitext):
+		"""Return the label stored under ``parsedwikitext`` in constants['POS'], or None when it is not a known key."""
+		return self.constants['POS'].get(parsedwikitext)
+	def process_senses(self, entry, pos, sensesContent):
+		"""Build the Sense objects found in ``sensesContent``.
+
+		Walks the wiki lists matching the four patterns requested below:
+		a "# " list starts a sense, a "#:" list adds examples to it, a "## "
+		list starts a sub-sense and a "##:" list adds examples to that
+		sub-sense. Sense ids are "{entry}_{pos}_{n}", sub-sense ids
+		"{entry}_{pos}_{n}_{m}".
+
+		Returns one flat list holding senses and sub-senses.
+		"""
+		baseId = f"{entry}_{pos}_"
+		l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+		i = 0
+		senses = []
+		nombreDef = 0
+		while i < len(l):
+			if l[i].pattern == '\\# ':
+				nombreDef += 1
+				newSense = Sense(f"{baseId}{nombreDef}")
+				# only the first item of the list is used as the definition text
+				newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+			elif l[i].pattern == '\\#:':
+				# NOTE(review): newSense is unbound here if no "# " list came first - confirm this cannot happen
+				for j in l[i].items:
+					k = 0
+					isEx = 0
+					# look for the first template whose name is listed in constants['t_ex']
+					while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+						if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+							# NOTE(review): reads templates[0] although the match was found at index k - confirm intent
+							newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+							isEx = 1
+						k += 1
+					# no example template found: keep the plain text of the item
+					if isEx == 0:
+						newSense.add_example(self.wtp.parse(j).plain_text().strip())
+			# the current sense is stored once the next list opens a new sense or sub-sense, or at the end
+			# NOTE(review): this test also runs when l[i] is itself a sub-sense list, which looks like it can append newSense twice - confirm
+			if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+				senses.append(newSense)
+			cnt = 0
+			nombreSousDef = 0
+			# consume the consecutive lists of level 3 (the "## " and "##:" patterns are handled here)
+			while i < len(l) and l[i].level == 3 :
+				cnt +=1
+				if l[i].pattern == '\\## ':
+					nombreSousDef += 1
+					newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+					newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+				elif l[i].pattern == '\\##:':
+					for j in l[i].items:
+						k = 0
+						isEx = 0
+						while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+							if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+								# NOTE(review): same templates[0] versus templates[k] question as above
+								newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+								isEx = 1
+							k += 1
+						if isEx == 0:
+							newSense2.add_example(self.wtp.parse(j).plain_text().strip())
+				if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+					senses.append(newSense2)
+				i += 1
+			# the inner loop already moved past its last list: step back so the increment below lands on the next one
+			if cnt > 0:
+				i -= 1
+			i += 1
+		return senses
+if __name__ == "__main__":
+	# manual check: fetch the entry "test" and print the value returned by fetch
+	extractor = En_en_straktor()
+	added = extractor.fetch("test")
+	print(added, "entries added")
--- a/src/Wikstraktor/parsers/fr_constants.py
+++ b/src/Wikstraktor/parsers/fr_constants.py
+# String constants describing the French Wiktionary: section titles,
+# template names and, under "POS", the aliases accepted for each
+# part-of-speech label (process_POS returns the key whose list contains the alias).
+string_values = {
+"ety":"étymologie",
+"pro":["prononciation", "Prononciation"],
+"en":" {{langue|en}} ",
+"fr":"Français",
+"t_deflabel":["lexique", "info lex"],
+# NOTE(review): a plain string makes `name in constants['t_ex']` a substring test - confirm a list is not expected
+"t_ex":"exemple",
+	#Non-existent
+"t_ipa":"pron", #template for transcription
+"t_snd":"écouter", #template for audio
+"t_acc":["US", "UK"], #template for accents
+"POS":{
+	"adjectif":["adjectif","adjectif qualificatif","adj"],
+	"adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
+	"adjectif exclamatif":["adjectif exclamatif","adj-excl","adjectif exc"],
+	"adjectif indéfini":["adjectif indéfini","adjectif ind","adj-indéf"],
+	"adjectif interrogatif":["adjectif interrogatif","adj-int","adjectif int"],
+	"adjectif numéral":["adjectif numéral","adjectif num","adj-num"],
+	"adjectif possessif":["adjectif possessif","adjectif pos","adj-pos"],
+	"adjectif relatif":["adjectif relatif","adjectif rel","adj-rel"],
+	# lower-case alias added for consistency with the other entries; the capitalised one is kept for compatibility
+	"adverbe":["adverbe","Adverbe","adv"],
+	"adverbe indéfini":["adverbe indéfini","adv-ind","adverbe ind"],
+	"adverbe interrogatif":["adverbe interrogatif","adverbe int","adv-int"],
+	"adverbe pronominal":["adverbe pronominal","adv-pron","adverbe pro"],
+	"adverbe relatif":["adverbe relatif","adv-rel","adverbe rel"],
+	"affixe":["affixe","aff"],
+	"article":["article","art"],
+	"article défini":["article défini","article déf","art-déf"],
+	"article indéfini":["article indéfini","art-indéf","article ind"],
+	"article partitif":["article partitif","art-part","article par"],
+	"circonfixe":["circonfixe","circon","circonf"],
+	"classificateur":["classificateur","class","classif"],
+	"conjonction":["conjonction","conj"],
+	"conjonction de coordination":["conjonction de coordination","conj-coord","conjonction coo"],
+	"copule":["copule"],
+	"déterminant":["déterminant","dét"],
+	"enclitique":["enclitique","encl"],
+	"gismu":["gismu"],
+	"infixe":["infixe","inf"],
+	"interfixe":["interfixe","interf"],
+	"interjection":["interjection","interj"],
+	"lettre":["lettre"],
+	"locution":["locution","loc"],
+	"locution-phrase":["locution-phrase","loc-phr","phrase locution","phrase"],
+	"nom commun":["nom","nom commun","substantif"],
+	"nom de famille":["nom de famille","nom-fam"],
+	"nom propre":["nom propre","nom-pr"],
+	"nom scientifique":["nom scientifique","nom-sciences","nom scient","nom science"],
+	"numéral":["numéral","num","numér"],
+	"onomatopée":["onomatopée","onoma","onom"],
+	"particule":["particule","part"],
+	"particule numérale":["particule numérale","part-num","particule num"],
+	"patronyme":["patronyme"],
+	"postposition":["postposition","postpos","post"],
+	"pré-nom":["pré-nom"],
+	"pré-verbe":["pré-verbe"],
+	"préfixe":["préfixe","préf"],
+	"prénom":["prénom"],
+	"préposition":["préposition","prép"],
+	"proclitique":["proclitique","procl"],
+	"pronom":["pronom"],
+	"pronom démonstratif":["pronom démonstratif","pronom dém","pronom-dém"],
+	"pronom indéfini":["pronom indéfini","pronom ind","pronom-indéf"],
+	"pronom interrogatif":["pronom interrogatif","pronom int","pronom-int"],
+	"pronom personnel":["pronom personnel","pronom réf","pronom-per","pronom réfléchi","pronom-réfl","pronom-pers"],
+	"pronom possessif":["pronom possessif","pronom pos","pronom-pos"],
+	"pronom relatif":["pronom relatif","pronom-rel","pronom rel"],
+	"pronom-adjectif":["pronom-adjectif"],
+	"proverbe":["proverbe","prov"],
+	"quantificateur":["quantificateur","quantif"],
+	"radical":["radical","rad"],
+	"rafsi":["rafsi"],
+	"sinogramme":["sinogramme","sinog","sino"],
+	"suffixe":["suffixe","suf","suff"],
+	"symbole":["symbole","symb"],
+	"variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
+	"verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
+	"verbe":["verbe","verb"]
+     }
+}
--- a/src/Wikstraktor/parsers/fr_en.py
+++ b/src/Wikstraktor/parsers/fr_en.py
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+from parsers.fr_constants import string_values
+debugEty = 0
+class Fr_en_straktor(Wikstraktor):
+	def __init__(self):
+		"""Configure the extractor for English entries read from the French Wiktionary."""
+		super().__init__()
+		self.constants = string_values
+		self.entry_language = "en"
+		self.wiki_language = "fr"
+		self.site = self.pwb.Site('wiktionary:fr')
+	def process_pronunciation(self, proContent):
+		"""Collect the pronunciations given by the audio templates of a section.
+
+		Reads the first wiki list of ``proContent``. For every template named
+		constants['t_snd'], the last argument is the sound file, argument "1"
+		is the accent (the name of the template it contains when it holds
+		exactly one template, its raw value otherwise) and argument "2", when
+		present, is the transcription.
+
+		Returns the list of Pronunciation objects that have both an accent and a sound.
+		"""
+		# TODO: only works for two-level lists, see "water" for three levels
+		# TODO: templates named in constants['t_acc'] are not handled yet
+		pronunciations = []
+		for fullitem in proContent.get_lists()[0].fullitems:
+			p = Pronunciation()
+			for t in self.wtp.parse(fullitem).templates:
+				if t.normal_name() != self.constants['t_snd']:
+					continue
+				p.add_sound(self.get_file_url(t.arguments[-1].value))
+				accent_templates = self.wtp.parse(t.get_arg("1").value).templates
+				if len(accent_templates) == 1:
+					p.set_accent(accent_templates[0].normal_name())
+				else:
+					p.set_accent(t.get_arg("1").value)
+				transcription = t.get_arg("2")
+				if transcription != None:
+					p.set_transcription(transcription.value)
+				if p.accent != None and p.sounds != []:
+					pronunciations.append(p)
+					p = Pronunciation()
+		return pronunciations
+	def process_etymology(self, etyContent):
+		"""Placeholder: ignores etyContent, bumps the module-level debugEty counter and returns "Etymology" followed by its new value."""
+		global debugEty
+		debugEty = debugEty + 1
+		return f"Etymology{debugEty}"
+	def process_POS(self,parsedwikitext):
+		"""Return the first key of constants['POS'] whose alias collection contains ``parsedwikitext``, or None when there is none."""
+		for label, aliases in self.constants['POS'].items():
+			if parsedwikitext in aliases:
+				return label
+		return None
+	def process_senses(self, entry, pos, sensesContent):
+		"""Build the Sense objects found in ``sensesContent``.
+
+		Walks the wiki lists matching the four patterns requested below:
+		a "# " list starts a sense, a "#:" list adds examples to it, a "## "
+		list starts a sub-sense and a "##:" list adds examples to that
+		sub-sense. Sense ids are "{entry}_{pos}_{n}", sub-sense ids
+		"{entry}_{pos}_{n}_{m}".
+
+		Returns one flat list holding senses and sub-senses.
+		"""
+		baseId = f"{entry}_{pos}_"
+		l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
+		i = 0
+		senses = []
+		nombreDef = 0
+		while i < len(l):
+			if l[i].pattern == '\\# ':
+				nombreDef += 1
+				newSense = Sense(f"{baseId}{nombreDef}")
+				# only the first item of the list is used as the definition text
+				newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+			elif l[i].pattern == '\\#:':
+				# NOTE(review): newSense is unbound here if no "# " list came first - confirm this cannot happen
+				for j in l[i].items:
+					k = 0
+					isEx = 0
+					# look for the first template whose name matches constants['t_ex']
+					# NOTE(review): if constants['t_ex'] is a plain string, `in` is a substring test - confirm
+					while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+						if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+							# NOTE(review): reads templates[0] although the match was found at index k - confirm intent
+							newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+							isEx = 1
+						k += 1
+					# no example template found: keep the plain text of the item
+					if isEx == 0:
+						newSense.add_example(self.wtp.parse(j).plain_text().strip())
+			# the current sense is stored once the next list opens a new sense or sub-sense, or at the end
+			# NOTE(review): this test also runs when l[i] is itself a sub-sense list, which looks like it can append newSense twice - confirm
+			if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+				senses.append(newSense)
+			cnt = 0
+			nombreSousDef = 0
+			# consume the consecutive lists of level 3 (the "## " and "##:" patterns are handled here)
+			while i < len(l) and l[i].level == 3 :
+				cnt +=1
+				if l[i].pattern == '\\## ':
+					nombreSousDef += 1
+					newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+					newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+				elif l[i].pattern == '\\##:':
+					for j in l[i].items:
+						k = 0
+						isEx = 0
+						while k < len(self.wtp.parse(j).templates) and isEx == 0 :
+							if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
+								# NOTE(review): same templates[0] versus templates[k] question as above
+								newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+								isEx = 1
+							k += 1
+						if isEx == 0:
+							newSense2.add_example(self.wtp.parse(j).plain_text().strip())
+				if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
+					senses.append(newSense2)
+				i += 1
+			# the inner loop already moved past its last list: step back so the increment below lands on the next one
+			if cnt > 0:
+				i -= 1
+			i += 1
+		return senses
+if __name__ == "__main__":
+	# manual check: fetch the entry "test" and print the value returned by fetch
+	extractor = Fr_en_straktor()
+	added = extractor.fetch("test")
+	print(added, "entries added")
--- a/src/Wikstraktor/wikstraktor.py
+++ b/src/Wikstraktor/wikstraktor.py
@@ -3,7 +3,6 @@ import pywikibot
 import wikitextparser
 import importlib
 import json
-import sys
 #######
 # Oral
@@ -295,9 +294,9 @@ class Wikstraktor:
 	def get_instance(cls, wiki_language, entry_language):
 		try:
 			m_name = f"{wiki_language}_{entry_language}".capitalize()
-			instance = getattr(importlib.import_module(f"{m_name.lower()}"), f"{m_name}_straktor")()
+			instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
 		except ModuleNotFoundError:
-			print(f"{m_name.lower()} module not found or {m_name}_straktor not found in module")
+			print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
 			instance = None
 		return instance
@@ -374,14 +373,14 @@ class Wikstraktor:
 			res = title in self.constants['pro']
 		#print(title, res)
 		return res
 	def isEty(self, title):
 		"""Tell whether ``title`` matches constants['ety']: equality when the constant is a string, membership when it is a collection of titles."""
 		ety = self.constants['ety']
 		# isinstance also covers str subclasses, which must not fall through to a substring test
 		if isinstance(ety, str):
 			return title == ety
 		return title in ety
 	def process_POS(self, parsedwikitext):
 		"""Hook meant to be implemented in a subclass; this base version does nothing and returns None."""
 		return None
@@ -395,24 +394,42 @@ class Wikstraktor:
 		pass#in subclass
 	def __str__(self):
+		# the string form is export() with its defaults: non-ASCII characters kept, 4-space indentation
+		return self.export()
+	def export(self, ascii=False, compact=False):
+		"""Serialize the entries of this extractor to a JSON string.
+
+		ascii: forwarded to json.dumps as ensure_ascii.
+		compact: when True no indentation is requested, otherwise indent=4 is used.
+		Returns the JSON text of the list of e.serializable() for every entry.
+		"""
 		res = []
 		for e in self.entries:
 			res.append(e.serializable())
-		return json.dumps(res)
+		if compact:
+			return json.dumps(res, ensure_ascii=ascii)
+		else:
+			return json.dumps(res, ensure_ascii=ascii, indent=4)
 if __name__ == "__main__":
-	#e = Wikstraktor.get_instance('en', "en")
+	import argparse
-	f = Wikstraktor.get_instance('en', 'en')
+	from argparse import RawTextHelpFormatter #pour le formattage de l'aide
-	# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
+	parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
-	# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
+	\033[1m\033[32mex :\033[0m
-	#e.fetch("water")
+	‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
-	#f.fetch('blue')
+	‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
-	#file_path = 'test.json'
+	‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
-	#fichier  = open(file_path, "w", encoding='utf-8')
+	parser.add_argument("-l", "--language",  help="la langue du mot", type=str, default = "en")
-	#fichier.write(str(f))
+	parser.add_argument("-w", "--wiki_language",  help="la langue du wiki", type=str, default = "en")
-	#fichier.close()
+	parser.add_argument("-m", "--mot",  help="le mot à chercher", type=str, default=None)
-	#fichier.write(str(f))
+	parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
-	word = sys.argv[1]
+	parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
-	f.fetch(word)
+	parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
-	print(str(f))
+	args = parser.parse_args()
+	# Run the query described by the command-line arguments and output the JSON result.
+	if args.mot is None:
+		raise NameError("Pas de mot demandé")
+	w = Wikstraktor.get_instance(args.wiki_language, args.language)
+	if w is None:
+		# get_instance() has already printed which parser module is missing
+		raise SystemExit(1)
+	# resp stays None when fetch() does not return a positive count
+	resp = None
+	if w.fetch(args.mot) > 0:
+		resp = w.export(args.force_ascii, args.compact)
+	if args.destination_file is None:
+		print(resp)
+	elif resp is not None:
+		# nothing is written when there is no result (writing None would raise TypeError);
+		# the context manager guarantees the file is closed, and UTF-8 keeps non-ASCII JSON portable
+		with open(args.destination_file, "w", encoding="utf-8") as f:
+			f.write(resp)