Skip to content
Snippets Groups Projects
Commit 0eb66f3f authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

old wikstraktor

parent e83816dd
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import pywikibot
import wikitextparser
import importlib
import json
#######
# Oral
#######
class Sound:
    """An audio recording of a pronunciation, optionally tagged with an accent."""

    def __init__(self, url, accent):
        self.url = url        # URL of the audio file
        self.accent = accent  # accent label (e.g. "UK"), or None if unknown

    def __eq__(self, other):
        return self.url == other.url and self.accent == other.accent

    def serializable(self):
        """Return a plain-dict representation suitable for JSON export.

        The "accent" key is omitted when no accent is known.
        """
        # `is None` instead of `== None` (PEP 8 identity comparison)
        if self.accent is None:
            return {"url": self.url}
        return {"accent": self.accent, "url": self.url}
class Pronunciation:
    """An IPA transcription with an optional accent and associated sound files."""

    def __init__(self):
        self.ipa = None     # IPA transcription string
        self.sounds = []    # list of Sound objects
        self.accent = None  # accent label, or None if unspecified

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self):
        """Return a plain-dict representation suitable for JSON export.

        The "accent" key is omitted when no accent is known.
        """
        snds = [s.serializable() for s in self.sounds]
        if self.accent is None:
            return {"transcript": self.ipa, "sounds": snds}
        return {"accent": self.accent, "transcript": self.ipa, "sounds": snds}

    def __str__(self):
        return f"{self.serializable()}"

    def __eq__(self, other):
        # Sounds are compared position by position: order matters.
        return (self.ipa == other.ipa
                and self.accent == other.accent
                and len(self.sounds) == len(other.sounds)
                and all(a == b for a, b in zip(self.sounds, other.sounds)))
#######
# Metadata
## TODO:
# * POS: create a POS class carrying its dependent features (e.g. masculine in French)
#######
#######
# Senses
# TODO: create a Translations class
#######
class Definition:
    """A definition text expressed in a given language."""

    def __init__(self, lang, text):
        self.lang = lang  # language code of the definition text
        self.text = text  # the definition itself

    def __eq__(self, other):
        return (self.lang, self.text) == (other.lang, other.text)

    def serializable(self):
        """Return a JSON-ready dict keyed "lang" / "definition"."""
        return {"lang": self.lang, "definition": self.text}
class Translation(Definition):
    """Same data as a Definition, but serialized under the "translation" key."""

    def serializable(self):
        return {"lang": self.lang, "translation": self.text}
class Example:
    """A usage example, with optional source attribution and URL."""

    def __init__(self, transcript, source=None, url=None):
        self.text = transcript  # the example sentence (required)
        self.source = source    # where the example comes from, if known
        self.url = url          # link to the source, if known

    def __eq__(self, other):
        return (self.text == other.text
                and self.source == other.source
                and self.url == other.url)

    def serializable(self):
        """Return a JSON-ready dict; optional fields are omitted when unset."""
        res = {"example": self.text}
        # `is not None` instead of `!= None` (PEP 8 identity comparison)
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res
class Sense:
    """One sense of an entry: its definitions, examples and translations."""

    def __init__(self, label):
        self.label = label      # identifier of this sense
        self.definitions = []   # list of Definition (each has a language and a text)
        self.examples = []      # list of Example (text required, source/url optional)
        self.translations = []  # list of Translation into other languages
        self.domain = None      # usage domain of the word in this sense

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        """Add a definition, ignoring exact duplicates."""
        theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None):
        """Add an example, ignoring exact duplicates."""
        theEx = Example(transcript, src, url)
        if theEx not in self.examples:
            self.examples.append(theEx)

    def add_translation(self, lang, translation):
        """Add a translation, ignoring exact duplicates."""
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            self.translations.append(theTranslation)

    def __eq__(self, other):
        """Compare label/domain and contents; list order is irrelevant."""
        return (self.label == other.label
                and self.domain == other.domain
                and len(self.definitions) == len(other.definitions)
                and len(self.examples) == len(other.examples)
                and len(self.translations) == len(other.translations)
                and all(e in other.examples for e in self.examples)
                and all(t in other.translations for t in self.translations)
                and all(d in other.definitions for d in self.definitions))

    def serializable(self):
        """Return {label: {...}} ready for JSON export; "domain" omitted when unset."""
        body = {}
        if self.domain is not None:
            body["domain"] = self.domain
        body["defs"] = [d.serializable() for d in self.definitions]
        body["exs"] = [e.serializable() for e in self.examples]
        body["trad"] = [t.serializable() for t in self.translations]
        return {self.label: body}
class Entry:
    """A dictionary entry: a lemma with its POS, pronunciations and senses."""

    def __init__(self, lemma):
        self.lemma = lemma
        self.pronunciations = []  # list of Pronunciation objects
        self.pos = None           # part-of-speech label
        self.senses = []          # list of Sense objects

    def set_pronunciations(self, pron):
        """Append a Pronunciation or a list of them; raise ValueError otherwise."""
        if isinstance(pron, Pronunciation):
            self.pronunciations.append(pron)
        elif isinstance(pron, list):
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.pronunciations.append(p)
                else:
                    raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def set_pos(self, pos):
        self.pos = pos

    def set_senses(self, senses):
        self.senses = senses

    def is_valid(self):
        """An entry is complete when it has a lemma, a POS, and at least one
        pronunciation and one sense."""
        return (self.lemma is not None and len(self.pronunciations) > 0
                and self.pos is not None and len(self.senses) > 0)

    def __eq__(self, other):
        # Senses and pronunciations are compared position by position.
        return (self.lemma == other.lemma
                and self.pos == other.pos
                and len(self.pronunciations) == len(other.pronunciations)
                and len(self.senses) == len(other.senses)
                and all(a == b for a, b in zip(self.senses, other.senses))
                and all(a == b for a, b in zip(self.pronunciations, other.pronunciations)))

    def serializable(self):
        """Return {lemma: {...}} ready for JSON export."""
        body = {"pos": self.pos}
        body["pronunciations"] = [p.serializable() for p in self.pronunciations]
        body["senses"] = [s.serializable() for s in self.senses]
        return {self.lemma: body}

    def __str__(self):
        res = f"{self.lemma} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res
class ParserContext:
    """Tracks the stack of wiki sections being parsed for one lemma, plus the
    entries completed so far."""

    def __init__(self, entry):
        self.lemma = entry
        self.context = []  # stack of frames: {"wiki": section, <info key>: value, ...}
        self.entries = []  # Entry objects completed during parsing

    def get_level(self):
        """Wiki heading level of the top frame, or -1 when the stack is empty."""
        if not self.context:
            return -1
        return self.context[-1]["wiki"].level

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        """Pop the top frame; optionally try to flush a complete entry first."""
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        """Replace the wiki section of the top frame (push if the stack is empty)."""
        if not self.context:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        """Attach a piece of entry data (e.g. 'pro', 'POS') to the top frame."""
        if not self.context:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        self.context[-1][key] = entry_context
        if testNewEntry:
            self.create_entry()

    def create_entry(self):
        """Assemble an Entry from all stacked frames; keep it only if it is
        valid and not already collected. Returns the Entry or None."""
        # Frames never carry both 'senses' and 'POS' in the same dict.
        candidate = Entry(self.lemma)
        for frame in self.context:
            if "pro" in frame:
                candidate.set_pronunciations(frame['pro'])
            if "ety" in frame:
                pass  # etymology is ignored for now
            if "POS" in frame:
                candidate.set_pos(frame['POS'])
            if "senses" in frame:
                candidate.set_senses(frame['senses'])
            # TODO: handle the other info types
        if candidate.is_valid() and candidate not in self.entries:
            self.entries.append(candidate)
            return candidate
        return None

    def debug_top(self):
        """One-line description of the top frame, for debugging."""
        res = "Context: "
        if not self.context:
            return res + "0"
        info = ""
        for key, value in self.context[-1].items():
            if key != 'wiki':
                if info != "":
                    info += "\n\t\t\t"
                info += f"{key}{str(value)}"
        top = self.context[-1]['wiki']
        res += f"{len(self.context)*'='} {top.level*'#'} {top.title} / {info}"
        return res
class Wikstraktor:
    """Base wiktionary extractor.

    Language-specific parsing (POS, pronunciation, etymology, senses) is
    implemented in subclasses loaded dynamically from the ``parsers`` package;
    subclasses are expected to define ``self.site``, ``self.constants`` and
    ``self.entry_language``.
    """

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate the parser subclass for (wiki, entry) language pair.

        Returns None (after printing a message) when no such parser module exists.
        """
        m_name = f"{wiki_language}_{entry_language}".capitalize()
        try:
            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
        except ModuleNotFoundError:
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            instance = None
        return instance

    def __init__(self):
        self.entries = []
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Resolve a File: page to its direct media URL, or None if it does not exist."""
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def fetch(self, graphy):
        """Retrieve the wiki page for *graphy*, locate the section of the target
        language and parse it, adding entries to the list of entries.

        Returns the number of entries added.
        """
        nb_entries_added = 0
        page = self.pwb.Page(self.site, graphy)
        if page.text != "":
            sections = self.wtp.parse(page.text).sections
            found = False
            i = 0
            # find the section describing the entry language
            while i < len(sections) and not found:
                found = (sections[i].title is not None
                         and sections[i].title.capitalize() == self.constants[self.entry_language])
                if not found:
                    i += 1
            if found:
                nb_entries_added = self.parse(page.title(), sections[i].sections)
        return nb_entries_added

    def parse(self, entry, sections):
        """Walk the subsections of a language block, keeping a ParserContext
        stack in sync with the wiki heading levels, and collect entries.

        Returns the number of entries found in this block.
        """
        self.parserContext = ParserContext(entry)
        for s in sections:
            if s.title is not None:
                # keep the context stack aligned with the heading hierarchy
                if self.parserContext.get_level() < s.level:
                    self.parserContext.push(s)
                else:
                    while self.parserContext.get_level() > s.level:
                        self.parserContext.pop()
                    self.parserContext.set_top_wiki(s)
                # section titles may be templates; use the first argument then
                stitle = self.wtp.parse(s.title).templates
                if stitle == []:
                    stitle = s.title
                else:
                    stitle = stitle[0].arguments[0].value
                if self.isPro(stitle):
                    self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                elif self.isEty(stitle):
                    self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
                else:
                    pos = self.process_POS(stitle)
                    if pos is not None:
                        self.parserContext.set_top_entry_info('POS', pos, False)
                        self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos + str(len(self.parserContext.entries)), self.wtp.parse(s.contents)))
        res = len(self.parserContext.entries)
        for e in self.parserContext.entries:
            self.entries.append(e)
        return res

    def isPro(self, title):
        """True when *title* names the pronunciation section.

        constants['pro'] may be a single string or a collection of strings.
        """
        if isinstance(self.constants['pro'], str):
            return title == self.constants['pro']
        return title in self.constants['pro']

    def isEty(self, title):
        """True when *title* names the etymology section (str or collection)."""
        if isinstance(self.constants['ety'], str):
            return title == self.constants['ety']
        return title in self.constants['ety']

    def process_POS(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_pronunciation(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_etymology(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_senses(self, entry, pos, parsedwikitext):
        pass  # implemented in subclasses

    def __str__(self):
        return self.export()

    def export(self, ascii=False, compact=False):
        """Serialize all entries to a JSON string.

        NOTE(review): the *ascii* parameter shadows the builtin of the same
        name; kept as-is for backward compatibility with existing callers.
        """
        res = [e.serializable() for e in self.entries]
        if compact:
            return json.dumps(res, ensure_ascii=ascii)
        return json.dumps(res, ensure_ascii=ascii, indent=4)
if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for help-text formatting
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
\033[1m\033[32mex :\033[0m
\033[0m\033[32m./wikstraktor.py -m blue\033[0m
\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default="en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default="en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    args = parser.parse_args()
    if args.mot is not None:
        # get_instance returns None when no parser module exists for the pair;
        # guard against it instead of crashing with AttributeError
        w = Wikstraktor.get_instance(args.wiki_language, args.language)
        resp = None
        if w is not None and w.fetch(args.mot) > 0:
            resp = w.export(args.force_ascii, args.compact)
        if args.destination_file is not None:
            # BUG FIX: the original called `f.close` without parentheses, so the
            # file was never explicitly closed; a context manager guarantees it
            with open(args.destination_file, "w") as f:
                f.write(resp)
        else:
            print(resp)
    else:
        raise NameError("Pas de mot demandé")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment