Skip to content
Snippets Groups Projects
Commit 0eb66f3f authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

old wikstraktor

parent e83816dd
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import pywikibot
import wikitextparser
import importlib
import json
#######
# Oral
#######
class Sound:
    """An audio recording of a pronunciation, optionally tagged with an accent."""

    def __init__(self, url, accent):
        self.url = url        # URL of the audio file
        self.accent = accent  # accent label (e.g. "UK"), or None if unknown

    def __eq__(self, other):
        return self.url == other.url and self.accent == other.accent

    def serializable(self):
        """Return a plain-dict representation suitable for JSON export.

        The "accent" key is omitted when no accent is known.
        """
        # `is None` instead of `== None` (PEP 8 identity comparison)
        if self.accent is None:
            return {"url": self.url}
        return {"accent": self.accent, "url": self.url}
class Pronunciation:
    """An IPA transcription with an optional accent and associated sound files."""

    def __init__(self):
        self.ipa = None     # IPA transcription string
        self.sounds = []    # list of Sound objects
        self.accent = None  # accent label, or None if unspecified

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self):
        """Return a plain-dict representation suitable for JSON export.

        The "accent" key is omitted when no accent is known.
        """
        snds = [s.serializable() for s in self.sounds]
        if self.accent is None:
            return {"transcript": self.ipa, "sounds": snds}
        return {"accent": self.accent, "transcript": self.ipa, "sounds": snds}

    def __str__(self):
        return f"{self.serializable()}"

    def __eq__(self, other):
        # Sounds are compared position by position: order matters.
        return (self.ipa == other.ipa
                and self.accent == other.accent
                and len(self.sounds) == len(other.sounds)
                and all(a == b for a, b in zip(self.sounds, other.sounds)))
#######
# Metadata
## TODO:
# * POS: create a POS class carrying its dependent features (e.g. masculine in French)
#######
#######
# Senses
# TODO: create a Translations class
#######
class Definition:
    """A definition text expressed in a given language."""

    def __init__(self, lang, text):
        self.lang = lang  # language code of the definition text
        self.text = text  # the definition itself

    def __eq__(self, other):
        return (self.lang, self.text) == (other.lang, other.text)

    def serializable(self):
        """Return a JSON-ready dict keyed "lang" / "definition"."""
        return {"lang": self.lang, "definition": self.text}
class Translation(Definition):
    """Same data as a Definition, but serialized under the "translation" key."""

    def serializable(self):
        return {"lang": self.lang, "translation": self.text}
class Example:
    """A usage example, with optional source attribution and URL."""

    def __init__(self, transcript, source=None, url=None):
        self.text = transcript  # the example sentence (required)
        self.source = source    # where the example comes from, if known
        self.url = url          # link to the source, if known

    def __eq__(self, other):
        return (self.text == other.text
                and self.source == other.source
                and self.url == other.url)

    def serializable(self):
        """Return a JSON-ready dict; optional fields are omitted when unset."""
        res = {"example": self.text}
        # `is not None` instead of `!= None` (PEP 8 identity comparison)
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res
class Sense:
    """One sense of an entry: its definitions, examples and translations."""

    def __init__(self, label):
        self.label = label      # identifier of this sense
        self.definitions = []   # list of Definition (each has a language and a text)
        self.examples = []      # list of Example (text required, source/url optional)
        self.translations = []  # list of Translation into other languages
        self.domain = None      # usage domain of the word in this sense

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        """Add a definition, ignoring exact duplicates."""
        theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None):
        """Add an example, ignoring exact duplicates."""
        theEx = Example(transcript, src, url)
        if theEx not in self.examples:
            self.examples.append(theEx)

    def add_translation(self, lang, translation):
        """Add a translation, ignoring exact duplicates."""
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            self.translations.append(theTranslation)

    def __eq__(self, other):
        """Compare label/domain and contents; list order is irrelevant."""
        return (self.label == other.label
                and self.domain == other.domain
                and len(self.definitions) == len(other.definitions)
                and len(self.examples) == len(other.examples)
                and len(self.translations) == len(other.translations)
                and all(e in other.examples for e in self.examples)
                and all(t in other.translations for t in self.translations)
                and all(d in other.definitions for d in self.definitions))

    def serializable(self):
        """Return {label: {...}} ready for JSON export; "domain" omitted when unset."""
        body = {}
        if self.domain is not None:
            body["domain"] = self.domain
        body["defs"] = [d.serializable() for d in self.definitions]
        body["exs"] = [e.serializable() for e in self.examples]
        body["trad"] = [t.serializable() for t in self.translations]
        return {self.label: body}
class Entry:
    """A dictionary entry: a lemma with its POS, pronunciations and senses."""

    def __init__(self, lemma):
        self.lemma = lemma
        self.pronunciations = []  # list of Pronunciation objects
        self.pos = None           # part-of-speech label
        self.senses = []          # list of Sense objects

    def set_pronunciations(self, pron):
        """Append a Pronunciation or a list of them; raise ValueError otherwise."""
        if isinstance(pron, Pronunciation):
            self.pronunciations.append(pron)
        elif isinstance(pron, list):
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.pronunciations.append(p)
                else:
                    raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def set_pos(self, pos):
        self.pos = pos

    def set_senses(self, senses):
        self.senses = senses

    def is_valid(self):
        """An entry is complete when it has a lemma, a POS, and at least one
        pronunciation and one sense."""
        return (self.lemma is not None and len(self.pronunciations) > 0
                and self.pos is not None and len(self.senses) > 0)

    def __eq__(self, other):
        # Senses and pronunciations are compared position by position.
        return (self.lemma == other.lemma
                and self.pos == other.pos
                and len(self.pronunciations) == len(other.pronunciations)
                and len(self.senses) == len(other.senses)
                and all(a == b for a, b in zip(self.senses, other.senses))
                and all(a == b for a, b in zip(self.pronunciations, other.pronunciations)))

    def serializable(self):
        """Return {lemma: {...}} ready for JSON export."""
        body = {"pos": self.pos}
        body["pronunciations"] = [p.serializable() for p in self.pronunciations]
        body["senses"] = [s.serializable() for s in self.senses]
        return {self.lemma: body}

    def __str__(self):
        res = f"{self.lemma} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res
class ParserContext:
    """Tracks the stack of wiki sections being parsed for one lemma, plus the
    entries completed so far."""

    def __init__(self, entry):
        self.lemma = entry
        self.context = []  # stack of frames: {"wiki": section, <info key>: value, ...}
        self.entries = []  # Entry objects completed during parsing

    def get_level(self):
        """Wiki heading level of the top frame, or -1 when the stack is empty."""
        if not self.context:
            return -1
        return self.context[-1]["wiki"].level

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        """Pop the top frame; optionally try to flush a complete entry first."""
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        """Replace the wiki section of the top frame (push if the stack is empty)."""
        if not self.context:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        """Attach a piece of entry data (e.g. 'pro', 'POS') to the top frame."""
        if not self.context:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        self.context[-1][key] = entry_context
        if testNewEntry:
            self.create_entry()

    def create_entry(self):
        """Assemble an Entry from all stacked frames; keep it only if it is
        valid and not already collected. Returns the Entry or None."""
        # Frames never carry both 'senses' and 'POS' in the same dict.
        candidate = Entry(self.lemma)
        for frame in self.context:
            if "pro" in frame:
                candidate.set_pronunciations(frame['pro'])
            if "ety" in frame:
                pass  # etymology is ignored for now
            if "POS" in frame:
                candidate.set_pos(frame['POS'])
            if "senses" in frame:
                candidate.set_senses(frame['senses'])
            # TODO: handle the other info types
        if candidate.is_valid() and candidate not in self.entries:
            self.entries.append(candidate)
            return candidate
        return None

    def debug_top(self):
        """One-line description of the top frame, for debugging."""
        res = "Context: "
        if not self.context:
            return res + "0"
        info = ""
        for key, value in self.context[-1].items():
            if key != 'wiki':
                if info != "":
                    info += "\n\t\t\t"
                info += f"{key}{str(value)}"
        top = self.context[-1]['wiki']
        res += f"{len(self.context)*'='} {top.level*'#'} {top.title} / {info}"
        return res
class Wikstraktor:
    """Base wiktionary extractor.

    Language-specific parsing (POS, pronunciation, etymology, senses) is
    implemented in subclasses loaded dynamically from the ``parsers`` package;
    subclasses are expected to define ``self.site``, ``self.constants`` and
    ``self.entry_language``.
    """

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate the parser subclass for (wiki, entry) language pair.

        Returns None (after printing a message) when no such parser module exists.
        """
        m_name = f"{wiki_language}_{entry_language}".capitalize()
        try:
            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
        except ModuleNotFoundError:
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            instance = None
        return instance

    def __init__(self):
        self.entries = []
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Resolve a File: page to its direct media URL, or None if it does not exist."""
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def fetch(self, graphy):
        """Retrieve the wiki page for *graphy*, locate the section of the target
        language and parse it, adding entries to the list of entries.

        Returns the number of entries added.
        """
        nb_entries_added = 0
        page = self.pwb.Page(self.site, graphy)
        if page.text != "":
            sections = self.wtp.parse(page.text).sections
            found = False
            i = 0
            # find the section describing the entry language
            while i < len(sections) and not found:
                found = (sections[i].title is not None
                         and sections[i].title.capitalize() == self.constants[self.entry_language])
                if not found:
                    i += 1
            if found:
                nb_entries_added = self.parse(page.title(), sections[i].sections)
        return nb_entries_added

    def parse(self, entry, sections):
        """Walk the subsections of a language block, keeping a ParserContext
        stack in sync with the wiki heading levels, and collect entries.

        Returns the number of entries found in this block.
        """
        self.parserContext = ParserContext(entry)
        for s in sections:
            if s.title is not None:
                # keep the context stack aligned with the heading hierarchy
                if self.parserContext.get_level() < s.level:
                    self.parserContext.push(s)
                else:
                    while self.parserContext.get_level() > s.level:
                        self.parserContext.pop()
                    self.parserContext.set_top_wiki(s)
                # section titles may be templates; use the first argument then
                stitle = self.wtp.parse(s.title).templates
                if stitle == []:
                    stitle = s.title
                else:
                    stitle = stitle[0].arguments[0].value
                if self.isPro(stitle):
                    self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                elif self.isEty(stitle):
                    self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
                else:
                    pos = self.process_POS(stitle)
                    if pos is not None:
                        self.parserContext.set_top_entry_info('POS', pos, False)
                        self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos + str(len(self.parserContext.entries)), self.wtp.parse(s.contents)))
        res = len(self.parserContext.entries)
        for e in self.parserContext.entries:
            self.entries.append(e)
        return res

    def isPro(self, title):
        """True when *title* names the pronunciation section.

        constants['pro'] may be a single string or a collection of strings.
        """
        if isinstance(self.constants['pro'], str):
            return title == self.constants['pro']
        return title in self.constants['pro']

    def isEty(self, title):
        """True when *title* names the etymology section (str or collection)."""
        if isinstance(self.constants['ety'], str):
            return title == self.constants['ety']
        return title in self.constants['ety']

    def process_POS(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_pronunciation(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_etymology(self, parsedwikitext):
        pass  # implemented in subclasses

    def process_senses(self, entry, pos, parsedwikitext):
        pass  # implemented in subclasses

    def __str__(self):
        return self.export()

    def export(self, ascii=False, compact=False):
        """Serialize all entries to a JSON string.

        NOTE(review): the *ascii* parameter shadows the builtin of the same
        name; kept as-is for backward compatibility with existing callers.
        """
        res = [e.serializable() for e in self.entries]
        if compact:
            return json.dumps(res, ensure_ascii=ascii)
        return json.dumps(res, ensure_ascii=ascii, indent=4)
if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for help-text formatting
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
\033[1m\033[32mex :\033[0m
\033[0m\033[32m./wikstraktor.py -m blue\033[0m
\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default="en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default="en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    args = parser.parse_args()
    if args.mot is not None:
        # get_instance returns None when no parser module exists for the pair;
        # guard against it instead of crashing with AttributeError
        w = Wikstraktor.get_instance(args.wiki_language, args.language)
        resp = None
        if w is not None and w.fetch(args.mot) > 0:
            resp = w.export(args.force_ascii, args.compact)
        if args.destination_file is not None:
            # BUG FIX: the original called `f.close` without parentheses, so the
            # file was never explicitly closed; a context manager guarantees it
            with open(args.destination_file, "w") as f:
                f.write(resp)
        else:
            print(resp)
    else:
        raise NameError("Pas de mot demandé")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment