# wikstraktor.py (6.34 KiB)
# Mathieu Loiseau authored 73c27df3
#!/usr/bin/env python3
import pywikibot
import wikitextparser
import importlib
from pronunciation import Pronunciation
class Entry:
    """A dictionary entry: a lemma with pronunciations, a part of speech and senses."""

    def __init__(self, lemma):
        """Create an entry for *lemma* with no pronunciation, POS or sense yet."""
        self.lemma = lemma
        self.pronunciations = []
        self.pos = None
        self.senses = []

    def set_pronunciations(self, pron):
        """Add one Pronunciation, or every Pronunciation of a list, to the entry.

        Despite its name, this method appends to the pronunciations already
        stored; it never replaces them.

        Raises:
            ValueError: if *pron* is neither a Pronunciation nor a list, or if
                any item of the list is not a Pronunciation. In that case
                nothing is appended.
        """
        if isinstance(pron, Pronunciation):
            self.pronunciations.append(pron)
        elif isinstance(pron, list):
            # Validate the whole list before touching self.pronunciations so a
            # bad item cannot leave the entry half-updated.
            for p in pron:
                if not isinstance(p, Pronunciation):
                    raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).")
            self.pronunciations.extend(pron)
        else:
            raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def set_pos(self, pos):
        """Set the part of speech of the entry."""
        self.pos = pos

    def set_senses(self, senses):
        """Replace the senses of the entry with *senses*."""
        self.senses = senses

    def is_valid(self):
        """Return True when lemma and POS are set and there is at least one
        pronunciation and one sense."""
        return (
            self.lemma is not None
            and bool(self.pronunciations)
            and self.pos is not None
            and bool(self.senses)
        )

    def __eq__(self, other):
        """Compare lemma, POS, then senses and pronunciations pairwise, in order.

        Returns NotImplemented for objects that are not Entry instances, so
        that ``entry == 5`` evaluates to False instead of raising.
        """
        if not isinstance(other, Entry):
            return NotImplemented
        if self.lemma != other.lemma or self.pos != other.pos:
            return False
        if len(self.pronunciations) != len(other.pronunciations):
            return False
        if len(self.senses) != len(other.senses):
            return False
        # Lengths match, so zip() cannot silently drop trailing items.
        if not all(mine == theirs for mine, theirs in zip(self.senses, other.senses)):
            return False
        return all(
            mine == theirs
            for mine, theirs in zip(self.pronunciations, other.pronunciations)
        )

    def __str__(self):
        """Return "lemma (pos)" followed by one line per pronunciation, then per sense."""
        lines = [f"{self.lemma} ({self.pos})"]
        lines.extend(str(p) for p in self.pronunciations)
        lines.extend(str(s) for s in self.senses)
        return "\n".join(lines) + "\n"
class ParserContext:
    """Stack of nested wiki sections plus the entry information found in them.

    Each stack item is a dict holding the section under the "wiki" key and,
    optionally, the entry information attached to that section ("pro", "ety",
    "POS", "senses"). Valid entries built from the stack are collected in
    ``self.entries``.
    """

    def __init__(self, entry):
        """Start an empty context for the lemma *entry*."""
        self.lemma = entry
        self.context = []
        self.entries = []

    def get_level(self):
        """Return the level of the section on top of the stack, or -1 if empty."""
        if not self.context:
            return -1
        return self.context[-1]["wiki"].level

    def push(self, wiki_context):
        """Push a new stack item holding only the section *wiki_context*."""
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry = True):
        """Remove and return the top stack item.

        When *testNewEntry* is true, create_entry() is called before popping.
        """
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        """Replace the section of the top stack item, or push it if the stack is empty.

        The other keys of the top item are kept as they are.
        """
        if self.context:
            self.context[-1]['wiki'] = wiki_context
        else:
            self.push(wiki_context)

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        """Store *entry_context* under *key* in the top stack item.

        When *testNewEntry* is true, create_entry() is called afterwards.

        Raises:
            ValueError: if the stack is empty.
        """
        if not self.context:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        self.context[-1][key] = entry_context
        if testNewEntry:
            self.create_entry()

    def create_entry(self):
        """Build an Entry from the whole stack and record it if it is new and valid.

        Returns the Entry when it was appended to ``self.entries``, else None.
        """
        candidate = Entry(self.lemma)
        for layer in self.context:
            if "pro" in layer:
                candidate.set_pronunciations(layer['pro'])
            # The etymology ("ety") is ignored for now.
            if "POS" in layer:
                candidate.set_pos(layer['POS'])
            if "senses" in layer:
                candidate.set_senses(layer['senses'])
            # TODO: add the other kinds of information
        if not candidate.is_valid() or candidate in self.entries:
            return None
        self.entries.append(candidate)
        return candidate

    def debug_top(self):
        """Return a one-item debug description of the top of the stack."""
        if not self.context:
            return "Context: 0"
        top = self.context[-1]
        section = top['wiki']
        details = "\n\t\t\t".join(
            f"{key} → {str(value)}" for key, value in top.items() if key != 'wiki'
        )
        depth_marks = len(self.context) * '='
        level_marks = section.level * '#'
        return f"Context: {depth_marks} {level_marks} {section.title} / {details}"
class Wikstraktor:
    """Base class that fetches a wiki page and extracts Entry objects from it.

    This class uses ``self.site``, ``self.constants``, ``self.entry_language``
    and the ``process_pronunciation`` / ``process_etymology`` /
    ``process_senses`` methods without defining them; presumably the
    language-specific classes loaded by get_instance() provide them
    (to be confirmed against the ``parsers`` package).
    """

    @classmethod
    def get_instance(cls, wiki_language, entry_language):
        """Instantiate the extractor for a wiki language / entry language pair.

        For ("en", "en") the class ``En_en_straktor`` is looked up in the
        module ``parsers.en_en``.

        Returns:
            A new instance, or None (after printing a message) when the module
            or the class cannot be found. Errors raised by the constructor
            itself are not swallowed.
        """
        m_name = f"{wiki_language}_{entry_language}".capitalize()
        try:
            module = importlib.import_module(f"parsers.{m_name.lower()}")
            # A module without the expected class raises AttributeError, which
            # the message below claims to cover.
            straktor_class = getattr(module, f"{m_name}_straktor")
        except (ModuleNotFoundError, AttributeError):
            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
            return None
        return straktor_class()

    def __init__(self):
        """Initialise an extractor with no entries and no parser context."""
        self.entries = []
        # The libraries are kept as attributes and accessed through self.
        self.pwb = pywikibot
        self.wtp = wikitextparser
        self.parserContext = None

    def get_file_url(self, file_page_name):
        """Return the URL of the file page *file_page_name* on ``self.site``.

        Returns None (after printing a message) when the page does not exist.
        """
        res = None
        try:
            f = self.pwb.FilePage(self.site, file_page_name)
            res = f.get_file_url()
        except pywikibot.exceptions.NoPageError:
            print(f"{file_page_name} does not exist in {self.site}.")
        return res

    def fetch(self, graphy):
        """Retrieve the page *graphy* and process it, adding its entries to ``self.entries``.

        Only the first section whose capitalized title equals
        ``self.constants[self.entry_language]`` is parsed.

        Returns:
            The number of entries added (0 when the page is empty or has no
            matching language section).
        """
        page = self.pwb.Page(self.site, graphy)
        if not page.text:
            return 0
        language_title = self.constants[self.entry_language]
        for section in self.wtp.parse(page.text).sections:
            title = section.title
            if title is not None and title.capitalize() == language_title:
                return self.parse(page.title(), section.sections)
        return 0

    def parse(self, entry, sections):
        """Walk *sections* and collect the entries of the lemma *entry*.

        A fresh ParserContext replaces ``self.parserContext``; the entries it
        collects are appended to ``self.entries``.

        Returns:
            The number of entries found in *sections*.
        """
        self.parserContext = ParserContext(entry)
        for s in sections:
            if s.title is None:
                continue
            # Keep the context stack in sync with the section nesting.
            if self.parserContext.get_level() < s.level:
                self.parserContext.push(s)
            else:
                while self.parserContext.get_level() > s.level:
                    self.parserContext.pop()
                self.parserContext.set_top_wiki(s)
            # Dispatch on the section title.
            if s.title == self.constants['pro']:
                self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
            elif self.constants['ety'] in s.title:
                self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
            elif s.title in self.constants['POS']:
                # The POS is stored without testing for a new entry; the test
                # happens once the senses are set as well.
                self.parserContext.set_top_entry_info('POS', self.constants['POS'][s.title], False)
                self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents)))
        found = self.parserContext.entries
        self.entries.extend(found)
        return len(found)

    def __str__(self):
        """Return the string form of every entry, each followed by a newline."""
        return "".join(f"{str(e)}\n" for e in self.entries)
if __name__ == "__main__":
    # Manual smoke test: extract the English entries of the page "test" from
    # the English wiki and print them.
    extractor = Wikstraktor.get_instance('en', "en")
    if extractor is None:
        # get_instance() has already printed the reason; stop here instead of
        # failing with an AttributeError on None below.
        raise SystemExit(1)
    # print(extractor.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
    # print(extractor.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
    print(extractor.fetch("test"), "entries added")
    print(extractor)
    # Scratch code from earlier experiments, kept for reference:
    # site = pywikibot.Site(f'wiktionary:en')
    # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav")
    # print(p)
    # if not p.exists():
    # site = pywikibot.Site('commons')
    # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat-parent.wav")
    # print(p.get_file_url())
    #print(extractor)
    #Entry("test", wtp.parse(page.text)))