diff --git a/README.md b/README.md index 6a70850d7ba297c4fbe350a0f24a3f9e2060eace..3d0f4f200f4eea979de76acc4de111f640b63e5c 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,12 @@ A python tool to query the [wiktionary](https://wiktionary.org) and extract stru ## Dependencies This project does depend on python packages. -* [pywikibot](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API +* [``pywikibot``](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API * [documentation](https://doc.wikimedia.org/pywikibot/stable/api_ref/pywikibot.html) * [manual](https://www.mediawiki.org/wiki/Manual:Pywikibot) * [configuration for the wiktionary](https://github.com/wikimedia/pywikibot/blob/master/pywikibot/families/wiktionary_family.py) -* [wikitextparser](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links +* [``wikitextparser``](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links +* [``importlib``](https://docs.python.org/3/library/importlib.html) : to import parser modules ## Installation (maybe to be replaced by an automation of some sort) diff --git a/parsers/en_constants.py b/parsers/en_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4b46d83a477b312535a43947249fe6d7ca3c9e --- /dev/null +++ b/parsers/en_constants.py @@ -0,0 +1,6 @@ +string_values = { + "ety":"Etymology", + "ipa":"Pronunciation", + "en":"English", + "fr":"French" +} diff --git a/parsers/en_en.py b/parsers/en_en.py new file mode 100644 index 0000000000000000000000000000000000000000..58afd899cee63f3580b95f1dc636c3071b807c13 --- /dev/null +++ b/parsers/en_en.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +from wikstraktor import Wikstraktor +#from en_constants import string_values + +class En_en_straktor(Wikstraktor): + def __init__(self): + self.wiki_language = "en" + self.entry_language = "en" + #self.constants = string_values 
+ #self.site = pwb.Site(f'wiktionary:{self.wiki_language}') + + def parse(self, wp_content): + #sections = wtp.parse(wp_content).sections + #print(sections) + print("ok") diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..7999af4a6c7a92baac8e34111dd0ad650c8daff3 --- /dev/null +++ b/parsers/fr_constants.py @@ -0,0 +1,6 @@ +string_values = { + "ety":"Étymologie", + "ipa":"Prononciation", + "en":"Anglais", + "fr":"Français" +} diff --git a/wikstraktor.py b/wikstraktor.py index f1b7297cdd036220fe137346821ea6143bb4437e..9f167b0c38bcd482ff4832f0df6b5756a7679700 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import pywikibot as pwb import wikitextparser as wtp +import importlib class Entry: def __init__(self, lemma): @@ -9,32 +10,40 @@ class Entry: def __str__(self): res = f"{self.lemma} ({self.cat})" -class Entries: - def __init__(self, graphy, wiki_language, entry_language): - self.language = entry_language - self.site = pwb.Site(f'wiktionary:{wiki_language}') +class Wikstraktor: + @classmethod + def get_instance(cls, wiki_language, entry_language): + try: + m_name = f"{wiki_language}_{entry_language}".capitalize() + instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")() + except ModuleNotFoundError: + print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module") + instance = None + return instance + + def __init__(self): self.entries = [] - page = pwb.Page(self.site, graphy) - if page.text != "": - self.graphy = page.title() - wp_content = wtp.parse(page.text) - self.parse(wp_content) - @classmethod - def processPronunciation(section): - return "prononciation" - - def parse(self, wp_content): - sections = wp_content.sections - over = False - i = 0 - ### find language - while i < len(sections) and not over: - over = sections[i].title == self.language - i += 1 - ### language found i 
points to the first subsection - if sections[i].title == "Pronunciation" or sections[i].title == "pronunciation" : ## TODO: créer un fichier de localisation - print(sections[i].contents) + #retrieves the content of a page and processes it (adding the entries to the list of entries) + #returns the number of entries added + def fetch(self, graphy): + nb_entries_added = 0 + # page = pwb.Page(self.site, graphy) + # if page.text != "": + # sections = wtp.parse(page.text).sections + # found = False + # i = 0 + # ### find language + # while i < len(sections) and not found: + # found = sections[i].title.capitalize() == self.constants[self.entry_language] + # i += 1 + # if i < len(sections) and found: + # nb_entries_added = self.parse(page.title(), sections[i]) + return nb_entries_added + + def parse(self): + #handled by subclass + return -1 def __str__(self): res = "" @@ -42,7 +51,9 @@ class Entries: res += f"{e}\n" return res + if __name__ == "__main__": - e = Entries("test", 'en', "English") + e = Wikstraktor.get_instance('en', "en") + print(e.fetch("test"), "entries added") #print(e) #Entry("test", wtp.parse(page.text)))