Skip to content
Snippets Groups Projects
Commit b9da9951 authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

Global infrastructure

parent d9f55f3d
No related branches found
No related tags found
No related merge requests found
......@@ -5,11 +5,12 @@ A python tool to query the [wiktionary](https://wiktionary.org) and extract stru
## Dependencies
This project depends on the following Python packages:
* [pywikibot](https://github.com/wikimedia/pywikibot) gives access to the MediaWiki API
* [``pywikibot``](https://github.com/wikimedia/pywikibot) gives access to the MediaWiki API
* [documentation](https://doc.wikimedia.org/pywikibot/stable/api_ref/pywikibot.html)
* [manual](https://www.mediawiki.org/wiki/Manual:Pywikibot)
* [configuration for the wiktionary](https://github.com/wikimedia/pywikibot/blob/master/pywikibot/families/wiktionary_family.py)
* [wikitextparser](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
* [``wikitextparser``](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
* [``importlib``](https://docs.python.org/3/library/importlib.html) is used to import the parser modules
## Installation
(this may later be replaced by some form of automation)
......
# English labels keyed by short code ("ety", "ipa") or language code ("en", "fr").
# NOTE(review): presumably these are the section titles to look for on the
# English-language Wiktionary -- confirm against the parser that consumes them.
string_values = dict(
    ety="Etymology",
    ipa="Pronunciation",
    en="English",
    fr="French",
)
#!/usr/bin/env python3
from wikstraktor import Wikstraktor
#from en_constants import string_values
class En_en_straktor(Wikstraktor):
    """Extractor for English ("en") entries on the English ("en") wiktionary.

    The class name follows the ``<Wiki>_<entry>_straktor`` pattern so that it
    can be located dynamically from the two language codes.
    """

    def __init__(self):
        """Initialise the shared extractor state, then set both language codes."""
        # The base initialiser creates the shared state (the list of
        # entries); without this call the subclass instance would lack it.
        super().__init__()
        self.wiki_language = "en"
        self.entry_language = "en"
        #self.constants = string_values
        #self.site = pwb.Site(f'wiktionary:{self.wiki_language}')

    def parse(self, wp_content):
        """Placeholder parser: ignores *wp_content* and prints ``ok``.

        Returns ``None``; the real section parsing is still commented out.
        """
        #sections = wtp.parse(wp_content).sections
        #print(sections)
        print("ok")
# French labels keyed by short code ("ety", "ipa") or language code ("en", "fr").
# NOTE(review): presumably these are the section titles to look for on the
# French-language Wiktionary -- confirm against the parser that consumes them.
strings = {
    "ety": "Étymologie",  # was misspelt "Étimologie", which is not a French word
    "ipa": "Prononciation",
    "en": "Anglais",
    "fr": "Français",
}
#!/usr/bin/env python3
import pywikibot as pwb
import wikitextparser as wtp
import importlib
class Entry:
def __init__(self, lemma):
......@@ -9,32 +10,40 @@ class Entry:
def __str__(self):
res = f"{self.lemma} ({self.cat})"
class Entries:
def __init__(self, graphy, wiki_language, entry_language):
self.language = entry_language
self.site = pwb.Site(f'wiktionary:{wiki_language}')
class Wikstraktor:
@classmethod
def get_instance(cls, wiki_language, entry_language):
    """Build the extractor matching a wiki language / entry language pair.

    The parser is located dynamically: the module
    ``parsers.<wiki_language>_<entry_language>`` (lower-cased) must define a
    class named ``<Wiki_language>_<entry_language>_straktor`` (first letter
    capitalised, e.g. ``parsers.en_en`` and ``En_en_straktor``).

    Returns a new instance of that class, or ``None`` (after printing a
    message) when either the module or the class cannot be found.
    """
    m_name = f"{wiki_language}_{entry_language}".capitalize()
    try:
        module = importlib.import_module(f"parsers.{m_name.lower()}")
        straktor_class = getattr(module, f"{m_name}_straktor")
    except (ModuleNotFoundError, AttributeError):
        # AttributeError covers the "class not found in module" half of the
        # message below; previously that case escaped as an exception.
        print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
        return None
    # Instantiate outside the try so that errors raised by the parser's own
    # constructor are not mistaken for a missing parser.
    return straktor_class()
def __init__(self):
self.entries = []
page = pwb.Page(self.site, graphy)
if page.text != "":
self.graphy = page.title()
wp_content = wtp.parse(page.text)
self.parse(wp_content)
@staticmethod
def processPronunciation(section):
    """Placeholder pronunciation handler.

    *section* is currently ignored and the constant string
    ``"prononciation"`` is returned. Declared as a static method because the
    signature has no ``cls`` parameter: as a class method, the class itself
    would be bound to *section* and any call passing a section would fail.
    """
    return "prononciation"
def parse(self, wp_content):
    """Print the pronunciation section that directly follows the language heading.

    Scans ``wp_content.sections`` for the first section whose title equals
    ``self.language``. If the section right after it is titled
    "Pronunciation" or "pronunciation", its contents are printed.

    Nothing is printed (and no error is raised) when the language heading is
    absent or is the last section of the page. Returns ``None``.
    """
    sections = wp_content.sections
    over = False
    i = 0
    # Find the language heading.
    while i < len(sections) and not over:
        over = sections[i].title == self.language
        i += 1
    # When found, i points to the first section after the language heading;
    # guard against running past the end of the list.
    if not over or i >= len(sections):
        return
    # TODO: create a localisation file
    if sections[i].title in ("Pronunciation", "pronunciation"):
        print(sections[i].contents)
def fetch(self, graphy):
    """Retrieve the content of a page and process it.

    Processing is meant to add the entries found to the list of entries.

    Returns the number of entries added. The retrieval itself is still
    disabled (see the sketch below), so *graphy* is unused and the count is
    always 0.
    """
    # Planned implementation, currently disabled:
    # page = pwb.Page(self.site, graphy)
    # if page.text != "":
    #     sections = wtp.parse(page.text).sections
    #     found = False
    #     i = 0
    #     ### find language
    #     while i < len(sections) and not found:
    #         found = sections[i].title.capitalize() == self.constants[self.entry_language]
    #         i += 1
    #     if i < len(sections) and found:
    #         nb_entries_added = self.parse(page.title(), sections[i])
    return 0
def parse(self, wp_content=None):
    """Base parsing hook; the real work is handled by the subclass.

    *wp_content* is accepted (and ignored) so that the base signature is
    compatible with subclass overrides that take the page content; calling
    without it still works.

    Returns ``-1`` to signal that no parsing took place.
    """
    return -1
def __str__(self):
res = ""
......@@ -42,7 +51,9 @@ class Entries:
res += f"{e}\n"
return res
if __name__ == "__main__":
    # Manual smoke test: build the en/en extractor and fetch the page "test".
    e = Wikstraktor.get_instance('en', "en")
    # get_instance returns None when no matching parser is available; it has
    # already printed a message, so just skip the fetch instead of crashing.
    if e is not None:
        print(e.fetch("test"), "entries added")
    #print(e)
    #Entry("test", wtp.parse(page.text)))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment