Commit b9da9951 authored by Mathieu Loiseau

Global infrastructure

parent d9f55f3d
@@ -5,11 +5,12 @@ A python tool to query the [wiktionary](https://wiktionary.org) and extract stru
 ## Dependencies
 This project does depend on python packages.
-* [pywikibot](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API
+* [``pywikibot``](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API
   * [documentation](https://doc.wikimedia.org/pywikibot/stable/api_ref/pywikibot.html)
   * [manual](https://www.mediawiki.org/wiki/Manual:Pywikibot)
   * [configuration for the wiktionary](https://github.com/wikimedia/pywikibot/blob/master/pywikibot/families/wiktionary_family.py)
-* [wikitextparser](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
+* [``wikitextparser``](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
+* [``importlib``](https://docs.python.org/3/library/importlib.html) : to import parser modules
 ## Installation
 (maybe to be replaced by an automation of some sort)
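For orientation, the sketch below (not part of the commit) shows how the three dependencies listed above cooperate, mirroring the code committed in this change: pywikibot fetches a page from a wiktionary, wikitextparser splits its wikitext into sections, and importlib loads a language-specific parser module by name. The parsers.en_en module and En_en_straktor class names are only assumptions derived from the naming convention in Wikstraktor.get_instance below.

# Illustration only: how the three dependencies fit together.
import importlib
import pywikibot as pwb
import wikitextparser as wtp

# pywikibot: connect to the English wiktionary and fetch a page's wikitext
site = pwb.Site('wiktionary:en')
page = pwb.Page(site, "test")

# wikitextparser: split the raw wikitext into sections
sections = wtp.parse(page.text).sections
print([s.title for s in sections])

# importlib: load the wiki/entry-language specific parser class by name
# (naming convention taken from Wikstraktor.get_instance below)
m_name = "en_en".capitalize()  # -> "En_en"
parser = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()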
string_values = {
    "ety": "Etymology",
    "ipa": "Pronunciation",
    "en": "English",
    "fr": "French"
}
#!/usr/bin/env python3
from wikstraktor import Wikstraktor
#from en_constants import string_values

class En_en_straktor(Wikstraktor):
    def __init__(self):
        super().__init__() #let the base class initialise self.entries
        self.wiki_language = "en"
        self.entry_language = "en"
        #self.constants = string_values
        #self.site = pwb.Site(f'wiktionary:{self.wiki_language}')

    def parse(self, wp_content):
        #sections = wtp.parse(wp_content).sections
        #print(sections)
        print("ok")
strings = {
    "ety": "Étymologie",
    "ipa": "Prononciation",
    "en": "Anglais",
    "fr": "Français"
}
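The two constant tables above (string_values for the English wiktionary, strings for the French one) are the localisation layer the base class relies on: the commented-out lookup in Wikstraktor.fetch compares each section title with self.constants[self.entry_language] to locate the entry-language section. A minimal sketch of that comparison, assuming a parser has assigned one of these dicts to its constants attribute:

# Illustration only: the section-title lookup hinted at in Wikstraktor.fetch,
# using the English table; a French parser would use the French table instead.
constants = {
    "ety": "Etymology",
    "ipa": "Pronunciation",
    "en": "English",
    "fr": "French"
}
entry_language = "en"
section_titles = ["French", "English", "Pronunciation"]
for i, title in enumerate(section_titles):
    # same test as the commented-out loop in fetch
    if title.capitalize() == constants[entry_language]:
        print(f"entry language section found at index {i}")  # -> index 1
        break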
 #!/usr/bin/env python3
 import pywikibot as pwb
 import wikitextparser as wtp
+import importlib
 class Entry:
     def __init__(self, lemma):
@@ -9,32 +10,40 @@ class Entry:
     def __str__(self):
         res = f"{self.lemma} ({self.cat})"
-class Entries:
-    def __init__(self, graphy, wiki_language, entry_language):
-        self.language = entry_language
-        self.site = pwb.Site(f'wiktionary:{wiki_language}')
+class Wikstraktor:
+    @classmethod
+    def get_instance(cls, wiki_language, entry_language):
+        try:
+            m_name = f"{wiki_language}_{entry_language}".capitalize()
+            instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
+        except ModuleNotFoundError:
+            print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
+            instance = None
+        return instance
+    def __init__(self):
         self.entries = []
-        page = pwb.Page(self.site, graphy)
-        if page.text != "":
-            self.graphy = page.title()
-            wp_content = wtp.parse(page.text)
-            self.parse(wp_content)
-    @classmethod
-    def processPronunciation(section):
-        return "prononciation"
-    def parse(self, wp_content):
-        sections = wp_content.sections
-        over = False
-        i = 0
-        ### find language
-        while i < len(sections) and not over:
-            over = sections[i].title == self.language
-            i += 1
-        ### language found i points to the first subsection
-        if sections[i].title == "Pronunciation" or sections[i].title == "pronunciation" : ## TODO: create a localisation file
-            print(sections[i].contents)
+    #retrieves the content of a page and processes it (adding the entries to the list of entries)
+    #returns the number of entries added
+    def fetch(self, graphy):
+        nb_entries_added = 0
+        # page = pwb.Page(self.site, graphy)
+        # if page.text != "":
+        #     sections = wtp.parse(page.text).sections
+        #     found = False
+        #     i = 0
+        #     ### find language
+        #     while i < len(sections) and not found:
+        #         found = sections[i].title.capitalize() == self.constants[self.entry_language]
+        #         i += 1
+        #     if i < len(sections) and found:
+        #         nb_entries_added = self.parse(page.title(), sections[i])
+        return nb_entries_added
+    def parse(self):
+        #handled by subclass
+        return -1
     def __str__(self):
         res = ""
@@ -42,7 +51,9 @@ class Entries:
             res += f"{e}\n"
         return res
 if __name__ == "__main__":
-    e = Entries("test", 'en', "English")
+    e = Wikstraktor.get_instance('en', "en")
+    print(e.fetch("test"), "entries added")
     #print(e)
     #Entry("test", wtp.parse(page.text)))