Global infrastructure

b9da9951 · Mathieu Loiseau · d9f55f3d · b9da9951 · b9da9951 · b9da9951
Commit b9da9951 authored 2 years ago by Mathieu Loiseau
--- a/README.md
+++ b/README.md
@@ -5,11 +5,12 @@ A python tool to query the [wiktionary](https://wiktionary.org) and extract stru

 ## Dependencies
 This project does depend on python packages.
-* [pywikibot](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API
+* [``pywikibot``](https://github.com/wikimedia/pywikibot) gives access to the MediaWiki API
    * [documentation](https://doc.wikimedia.org/pywikibot/stable/api_ref/pywikibot.html)
    * [manual](https://www.mediawiki.org/wiki/Manual:Pywikibot)
    * [configuration for the wiktionary](https://github.com/wikimedia/pywikibot/blob/master/pywikibot/families/wiktionary_family.py)
-* [wikitextparser](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
+* [``wikitextparser``](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links
+* [``importlib``](https://docs.python.org/3/library/importlib.html) (Python standard library) is used to import parser modules dynamically

 ## Installation
 (maybe to be replaced by an automation of some sort)

--- a/parsers/en_constants.py
+++ b/parsers/en_constants.py
+# English display strings keyed by short identifier
+# (presumably wiktionary section titles; TODO confirm against the parser).
+string_values = {
+	"ety": "Etymology",
+	"ipa": "Pronunciation",
+	"en": "English",
+	"fr": "French",
+}
--- a/parsers/en_en.py
+++ b/parsers/en_en.py
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor
+#from en_constants import string_values
+
+class En_en_straktor(Wikstraktor):
+	"""Parser for English entries of the English wiktionary (work in progress)."""
+
+	def __init__(self):
+		# Let the base class create the shared state (the ``entries`` list);
+		# without this call the instance has no ``entries`` attribute.
+		super().__init__()
+		self.wiki_language = "en"
+		self.entry_language = "en"
+		#self.constants = string_values
+		#self.site = pwb.Site(f'wiktionary:{self.wiki_language}')
+
+	def parse(self, wp_content):
+		"""Placeholder: ignores ``wp_content`` and only prints "ok"."""
+		#sections = wtp.parse(wp_content).sections
+		#print(sections)
+		print("ok")
--- a/parsers/fr_constants.py
+++ b/parsers/fr_constants.py
+# French display strings keyed by short identifier
+# (presumably wiktionary section titles; TODO confirm against the parser).
+string_values = {
+	"ety": "Étymologie",
+	"ipa": "Prononciation",
+	"en": "Anglais",
+	"fr": "Français",
+}
+# Original name of this table, kept so existing references keep working;
+# ``string_values`` matches the name used in en_constants.py.
+strings = string_values
--- a/wikstraktor.py
+++ b/wikstraktor.py
 #!/usr/bin/env python3
 import pywikibot as pwb
 import wikitextparser as wtp
+import importlib

 class Entry:
 	def __init__(self, lemma):
@@ -9,32 +10,40 @@ class Entry:
 	def __str__(self):
 		res = f"{self.lemma} ({self.cat})"

-class Entries:
-	def __init__(self, graphy, wiki_language, entry_language):
-		self.language = entry_language
-		self.site = pwb.Site(f'wiktionary:{wiki_language}')
+class Wikstraktor:
+	@classmethod
+	def get_instance(cls, wiki_language, entry_language):
+		"""Load and instantiate the parser for a wiki/entry language pair.
+
+		The parser is looked up by naming convention: the module
+		``parsers.<wiki>_<entry>`` must define a class named
+		``<Wiki>_<entry>_straktor`` (e.g. ``parsers.en_en.En_en_straktor``).
+
+		Returns the new instance, or None (after printing a message) when
+		the module or the class cannot be found.
+		"""
+		m_name = f"{wiki_language}_{entry_language}".capitalize()
+		try:
+			module = importlib.import_module(f"parsers.{m_name.lower()}")
+			instance = getattr(module, f"{m_name}_straktor")()
+		except (ModuleNotFoundError, AttributeError):
+			# AttributeError covers the second case announced by the message:
+			# the module was imported but does not define the expected class.
+			print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module")
+			instance = None
+		return instance
+
+	def __init__(self):
+		# Entries collected so far; starts empty.
 		self.entries = []
-		page = pwb.Page(self.site, graphy)
-		if page.text != "":
-			self.graphy = page.title()
-			wp_content = wtp.parse(page.text)
-			self.parse(wp_content)

-	@classmethod
-	def processPronunciation(section):
-		return "prononciation"
-
-	def parse(self, wp_content):
-		sections = wp_content.sections
-		over = False
-		i = 0
-		### find language
-		while i < len(sections) and not over:
-			over = sections[i].title == self.language
-			i += 1
-		### language found i points to the first subsection
-		if sections[i].title == "Pronunciation" or sections[i].title == "pronunciation" : ## TODO: créer un fichier de localisation
-			print(sections[i].contents)
+	# Retrieves the content of a page and processes it (adding the entries
+	# to the list of entries).
+	# Returns the number of entries added.
+	# NOTE: the retrieval logic below is still commented out, so this method
+	# currently ignores ``graphy`` and always returns 0.
+	def fetch(self, graphy):
+		nb_entries_added = 0
+		# Draft implementation, not active yet.
+		# NOTE(review): in the draft, ``i`` is incremented after the match, so
+		# ``sections[i]`` is the section following the matching one; confirm
+		# that this is intended before enabling it.
+		# page = pwb.Page(self.site, graphy)
+		# if page.text != "":
+		# 	sections = wtp.parse(page.text).sections
+		# 	found = False
+		# 	i = 0
+		# 	### find language
+		# 	while i < len(sections) and not found:
+		# 		found = sections[i].title.capitalize() == self.constants[self.entry_language]
+		# 		i += 1
+		# 	if i < len(sections) and found:
+		# 		nb_entries_added = self.parse(page.title(), sections[i])
+		return nb_entries_added
+
+	def parse(self):
+		# Placeholder: parsing is handled by the subclass; the base
+		# implementation only returns -1.
+		# NOTE(review): En_en_straktor.parse takes a ``wp_content`` argument;
+		# confirm the intended common signature.
+		return -1

 	def __str__(self):
 		res = ""
@@ -42,7 +51,9 @@ class Entries:
 			res += f"{e}\n"
 		return res

+
 if __name__ == "__main__":
-	e = Entries("test", 'en', "English")
+	e = Wikstraktor.get_instance('en', "en")
+	# get_instance returns None (after printing a message) when the parser
+	# could not be loaded; only fetch when an instance is available.
+	if e is not None:
+		print(e.fetch("test"), "entries added")
 	#print(e)
 	#Entry("test", wtp.parse(page.text)))