From 339c6f4f3a9d98a66ce8a5625d6a64369e2e0695 Mon Sep 17 00:00:00 2001
From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>
Date: Fri, 30 Sep 2022 18:15:48 +0200
Subject: [PATCH] begin parsing

---
 wikstraktor.py | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/wikstraktor.py b/wikstraktor.py
index a015d02..50f5c96 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -1,7 +1,44 @@
 #!/usr/bin/env python3
 import pywikibot as pwb
+import wikitextparser as wtp
+
+class Entry:  # One dictionary entry: a lemma plus `cat` (presumably a category / part of speech — confirm)
+	def __init__(self, lemma, cat=None):  # cat is optional; it is always assigned because __str__ reads it
+		self.lemma, self.cat = lemma, cat
+
+	def __str__(self):  # Render as "<lemma> (<cat>)"; must return the string, not just build it
+		return f"{self.lemma} ({self.cat})"
+
+class Entries:  # Fetches one wiktionary page and parses the section of one language; results go in self.entries
+	def __init__(self, graphy, languagewiki, languageInWiki):
+		# graphy: page title to fetch; languagewiki: wiktionary code (e.g. 'en'); languageInWiki: title of the language section (e.g. "English")
+		self.language = languageInWiki
+		self.site = pwb.Site(f'wiktionary:{languagewiki}')
+		self.entries = []
+		page = pwb.Page(self.site, graphy)
+		self.graphy = page.title()  # assigned unconditionally so the attribute exists even when the page text is empty
+		if page.text != "":
+			self.parse(wtp.parse(page.text))
+
+	def parse(self, wp_content):
+		# Find the section titled self.language, then print the section right after it when that one is "Pronunciation".
+		sections = wp_content.sections
+		### find language
+		found = next((k for k, s in enumerate(sections) if s.title == self.language), None)
+		### language found i points to the first subsection
+		# NOTE(review): assumes wp_content.sections is a flat list in which subsections follow their parent — confirm
+		i = len(sections) if found is None else found + 1
+		# Bounds check: the language may be absent, or be the last section, so sections[i] may not exist.
+		if i < len(sections) and sections[i].title == "Pronunciation":
+			print(sections[i].contents)
+
+	def __str__(self):
+		# One line per entry, each terminated by a newline; "" when self.entries is empty.
+		# Each entry is formatted through f"{e}", i.e. its own string conversion.
+		lines = [f"{e}\n" for e in self.entries]
+		return "".join(lines)
 
 if __name__ == "__main__":
-	site = pwb.Site('wiktionary:en')
-	page = pwb.Page(site, "test")
-	print(page.text)
+	e = Entries("test", 'en', "English")  # fetch the page "test" from 'wiktionary:en' and parse its "English" section
+	#print(e)  # disabled: nothing appends to Entries.entries yet, so this would print only an empty line
+	#Entry("test", wtp.parse(page.text)))  # NOTE(review): leftover sketch — unbalanced parenthesis, and no `page` is defined here
-- 
GitLab