#!/usr/bin/env python3
"""Fetch a Wiktionary page and begin extracting its entries.

Module contents as introduced by the patch "begin parsing" (339c6f4,
Mathieu Loiseau, 2022-09-30) against ``wikstraktor.py``.
"""
import pywikibot as pwb
import wikitextparser as wtp


class Entry:
    """A single entry identified by its lemma.

    Attributes:
        lemma: the value passed to the constructor.
        cat: the value rendered in parentheses by ``__str__``; presumably
            the grammatical category of the entry -- TODO confirm.
    """

    def __init__(self, lemma, cat=None):
        """Store *lemma* and the optional *cat* (defaults to ``None``)."""
        self.lemma = lemma
        # Always defined so that ``__str__`` cannot raise AttributeError.
        self.cat = cat

    def __str__(self):
        """Return the entry formatted as ``"<lemma> (<cat>)"``."""
        return f"{self.lemma} ({self.cat})"


class Entries:
    """Entries of one Wiktionary page, restricted to one language heading.

    Attributes:
        language: section title searched for in the page (``languageInWiki``).
        site: pywikibot site built from ``wiktionary:<languagewiki>``.
        entries: list of collected entries; nothing appends to it yet.
        graphy: the requested title, replaced by ``page.title()`` when the
            page has text.
    """

    def __init__(self, graphy, languagewiki, languageInWiki):
        """Load the page titled *graphy* and parse it if it has any text.

        Args:
            graphy: title of the page to load.
            languagewiki: code inserted into the ``wiktionary:<code>`` site
                name (the script below passes ``'en'``).
            languageInWiki: title of the language section to look for
                (the script below passes ``"English"``).
        """
        self.language = languageInWiki
        self.site = pwb.Site(f'wiktionary:{languagewiki}')
        self.entries = []
        # Defined up front so the attribute exists even for an empty page.
        self.graphy = graphy
        page = pwb.Page(self.site, graphy)
        text = page.text
        if text:
            self.graphy = page.title()
            self.parse(wtp.parse(text))

    def parse(self, wp_content):
        """Print the section following the language heading if it is "Pronunciation".

        Does nothing when no section is titled ``self.language`` or when that
        section is the last one in ``wp_content.sections``.

        Args:
            wp_content: parsed page exposing a ``sections`` sequence whose
                items have ``title`` and ``contents`` attributes.
        """
        sections = wp_content.sections
        # Position of the first section whose title is the target language.
        language_index = next(
            (i for i, section in enumerate(sections)
             if section.title == self.language),
            None,
        )
        if language_index is None:
            return
        # The section right after the language heading is its first subsection.
        first_sub = language_index + 1
        if first_sub >= len(sections):
            return
        if sections[first_sub].title == "Pronunciation":
            print(sections[first_sub].contents)

    def __str__(self):
        """Return every entry on its own line, each followed by a newline."""
        return "".join(f"{e}\n" for e in self.entries)


if __name__ == "__main__":
    e = Entries("test", 'en', "English")