diff --git a/parsers/en_en.py b/parsers/en_en.py
index 58afd899cee63f3580b95f1dc636c3071b807c13..566ade2fb5de92d41890128abef3434f2af9dc2b 100644
--- a/parsers/en_en.py
+++ b/parsers/en_en.py
@@ -1,15 +1,36 @@
 #!/usr/bin/env python3
 from wikstraktor import Wikstraktor
-#from en_constants import string_values
+from parsers.en_constants import string_values
+
+def debugC(c):
+	res = "Context: "
+	if len(c) == 0 :
+		res += "0"
+	else:
+		res += f"{len(c)}, {c[-1].level*'#'} {c[-1].title}"
+	return res
 
 class En_en_straktor(Wikstraktor):
 	def __init__(self):
+		super().__init__()
 		self.wiki_language = "en"
 		self.entry_language = "en"
-		#self.constants = string_values
-		#self.site = pwb.Site(f'wiktionary:{self.wiki_language}')
+		self.constants = string_values
+		self.site = self.pwb.Site(f'wiktionary:{self.wiki_language}')
 
-	def parse(self, wp_content):
-		#sections = wtp.parse(wp_content).sections
-		#print(sections)
+	def parse(self, entry, sections):
+		context = []
+		for s in sections:
+			if s.title != None :
+				if len(context) == 0 or s.level > context[-1].level:
+					context.append(s)
+				else:
+					while len(context)>0 and s.level < context[-1].level:
+						context.pop()
+					context[-1] = s
+				print(s.level, debugC(context))
 		print("ok")
+
+if __name__ == "__main__":
+	ensk = En_en_straktor()
+	print(ensk.fetch("test"), "entries added")
diff --git a/wikstraktor.py b/wikstraktor.py
index 9f167b0c38bcd482ff4832f0df6b5756a7679700..0fad41a14030dc66ad9540355900cc4dd19350d0 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
-import pywikibot as pwb
-import wikitextparser as wtp
+import pywikibot
+import wikitextparser
 import importlib
 
 class Entry:
@@ -23,22 +23,26 @@ class Wikstraktor:
 
 	def __init__(self):
 		self.entries = []
+		self.pwb = pywikibot
+		self.wtp = wikitextparser
 
 	#retrieves the content of a page and processes it (adding the entries to the list of entries)
 	#returns the number of entries added
 	def fetch(self, graphy):
 		nb_entries_added = 0
-		# page = pwb.Page(self.site, graphy)
-		# if page.text != "":
-		# 	sections = wtp.parse(page.text).sections
-		# 	found = False
-		# 	i = 0
-		# 	### find language
-		# 	while i < len(sections) and not found:
-		# 		found = sections[i].title.capitalize() == self.constants[self.entry_language]
-		# 		i += 1
-		# 	if i < len(sections) and found:
-		# 		nb_entries_added = self.parse(page.title(), sections[i])
+		page = self.pwb.Page(self.site, graphy)
+		to_parse = []
+		if page.text != "":
+			sections = self.wtp.parse(page.text).sections
+			found = False
+			i = 0
+			### find language
+			while i < len(sections) and not found:
+				found = sections[i].title != None and sections[i].title.capitalize() == self.constants[self.entry_language]
+				if not found:
+					i += 1
+			if found:
+				nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections)
 		return nb_entries_added
 
 	def parse(self):
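
The new En_en_straktor.parse(entry, sections) receives the subsections of the matched language section from Wikstraktor.fetch and maintains a `context` stack of the headings enclosing the current one: it pushes on a deeper heading level and pops back before replacing the top on a shallower or equal one. The standalone sketch below is not part of the patch; it reproduces that bookkeeping with mock objects so it can be run without pywikibot or wikitextparser. MockSection, walk_sections and the sample headings are illustrative stand-ins (only .level and .title mirror attributes the patch relies on), and the pop loop here stops at one element, whereas the patch pops while len(context) > 0 and would raise an IndexError on a heading shallower than the first one seen.

#!/usr/bin/env python3
# Standalone sketch, not part of the patch: mock objects stand in for
# wikitextparser sections, of which only .level and .title are used here.
from dataclasses import dataclass

@dataclass
class MockSection:
    level: int
    title: str

def walk_sections(sections):
    context = []
    for s in sections:
        if s.title is None:
            continue
        if not context or s.level > context[-1].level:
            # deeper heading: push it onto the context stack
            context.append(s)
        else:
            # same level or shallower: pop back up, then replace the top
            # (stops at one element instead of the patch's len(context) > 0)
            while len(context) > 1 and s.level < context[-1].level:
                context.pop()
            context[-1] = s
        print(s.level, " > ".join(f"{c.level * '#'} {c.title}" for c in context))

if __name__ == "__main__":
    # roughly the shape of an English Wiktionary entry's subsections
    walk_sections([
        MockSection(3, "Etymology"),
        MockSection(3, "Pronunciation"),
        MockSection(3, "Noun"),
        MockSection(4, "Synonyms"),
        MockSection(4, "Translations"),
        MockSection(3, "Verb"),
    ])

Running the sketch prints one breadcrumb line per heading, much as the debugC helper in the patch prints the depth and the top of the context stack.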