From 948508cceee4c8486cd97d3e9588da6626ff6a04 Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Mon, 11 Sep 2023 18:36:51 +0200 Subject: [PATCH] follow redirects --- parsers/en_constants.py | 1 + parsers/en_en.py | 1 + parsers/fr_constants.py | 1 + wikstraktor.py | 20 ++++++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/parsers/en_constants.py b/parsers/en_constants.py index 76eca3e..d3888eb 100644 --- a/parsers/en_constants.py +++ b/parsers/en_constants.py @@ -9,6 +9,7 @@ string_values = { "t_deflabel":"lb", "t_alt":"alternative spelling of", "t_alt_param":1, #number of the parameter of t_alt containing the other spelling + "t_infl":{"past participle of":1, "infl of":1}, "t_ex":["ux", "usex"], "t_lbl":["lb","lbl", "label"], #template for labels "regions":{ diff --git a/parsers/en_en.py b/parsers/en_en.py index c52dcfc..6353a02 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -79,6 +79,7 @@ class En_en_straktor(Wikstraktor): def_text = parsed_def.plain_text().strip() templates = parsed_def.templates the_def = self.parse_template_1(templates) + self.try_inflected_forms(templates) if the_def == None: the_def = self.parse_alt_spell(templates) if the_def == None: diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py index ffcde47..5a97c53 100644 --- a/parsers/fr_constants.py +++ b/parsers/fr_constants.py @@ -6,6 +6,7 @@ string_values = { "t_deflabel":["lexique", "info lex"], "t_alt":"variante de", "t_alt_param":0, #number of the parameter of t_alt containing the other spelling +"t_infl":{"en-conj-irrég":"inf", "en-conj-aux":"1", "en-conj-rég":"inf", "lien":0, "l":0}, "t_ex":"exemple", #Inexistants "t_ipa":"pron", #template for transcription diff --git a/wikstraktor.py b/wikstraktor.py index dae7979..7e1871d 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -547,6 +547,7 @@ class Wikstraktor: def __init__(self): self.entries = [] + self.redirects = {} self.pwb = pywikibot self.wtp = wikitextparser 
self.parserContext = None @@ -579,6 +580,10 @@ class Wikstraktor: nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections) else: self.log.add_log("Wikstraktor.fetch", f"“{graphy}” page not found") + if len(self.redirects) > 0: + for e,p in self.redirects.items(): + if not p: + nb_entries_added += self.fetch(self.process_redirect(e)) return nb_entries_added def parse(self, entry, v_id, sections): @@ -639,14 +644,29 @@ class Wikstraktor: def process_etymology(self, parsedwikitext): pass#in subclass + def add_redirect(self, redirect): + if redirect not in self.redirects.keys(): + self.redirects[redirect] = False + + def process_redirect(self, redirect): + if redirect in self.redirects.keys(): + self.redirects[redirect] = True + return redirect + def parse_alt_spell(self, templates): the_def = None for t in templates: if t.normal_name() == self.constants['t_alt']: the_def = Definition(self.entry_language, f"Alternate spelling of “{t.arguments[self.constants['t_alt_param']].value}”") + self.add_redirect(t.arguments[self.constants['t_alt_param']].value) break return the_def + def try_inflected_forms(self, templates): + for t in templates: + if t.normal_name() in self.constants['t_infl'].keys(): + self.add_redirect(t.arguments[self.constants['t_infl'][t.normal_name()]].value) + #can be overloaded def process_example(self, example_wiki_text): k = 0 -- GitLab