From 7f220e33f5ce7603e91934a1cf5c67d14c4b2f7d Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Thu, 11 Jan 2024 15:02:32 +0100 Subject: [PATCH] see also redirections + cas d'un mot EN qui n'existe qu'en majuscules --- parsers/en_constants.py | 1 + parsers/en_en.py | 12 +++++++++++- parsers/fr_constants.py | 1 + parsers/fr_en.py | 9 +++++++++ wikstraklog.py | 2 ++ wikstraktor.py | 12 +++++++++--- 6 files changed, 33 insertions(+), 4 deletions(-) diff --git a/parsers/en_constants.py b/parsers/en_constants.py index d3888eb..aa7392c 100644 --- a/parsers/en_constants.py +++ b/parsers/en_constants.py @@ -12,6 +12,7 @@ string_values = { "t_infl":{"past participle of":1, "infl of":1}, "t_ex":["ux", "usex"], "t_lbl":["lb","lbl", "label"], #template for labels + "t_sa":["also","see also"], "regions":{ "UK":"United Kingdom", "United Kingdom":"United Kingdom", diff --git a/parsers/en_en.py b/parsers/en_en.py index febb56a..bdc3ce4 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -119,8 +119,18 @@ class En_en_straktor(Wikstraktor): pos = self.constants['POS'][parsedwikitext] return pos + def check_see_also(self, parsedwikitext): + templates = parsedwikitext.templates + for t in templates: + if t.normal_name() in self.constants['t_sa']: + for p in t.arguments: + self.add_redirect(p.value) + def fetch(self, graphy, follow_redirections): - return super().fetch(graphy.lower(), follow_redirections) + res = super().fetch(graphy.lower(), follow_redirections) + if res == 0: + res = super().fetch(graphy, follow_redirections) + return res if __name__ == "__main__": ensk = En_en_straktor() diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py index 5a97c53..b99999b 100644 --- a/parsers/fr_constants.py +++ b/parsers/fr_constants.py @@ -12,6 +12,7 @@ string_values = { "t_ipa":"pron", #template for transcription "t_snd":"écouter", #template for audio "t_acc":["US", "UK"], #template for accents (inutile utilise régions) +"t_sa":["voir"], 
"regions":{ "UK":"Royaume-Uni", "United Kingdom":"Royaume-Uni", diff --git a/parsers/fr_en.py b/parsers/fr_en.py index a5caab0..eba66f0 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -61,6 +61,15 @@ class Fr_en_straktor(Wikstraktor): ik += 1 return pos + def check_see_also(self, parsedwikitext): + templates = parsedwikitext.templates + for t in templates: + if t.normal_name() in self.constants['t_sa']: + for p in t.arguments: + self.add_redirect(p.value) + elif any(s+"/" in t.normal_name() for s in self.constants['t_sa']): + self.check_see_also(self.wtp.parse(self.pwb.Page(self.site, "Modèle:"+t.normal_name()))) + if __name__ == "__main__": ensk = Fr_en_straktor() print(ensk.fetch("test"), "entries added") diff --git a/wikstraklog.py b/wikstraklog.py index 74b912b..ba03fb1 100755 --- a/wikstraklog.py +++ b/wikstraklog.py @@ -20,6 +20,8 @@ class Wikstraklog: self.wx_v = wikstraktor_version self.w_l = word_language self.wk_l = wiki_language + self.cur_w = None + self.cur_pid = -1 def set_context(self, word, permanentId): self.cur_w = word diff --git a/wikstraktor.py b/wikstraktor.py index a683054..07853b2 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -569,9 +569,9 @@ class Wikstraktor: def fetch(self, graphy, follow_redirections=False): nb_entries_added = 0 page = self.pwb.Page(self.site, graphy) - to_parse = [] if page.text != "": - sections = self.wtp.parse(page.text).sections + parsedText = self.wtp.parse(page.text) + sections = parsedText.sections found = False i = 0 ### find language @@ -582,7 +582,8 @@ class Wikstraktor: if found: nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections) else: - self.log.add_log("Wikstraktor.fetch", f"“{graphy}” page not found") + self.check_see_also(parsedText) + self.log.add_log("Wikstraktor.fetch", f"“{graphy}” page not found (checked see also — {len(self.redirects)} results)", graphy, -1) #no permanentId better set to null, but database used if len(self.redirects) > 0 and 
follow_redirections: for e,p in self.redirects.items(): if not p: @@ -647,6 +648,11 @@ class Wikstraktor: def process_etymology(self, parsedwikitext): pass#in subclass + def check_see_also(self, parsedwikitext): + #parses the text for see also redirections + #adds the corresponding redirects + pass#insubclass + def add_redirect(self, redirect): if redirect not in self.redirects.keys(): self.redirects[redirect] = False -- GitLab