From 90eac4c67d49bab482deb5a202d94edee73bab4c Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Tue, 30 May 2023 12:24:12 +0200 Subject: [PATCH] process 1 and lbl templates in English --- parsers/en_constants.py | 32 +++++++++++++++++++++++++++ parsers/en_en.py | 48 +++++++++++++++++++++++++++++++++++++++- wikstraktor.py | 28 +++++++++++++++++++---- wikstraktor.sqlite | Bin 8192 -> 16384 bytes 4 files changed, 103 insertions(+), 5 deletions(-) diff --git a/parsers/en_constants.py b/parsers/en_constants.py index de100b7..c13e689 100644 --- a/parsers/en_constants.py +++ b/parsers/en_constants.py @@ -8,6 +8,38 @@ string_values = { "t_acc":"a", #template for accents "t_deflabel":"lb", "t_ex":["ux", "usex"], + "t_lbl":["lb","lbl", "label"], #template for labels + "regions":{ + "UK":"United Kingdom", + "United Kingdom":"United Kingdom", + "British":"Great Britain", + "GB":"Great Britain", + "Great Britain":"Great Britain", + "Scot":"Scotland", + "Scottish":"Scotland", + "Scotland":"Scotland", + "Irl":"Ireland", + "Irish":"Ireland", + "Ireland":"Ireland", + "Ulst":"Northern Ireland", + "Ulster":"Northern Ireland", + "Northern Ireland":"Northern Ireland", + "Wls":"Wales", + "Welsh":"Wales", + "Wales":"Wales", + "English":"England", + "Eng":"England", + "En":"England", + "England":"England", + "US":"United States of America", + "USA":"United States of America", + "United States":"United States of America", + "United States of America":"United States of America", + "NZ":"New Zealand", + "New Zealand":"New Zealand", + "Au":"Australia", + "AU":"Australia", + "Australia":"Australia"}, "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns {"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"} ], diff --git a/parsers/en_en.py b/parsers/en_en.py index 1a2fab1..db91422 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from wikstraktor import Wikstraktor, Pronunciation, 
Sense, SubSense
+from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense, Definition
 
 from parsers.en_constants import string_values
 
@@ -42,6 +42,52 @@ class En_en_straktor(Wikstraktor):
 		debugEty += 1
 		return "Etymology" + str(debugEty)
 
+	def parse_template_1(self, templates): #builds a Definition from the first "1" template that has an argument, returns None if there is none
+		the_def = None
+		for t in templates:
+			if t.normal_name() == "1" and t.arguments: #an argument-less template has nothing to quote (and arguments[0] would raise IndexError)
+				the_def = Definition(self.entry_language, f"Other wording of “{t.arguments[0].value}”")
+				break
+		return the_def
+
+	def parse_labels(self, a_def, templates): #stores the arguments of each label template (constants['t_lbl']) in the metadata of a_def
+		key = "labels"
+		desc = "language"
+		num = 0
+		for t in templates:
+			if t.normal_name() in self.constants['t_lbl'] and t.arguments: #a label template without arguments carries nothing to store
+				while a_def.metadata_exists(f"{key}_{num}_{desc}"): #look for the first index not used by a previous label template
+					num+=1
+				a_def.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value) #first argument goes under "labels_<num>_language"
+				complete_previous = False
+				for a in t.arguments[1:]:
+					if a.value == "_": #"_": the next plain value is appended (after a space) to the previous one
+						complete_previous = True
+					elif a.value == "and": #"and" is dropped
+						pass
+					elif a.value in self.constants['regions']: #known region names are normalised and stored under "region"
+						a_def.add_to_metadata("region", self.constants['regions'][a.value])
+					elif complete_previous:
+						a_def.extend_metadata(f"{key}_{num}", a.value, " ")
+						complete_previous = False
+					else:
+						a_def.add_to_metadata(f"{key}_{num}", a.value)
+
+	def parse_definition(self, def_string): #returns the Definition (labels included) found in def_string, raises ValueError if neither plain text nor a "1" template gives one
+		the_def = None
+		parsed_def = self.wtp.parse(def_string)
+		def_text = parsed_def.plain_text().strip()
+		templates = parsed_def.templates
+		if def_text != "":
+			the_def = Definition(self.entry_language, def_text)
+		else: #no plain text: fall back on the content of a "1" template
+			the_def = self.parse_template_1(templates)
+		if the_def is not None:
+			self.parse_labels(the_def, templates)
+		else:
+			raise ValueError(f"En_en_straktor.parse_definition with empty definition\n\t{def_string}")
+		return the_def
+
 	def process_POS(self,parsedwikitext):
 		pos = None
 		if parsedwikitext in self.constants['POS'].keys():
diff --git a/wikstraktor.py b/wikstraktor.py
index 6339324..20391a0 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -122,9 +122,26 @@ class Definition(SubInfo):
 			raise ValueError(f"Definition.__init__: “{text}” empty 
definition.")
 
 	def add_metadata(self, key, value):
-		if key in self.metadata.keys():
-			self.log.add_log("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata['key']}” by {key}:“{value}”")
-		self.metadata["key"]=value
+		if self.metadata_exists(key): #an existing value is overwritten, the replacement is reported on stdout
+			print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}” by {key}:“{value}”")
+		self.metadata[key]=value
+
+	def add_to_metadata(self, key, value): #appends value to the list stored under key (the list is created if the key is missing)
+		if not (self.metadata_exists(key) and isinstance(self.metadata[key], list)): #a plain value left by add_metadata/extend_metadata becomes the first item instead of failing on .append
+			self.metadata[key] = [self.metadata[key]] if self.metadata_exists(key) else []
+		self.metadata[key].append(value)
+
+	#appends separator+value to the metadata stored under key (to its last item if it is a list); a missing key is created with add_metadata (plain value), not add_to_metadata (list)
+	def extend_metadata(self, key, value, separator=""):
+		if not self.metadata_exists(key):
+			self.add_metadata(key, value)
+		elif isinstance(self.metadata[key], list):
+			self.metadata[key][-1] += separator+value
+		else:
+			self.metadata[key] += separator+value
+
+	def metadata_exists(self, key): #True if key already has a value in self.metadata
+		return key in self.metadata.keys()
 
 	def __eq__(self, other):
 		return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text
@@ -204,7 +221,10 @@ class Sense(SubInfo):
 		self.domain = d
 
 	def add_def(self, lang, definition):
-		theDef = Definition(lang, definition)
+		if isinstance(definition, Definition): #an already built Definition is used as is (lang is then ignored)
+			theDef = definition
+		else: #otherwise definition is passed to the Definition constructor together with lang
+			theDef = Definition(lang, definition)
 		if theDef != None and theDef not in self.definitions:
 			theDef.set_id(self.set_id())
 			self.definitions.append(theDef)
diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite
index b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90..72c4e0fbff96b7a894f0b3423fdf8ec50dc36376 100644
GIT binary patch
literal 16384
zcmeI2&rj1(9Kc%`KQ<UYL;;n=mvPa|k<#vGjfn%QK?mw6;z6=>>jN4}yL6Sruq9x;
zn0VHUUi42e-u2?mzrY(8{{cOD@fGA{D}l&LafffSu0P&;&8MGz+rHQD79Qp_8>1D|
z+E8pHFar$BGPe+77-o#j0Wu#<ELm}HACw)n9AhSy?uJPz+xM9vADF-Z7ytuc01SWu
zFaQR?02lxRU;qp>W#IWuwrga3oPCkCm7<Q9wDpQ@DeJnqTDRIZotLt*gyig0PC}8o
zk_fsQ3PxrW8%KgOq>4N{Ck1Du+u6mOjHVa!d1+oQI6D{Q?A(Lvr0zju+q4Rs*s5rz
z5edrDV;RlgC-Y)1w^z%w)Pk-Ut520xa_Gb5+PZdZxzpYmlpoc$jLCrAz=mDW)JSl4 zUY71i`38h(Mc1)a*ueIhNvb#q4%xJ6*reJ1j?lFZVx8^(?YIx@FPi!ppb;1V17H9Q zfB`T72EYIq00UqE41fVJ(6R>Fhn?K~F!Pb=ztOiD{u&<bUFi7``W2FbZ@PbUXS!Z? ze(#KS)B@iESK7_CFKvTtiB$f-(@tzSBrFdO4h3d+dU%c(Vq797aA=Yfxr{iOR5-Pi zN+r`7uE<TQLIxLcG8q^61o`5e?L;x2Or_$Oygsq88d!d`UH4LFqHLKZT&WaPyrLPJ zP2TV9ZofraTY9mIjjCdm))cK&tC2m}h?SL!jm(uQIS#JXcDLW5zs<3VlnTNdW&7Dq z^aADWa`C3c1n(2CqTARq6kFTGs8Ytdt{JNiDme*4s~$-kJ5Oodc73GvV0Vb}9=6N3 zoZWMj_OM;wXg$~+q`dWZ$%U54h#Ys)*nRTq)4yD6O~>v4rLDK?Gpz@^{gl^jH!X5P ztJ&?Nv~IgT(t5BPro3*uDN$%SyS<dwZP!Oy4|aPfuiI`?<WBq)_WRBdqO@+iKGJ%y z8>GB$yKym|X|+2;H>Gvk^^w+t-7d=OvdeQ~qUB<@lhV5E`bpco-Kh@Bo2cvWlBcl$ zyt5W2X;-znc&}Q-BtJ{iua&B1?ESx4-|MwcI^AES;|<eLj_b4s=t(q$NLq}iL_XeX z7hpT3Z3q#6X+4BU8|7^Xk+a}EwGf$Nk35OE$R~UXk$vf>brR0_zoYjx)1U8qAO0Dh zA*(Qf0WbgtzyKHk17H9QfB`T72F}pHPV_PzTGfxfNs5mbnn*kRV~Vfow8JGzTR-~t onYQ`Scl09Vb=f7zt(TA-rL-=)e$sldJ3@I~cKu9GHyOKs0C0TLEC2ui delta 37 rcmZo@U~F)hAkE6iz`(#XQNf;(abv;)el}hpmqB2&puk)Hi2=L-nEVI` -- GitLab