diff --git a/parsers/en_constants.py b/parsers/en_constants.py
index de100b7f77d7768038dffe54a27f6208c1224c09..c13e689fb36170cb665a64b9e0ea323ab20707f6 100644
--- a/parsers/en_constants.py
+++ b/parsers/en_constants.py
@@ -8,6 +8,38 @@ string_values = {
     "t_acc":"a", #template for accents
     "t_deflabel":"lb",
     "t_ex":["ux", "usex"],
+    "t_lbl":["lb","lbl", "label"], #template for labels
+    "regions":{ #label argument -> canonical region name
+        "UK":"United Kingdom",
+        "United Kingdom":"United Kingdom",
+        "British":"Great Britain",
+        "GB":"Great Britain",
+        "Great Britain":"Great Britain",
+        "Scot":"Scotland",
+        "Scottish":"Scotland",
+        "Scotland":"Scotland",
+        "Irl":"Ireland",
+        "Irish":"Ireland",
+        "Ireland":"Ireland",
+        "Ulst":"Northern Ireland",
+        "Ulster":"Northern Ireland",
+        "Northern Ireland":"Northern Ireland",
+        "Wls":"Wales",
+        "Welsh":"Wales",
+        "Wales":"Wales",
+        "English":"England",
+        "Eng":"England",
+        "En":"England",
+        "England":"England",
+        "US":"United States of America",
+        "USA":"United States of America",
+        "United States":"United States of America",
+        "United States of America":"United States of America",
+        "NZ":"New Zealand",
+        "New Zealand":"New Zealand",
+        "Au":"Australia",
+        "AU":"Australia",
+        "Australia":"Australia"},
     "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns
         {"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"}
     ],
diff --git a/parsers/en_en.py b/parsers/en_en.py
index 1a2fab1bad8094859a53f166b88e337f9d18e9a1..db91422f3733a1e2e0c90c336bfa9463d418be8c 100644
--- a/parsers/en_en.py
+++ b/parsers/en_en.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense
+from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense, Definition
 
 from parsers.en_constants import string_values
 
@@ -42,6 +42,72 @@ class En_en_straktor(Wikstraktor):
             debugEty += 1
         return "Etymology" + str(debugEty)
 
+    def parse_template_1(self, templates):
+        """Build a Definition from the first {{1|...}} template, if any.
+
+        Returns None when no usable {{1}} template is found.
+        """
+        the_def = None
+        for t in templates:
+            # An argument-less {{1}} is skipped instead of raising IndexError
+            if t.normal_name() == "1" and t.arguments:
+                the_def = Definition(self.entry_language, f"Other wording of “{t.arguments[0].value}”")
+                break
+        return the_def
+
+    def parse_labels(self, a_def, templates):
+        """Copy the label templates found in templates into a_def's metadata.
+
+        The first argument of each label template is stored under
+        labels_<n>_language; the following ones are appended to labels_<n>,
+        except known regions (stored under "region"), "and" (ignored) and
+        "_" (joins the next label to the previous one with a space).
+        """
+        key = "labels"
+        desc = "language"
+        num = 0
+        for t in templates:
+            # A label template without arguments carries no information
+            if t.normal_name() in self.constants['t_lbl'] and t.arguments:
+                while a_def.metadata_exists(f"{key}_{num}_{desc}"):
+                    num+=1
+                a_def.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value)
+                complete_previous = False
+                for a in t.arguments[1:]:
+                    if a.value == "_":
+                        complete_previous = True
+                    elif a.value == "and":
+                        pass
+                    elif a.value in self.constants['regions']:
+                        a_def.add_to_metadata("region", self.constants['regions'][a.value])
+                        # the "_" applied to the region: do not glue the next label
+                        complete_previous = False
+                    elif complete_previous:
+                        a_def.extend_metadata(f"{key}_{num}", a.value, " ")
+                        complete_previous = False
+                    else:
+                        a_def.add_to_metadata(f"{key}_{num}", a.value)
+
+    def parse_definition(self, def_string):
+        """Parse a wikitext definition string into a Definition with its labels.
+
+        Raises ValueError when neither the plain text nor a {{1}} template
+        yields a definition.
+        """
+        the_def = None
+        parsed_def = self.wtp.parse(def_string)
+        def_text = parsed_def.plain_text().strip()
+        templates = parsed_def.templates
+        if def_text != "":
+            the_def = Definition(self.entry_language, def_text)
+        else:
+            the_def = self.parse_template_1(templates)
+        if the_def is not None:
+            self.parse_labels(the_def, templates)
+        else:
+            raise ValueError(f"En_en_straktor.parse_definition with empty definition\n\t{def_string}")
+        return the_def
+
     def process_POS(self,parsedwikitext):
         pos = None
         if parsedwikitext in self.constants['POS'].keys():
diff --git a/wikstraktor.py b/wikstraktor.py
index 6339324973615347940bed782bf2598fb9d98122..20391a0c1f4c5e96fda7494406ba2782b84cf3f2 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -122,9 +122,35 @@ class Definition(SubInfo):
             raise ValueError(f"Definition.__init__: “{text}” empty definition.")
 
     def add_metadata(self, key, value):
-        if key in self.metadata.keys():
-            self.log.add_log("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata['key']}” by {key}:“{value}”")
-        self.metadata["key"]=value
+        """Set metadata[key] to value, printing a notice if a value is replaced."""
+        if self.metadata_exists(key):
+            print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}” by {key}:“{value}”")
+        self.metadata[key]=value
+
+    def add_to_metadata(self, key, value):
+        """Append value to the list stored under key, creating it if needed."""
+        if not self.metadata_exists(key):
+            self.metadata[key] = []
+        elif not isinstance(self.metadata[key], list):
+            # add_metadata/extend_metadata may have stored a bare value here
+            self.metadata[key] = [self.metadata[key]]
+        self.metadata[key].append(value)
+
+    def extend_metadata(self, key, value, separator=""):
+        """Concatenate value to the existing (or last listed) entry under key.
+
+        If key is not set yet, falls back to add_metadata, not add_to_metadata.
+        """
+        if not self.metadata_exists(key):
+            self.add_metadata(key, value)
+        elif isinstance(self.metadata[key], list):
+            self.metadata[key][-1] += separator+value
+        else:
+            self.metadata[key] += separator+value
+
+    def metadata_exists(self, key):
+        """Tell whether key is already present in the metadata."""
+        return key in self.metadata
 
     def __eq__(self, other):
         return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text
@@ -204,7 +230,11 @@ class Sense(SubInfo):
         self.domain = d
 
     def add_def(self, lang, definition):
-        theDef = Definition(lang, definition)
+        # Accept an already-built Definition as well as raw definition text
+        if isinstance(definition, Definition):
+            theDef = definition
+        else:
+            theDef = Definition(lang, definition)
         if theDef != None and theDef not in self.definitions:
             theDef.set_id(self.set_id())
             self.definitions.append(theDef)
diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite
index b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90..72c4e0fbff96b7a894f0b3423fdf8ec50dc36376 100644
Binary files a/wikstraktor.sqlite and b/wikstraktor.sqlite differ