diff --git a/parsers/en_constants.py b/parsers/en_constants.py index c13e689fb36170cb665a64b9e0ea323ab20707f6..aab251e2caa54dc289207e1a0540bd3e09fcea8d 100644 --- a/parsers/en_constants.py +++ b/parsers/en_constants.py @@ -31,6 +31,10 @@ string_values = { "Eng":"England", "En":"England", "England":"England", + "Canada":"Canada", + "Canadian":"Canada", + 'North American':'North America', + 'North America':"North America", "US":"United States of America", "USA":"United States of America", "United States":"United States of America", @@ -39,7 +43,11 @@ string_values = { "New Zealand":"New Zealand", "Au":"Australia", "AU":"Australia", - "Australia":"Australia"}, + "Australia":"Australia", + "India":"India", + "Indian":"India", + "Nigeria":"Nigeria", + "Nigerian":"Nigeria"}, "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns {"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"} ], diff --git a/parsers/en_en.py b/parsers/en_en.py index db91422f3733a1e2e0c90c336bfa9463d418be8c..e2a931639973aa7554e09fdbd343decd39873d69 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -50,15 +50,15 @@ class En_en_straktor(Wikstraktor): break return the_def - def parse_labels(self, a_def, templates): + def parse_labels(self, a_sense, templates): key = "labels" desc = "language" num = 0 for t in templates: if t.normal_name() in self.constants['t_lbl']: - while a_def.metadata_exists(f"{key}_{num}_{desc}"): + while a_sense.metadata_exists(f"{key}_{num}_{desc}"): num+=1 - a_def.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value) + a_sense.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value) complete_previous = False for a in t.arguments[1:]: if a.value == "_": @@ -66,28 +66,32 @@ class En_en_straktor(Wikstraktor): elif a.value == "and": pass elif a.value in self.constants['regions'].keys(): - a_def.add_to_metadata("region", self.constants['regions'][a.value]) + a_sense.add_region(self.constants['regions'][a.value]) elif complete_previous: - a_def.extend_metadata(f"{key}_{num}", a.value, " ") + a_sense.extend_metadata(f"{key}_{num}", a.value, " ") complete_previous = False else: - a_def.add_to_metadata(f"{key}_{num}", a.value) + a_sense.add_to_metadata(f"{key}_{num}", a.value) - def parse_definition(self, def_string): - the_def = None - parsed_def = self.wtp.parse(def_string) + def parse_definition(self, parsed_def): + if not isinstance(parsed_def, self.wtp.WikiText): + parsed_def = self.wtp.parse(parsed_def) def_text = parsed_def.plain_text().strip() templates = parsed_def.templates - if def_text != "": + the_def = self.parse_template_1(templates) + if the_def == None: the_def = Definition(self.entry_language, def_text) - else: - the_def = self.parse_template_1(templates) - if the_def != None: - self.parse_labels(the_def, templates) - else: + if the_def == None: raise ValueError(f"En_en_straktor.parse_definition with empty definition\n\t{def_string}") return the_def + def get_sense_metadata(self, sense, parsed_def): + if not isinstance(parsed_def, self.wtp.WikiText): + parsed_def = self.wtp.parse(parsed_def) + templates = parsed_def.templates + self.parse_labels(sense, templates) + + def process_POS(self,parsedwikitext): pos = None if parsedwikitext in self.constants['POS'].keys(): diff --git a/wikstraktor.py b/wikstraktor.py index 20391a0c1f4c5e96fda7494406ba2782b84cf3f2..66e187abda8fc96e86dcaaaa62ca2dd5e7bad513 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -117,32 +117,9 @@ class Definition(SubInfo): if text != "": self.lang = lang self.text = text - self.metadata = {} else: raise ValueError(f"Definition.__init__: “{text}†empty definition.") - def add_metadata(self, key, value): - if self.metadata_exists(key): - print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}†by {key}:“{value}â€") - self.metadata[key]=value - - def add_to_metadata(self, key, value): - if not self.metadata_exists(key): - self.metadata[key] = [] - self.metadata[key].append(value) - - #to add at the end of the metadata, if empty add_metadata not add_to_metadata - def extend_metadata(self, key, value, separator=""): - if not self.metadata_exists(key): - self.add_metadata(key, value) - elif type(self.metadata[key]) == list: - self.metadata[key][-1] += separator+value - else: - self.metadata[key] += separator+value - - def metadata_exists(self, key): - return key in self.metadata.keys() - def __eq__(self, other): return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text @@ -150,8 +127,6 @@ class Definition(SubInfo): res = super().serializable(prefix) res["lang"] = self.lang res[self.__class__.key] = self.text - if len(self.metadata.keys()) > 0 : - res["metadata"] = self.metadata return res class Translation(Definition): @@ -202,12 +177,39 @@ class Sense(SubInfo): self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens + self.metadata = {} + self.regions = set() if definition != None: try: self.add_def(wiki_lang, definition) except ValueError as err: raise ValueError(f"Sense.__init__() with empty definition\n{err}") + def add_metadata(self, key, value): + if self.metadata_exists(key): + print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}†by {key}:“{value}â€") + self.metadata[key]=value + + def add_to_metadata(self, key, value): + if not self.metadata_exists(key): + self.metadata[key] = [] + self.metadata[key].append(value) + + def add_region(self, region): + self.regions.add(region) + + #to add at the end of the metadata, if empty add_metadata not add_to_metadata + def extend_metadata(self, key, value, separator=""): + if not self.metadata_exists(key): + self.add_metadata(key, value) + elif type(self.metadata[key]) == list: + self.metadata[key][-1] += separator+value + else: + self.metadata[key] += separator+value + + def metadata_exists(self, key): + return key in self.metadata.keys() + def set_id(self, prefix=None): if prefix != None and self.label == None: self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens @@ -251,7 +253,7 @@ class Sense(SubInfo): self.subsenses.append(subsense) def __eq__(self, other): - res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain + res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions i = 0 while res and i < len(self.examples): res = self.examples[i] in other.examples @@ -268,20 +270,30 @@ class Sense(SubInfo): while res and i < len(self.subsenses): res = self.subsenses[i] in other.subsenses i+=1 + i = 0 + l = list(self.metadata.keys()) + while res and i < len(l): + res = l[i] in other.metadata.keys() and type(self.metadata[l[i]]) == type(other.metadata[l[i]]) + if res and type(self.metadata[l[i]]) == list and len(self.metadata[l[i]]) == len(other.metadata[l[i]]): + j = 0 + while res and j < len(self.metadata[l[i]]): + res = self.metadata[l[i]][j] in other.metadata[l[i]] + j+=1 + i+=1 return res def serializable(self, prefix = None): res = {} if self.domain != None: res["Domain"] = self.domain + if len(self.regions) > 0: + res['Regions'] = list(self.regions) if len(self.definitions) > 0: res["Definitions"] = [] for d in self.definitions: res["Definitions"].append(d.serializable(prefix)) - if len(self.subsenses) > 0: - res["Subsenses"] = {} - for t in self.subsenses: - res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix) + if len(self.metadata.keys()) > 0 : + res["Metadata"] = self.metadata if len(self.examples) > 0 : res["Examples"] = [] for e in self.examples: @@ -290,6 +302,10 @@ class Sense(SubInfo): res["Translations"] = [] for t in self.translations: res["Translations"].append(t.serializable(prefix)) + if len(self.subsenses) > 0: + res["Subsenses"] = {} + for t in self.subsenses: + res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix) return res def __str__(self): @@ -616,19 +632,30 @@ class Wikstraktor: return res #can be overloaded - def parse_definition(self, definition): - return self.wtp.parse(definition).plain_text().strip() + def parse_definition(self, definition_wikitext): + if type(definition_wikitext) == str: + res = self.wtp.parse(definition_wikitext).plain_text().strip() + elif isinstance(definition_wikitext, wikitextparser.WikiText): + res = definition_wikitext.plain_text().strip() + return res + + #can be overloaded + def get_sense_metadata(self, sense, definition_wikitext): + pass #can be overloaded def process_definition(self, definition, sub_items, def_level = True): #does not process wk_en quotations try: + parsed_def = self.wtp.parse(definition) if def_level: - newSense = Sense(self.entry_language, self.parse_definition(definition), self.wiki_language) + newSense = Sense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language) + self.get_sense_metadata(newSense, parsed_def) pattern_ex = self.constants['sense_pattern'][0]["ex"] pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"] else: - newSense = SubSense(self.entry_language, self.parse_definition(definition), self.wiki_language) + newSense = SubSense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language) + self.get_sense_metadata(newSense, parsed_def) pattern_subdef = None pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"] #Process examples diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite index 72c4e0fbff96b7a894f0b3423fdf8ec50dc36376..cf340034be915fbe4d55478064deb092757f3e55 100644 Binary files a/wikstraktor.sqlite and b/wikstraktor.sqlite differ