diff --git a/parsers/en_constants.py b/parsers/en_constants.py
index c933d6ab9b796b86e53f6b4ad1c9b9e96ea8eeb7..42c73cbf041d0f7da4efc20101badfac995a2301 100644
--- a/parsers/en_constants.py
+++ b/parsers/en_constants.py
@@ -8,6 +8,9 @@ string_values = {
     "t_acc":"a", #template for accents
     "t_deflabel":"lb",
     "t_ex":["ux", "usex"],
+    "sense_pattern":[ ## list pattern(s) of the senses; "add_subdef" is prefixed to "def" and "ex" to build the sub-definition patterns
+        {"def":"\\#", "ex":"\\#:", "add_subdef":"\\#"}
+    ],
     "POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS
         "Adjective":"Adj",
         "Adverb":"Adv",
diff --git a/parsers/en_en.py b/parsers/en_en.py
index d840524fe795ffeb2d0dbfa26866b46ad7d931d7..1a2fab1bad8094859a53f166b88e337f9d18e9a1 100644
--- a/parsers/en_en.py
+++ b/parsers/en_en.py
@@ -48,54 +48,6 @@ class En_en_straktor(Wikstraktor):
             pos = self.constants['POS'][parsedwikitext]
         return pos
 
-    def process_senses(self, sensesContent):
-        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
-        i = 0
-        senses = []
-        while i < len(l):
-            if l[i].pattern == '\\# ':
-                theDef = self.wtp.parse(l[i].items[0]).plain_text().strip()
-                if theDef != "":
-                    newSense = Sense(self.entry_language, theDef, self.wiki_language)
-                    #newSence.add_translation()
-            elif l[i].pattern == '\\#:':
-                for j in l[i].items:
-                    k = 0
-                    isEx = 0
-                    while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-                        if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-                            newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-                            isEx = 1
-                        k += 1
-                    if isEx == 0:
-                        newSense.add_example(self.wtp.parse(j).plain_text().strip())
-            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-                senses.append(newSense)
-            cnt = 0
-            while i < len(l) and l[i].level == 3 :
-                cnt +=1
-                if l[i].pattern == '\\## ':
-                    newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
-                    #newSense2.add_translation()
-                elif l[i].pattern == '\\##:':
-                    for j in l[i].items:
-                        k = 0
-                        isEx = 0
-                        while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-                            if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-                                newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-                                isEx = 1
-                            k += 1
-                        if isEx == 0:
-                            newSense2.add_example(self.wtp.parse(j).plain_text().strip())
-                if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-                    newSense.add_subsense(newSense2)
-                i += 1
-            if cnt > 0:
-                i -= 1
-            i += 1
-        return senses
-
 if __name__ == "__main__":
     ensk = En_en_straktor()
     print(ensk.fetch("test"), "entries added")
diff --git a/parsers/fr_en.py b/parsers/fr_en.py
index 6a18ac5750b64d4ff58ee52bb1d7d4ec3ccc1c6d..91c43743ef70931cdf9db22e27afa6ce7c9bfdd3 100644
--- a/parsers/fr_en.py
+++ b/parsers/fr_en.py
@@ -55,57 +55,6 @@ class Fr_en_straktor(Wikstraktor):
             ik += 1
         return pos
 
-    def process_example(self, example_wiki_text):
-        k = 0
-        isEx = 0
-        res = None
-        #process templates
-        while k < len(self.wtp.parse(example_wiki_text).templates) and isEx == 0 :
-            if (self.wtp.parse(example_wiki_text).templates[k].normal_name() in self.constants['t_ex']):
-                res = self.wtp.parse(example_wiki_text).templates[0].arguments[-1].value
-                isEx = 1
-            k += 1
-        if isEx == 0:
-            res = self.wtp.parse(example_wiki_text).plain_text().strip()
-        return res
-
-    def process_definition(self, definition, sub_items, def_level = True):
-        if def_level:
-            newSense = Sense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language)
-            pattern_ex = self.constants['sense_pattern'][0]["ex"]
-            pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"]
-        else:
-            newSense = SubSense(self.entry_language, self.wtp.parse(item).plain_text().strip(), self.wiki_language)
-            pattern_subdef = None
-            pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"]
-        #Process examples
-        a = 0
-        #print(newSense, sub_items)# DEBUG:
-        for item_list in sub_items:
-            if item_list.pattern == pattern_ex:
-                for item in item_list.items:
-                    newSense.add_example(self.process_example(item))
-                    #Si on veut traiter les sous items (ex traductions), on peut utiliser
-                    #item_list.sublists(a)
-            if def_level and item_list.pattern == pattern_subdef:
-                for item in item_list.items:
-                    newSense.add_subsense(self.process_definition(item, item_list.sublists(a), False))
-                    a += 1
-        return newSense
-
-    def process_senses(self, sensesContent):
-        l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"]))
-        senses = []
-        if len(l) > 1:
-            self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================")
-        l = l[0] #l now contains a list of list items
-        if l.pattern == self.constants['sense_pattern'][0]["def"]:
-            i = 0
-            for item in l.items:
-                senses.append(self.process_definition(item, l.sublists(i)))
-                i += 1
-        return senses
-
 if __name__ == "__main__":
     ensk = Fr_en_straktor()
     print(ensk.fetch("test"), "entries added")
diff --git a/wikstraktor.py b/wikstraktor.py
index bf2317c9a39f23446dbaa834409c5450d0d6f8db..fec1efb131518ec026be156fa3cdd4f42538cf62 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -572,14 +572,77 @@ class Wikstraktor:
     def process_etymology(self, parsedwikitext):
         pass#in subclass
 
+    #can be overloaded
     def process_example(self, example_wiki_text):
-        pass#in subclass
+        """Return the text of one example list item.
+
+        The item is parsed once. When it contains a template whose
+        normalised name is listed in self.constants['t_ex'], the value of
+        the last argument of the first such template is returned;
+        otherwise the stripped plain text of the item is returned.
+        """
+        parsed = self.wtp.parse(example_wiki_text)
+        for template in parsed.templates:
+            if template.normal_name() in self.constants['t_ex']:
+                #use the template that matched, not always templates[0]
+                return template.arguments[-1].value
+        return parsed.plain_text().strip()
 
+    #can be overloaded
     def process_definition(self, definition, sub_items, def_level = True):
-        pass#in subclass
-
-    def process_senses(self, parsedwikitext):
-        pass#in subclass
+        """Build the Sense (or SubSense) described by a definition item.
+
+        definition -- wikitext of the definition list item
+        sub_items -- lists nested under that item; lists matching the
+            example pattern provide examples and, for a top-level
+            definition, lists matching the sub-definition pattern
+            provide subsenses
+        def_level -- True for a top-level definition (a Sense is built),
+            False for a sub-definition (a SubSense is built and nested
+            sub-definitions are not processed)
+        """
+        patterns = self.constants['sense_pattern'][0]
+        text = self.wtp.parse(definition).plain_text().strip()
+        if def_level:
+            newSense = Sense(self.entry_language, text, self.wiki_language)
+            pattern_ex = patterns["ex"]
+            pattern_subdef = patterns["add_subdef"] + patterns["def"]
+        else:
+            newSense = SubSense(self.entry_language, text, self.wiki_language)
+            pattern_subdef = None
+            pattern_ex = patterns["add_subdef"] + patterns["ex"]
+        for item_list in sub_items:
+            if item_list.pattern == pattern_ex:
+                for item in item_list.items:
+                    newSense.add_example(self.process_example(item))
+                    #to process the sub items (e.g. translations),
+                    #item_list.sublists(<index of item>) can be used
+            if def_level and item_list.pattern == pattern_subdef:
+                #the item index given to sublists() restarts with every list
+                for a, item in enumerate(item_list.items):
+                    newSense.add_subsense(self.process_definition(item, item_list.sublists(a), False))
+        return newSense
+
+    def process_senses(self, sensesContent):
+        """Return the list of senses found in sensesContent.
+
+        Only the first list matching the definition pattern is processed;
+        any further list is reported in the log and ignored. An empty list
+        is returned when no definition list is found.
+        """
+        pattern_def = self.constants['sense_pattern'][0]["def"]
+        sense_lists = sensesContent.get_lists(pattern_def)
+        senses = []
+        if not sense_lists:
+            #nothing to process (sense_lists[0] would raise IndexError)
+            return senses
+        if len(sense_lists) > 1:
+            self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{sense_lists[1:]}\n===================")
+        first_list = sense_lists[0]
+        if first_list.pattern == pattern_def:
+            for i, item in enumerate(first_list.items):
+                senses.append(self.process_definition(item, first_list.sublists(i)))
+        return senses
 
     def __str__(self):
         return self.export()