From da8f8f5c5d9ac82c2ede2cc9f08c81a0f6f8efff Mon Sep 17 00:00:00 2001
From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>
Date: Sat, 25 Mar 2023 15:47:55 +0100
Subject: [PATCH] Improve stack to handle === Noun === === Adj === === Pronunciations ===

---
 blue.json               |  0
 parsers/fr_constants.py |  6 ++-
 parsers/fr_en.py        | 86 +++++++++++++++++++++-------------
 wikstraktor.py          | 80 +++++++++++++++++++++++++------------
 wikstraktor.sqlite      | Bin 0 -> 8192 bytes
 5 files changed, 105 insertions(+), 67 deletions(-)
 delete mode 100644 blue.json
 create mode 100644 wikstraktor.sqlite

diff --git a/blue.json b/blue.json
deleted file mode 100644
index e69de29..0000000
diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py
index cbf8212..73d6812 100644
--- a/parsers/fr_constants.py
+++ b/parsers/fr_constants.py
@@ -9,7 +9,9 @@ string_values = {
 "t_ipa":"pron", #template for transcription
 "t_snd":"écouter", #template for audio
 "t_acc":["US", "UK"], #template for accents
-
+"sense_pattern":[ ## structure(s) for sense patterns; add_subdef is to be prepended to def/ex patterns
+    {"def":"\\#", "ex":"\\#\\*", "add_subdef":"\\#"}
+],
 "POS":{
     "adjectif":["adjectif","adjectif qualificatif","adj"],
     "adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
@@ -77,5 +79,5 @@ string_values = {
 "variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
 "verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
 "verbe":["verbe","verb"]
-    }
+    }
 }
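Note on the new constant above: "sense_pattern" centralises the wikitext list patterns the parser hands to wikitextparser — "\#" selects top-level definition items, "\#\*" their example items, and "add_subdef" is prepended to either one to reach the next depth ("\#\#", "\#\#\*"). Below is a minimal sketch of how such patterns behave with wikitextparser's get_lists()/sublists(); it is illustrative only and not part of the patch — the wikitext snippet and variable names are invented, only the pattern values come from fr_constants.py.

import wikitextparser as wtp

sense_pattern = [{"def": "\\#", "ex": "\\#\\*", "add_subdef": "\\#"}]

# Invented sample section: one definition, one example, one sub-definition.
section = wtp.parse(
    "# A colour whose hue is that of a clear sky.\n"
    "#* ''The sky is '''blue'''.''\n"
    "## A pale shade of this colour.\n"
)

# "\#" returns the top-level definition list(s) of the section.
for def_list in section.get_lists(sense_pattern[0]["def"]):
    for i, item in enumerate(def_list.items):
        print("definition:", item.strip())
        # sublists(i) yields the lists nested under item i:
        # pattern "\#\*" -> examples, "\#\#" -> sub-definitions.
        for sub in def_list.sublists(i):
            print("  ", sub.pattern, "->", [s.strip() for s in sub.items])

Keeping the patterns in fr_constants.py means a parser for another wiki only has to swap the pattern strings, not the traversal code.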
diff --git a/parsers/fr_en.py b/parsers/fr_en.py
index fcaa931..6a18ac5 100644
--- a/parsers/fr_en.py
+++ b/parsers/fr_en.py
@@ -55,49 +55,55 @@ class Fr_en_straktor(Wikstraktor):
             ik += 1
         return pos
 
+    def process_example(self, example_wiki_text):
+        k = 0
+        isEx = 0
+        res = None
+        #process templates
+        while k < len(self.wtp.parse(example_wiki_text).templates) and isEx == 0 :
+            if (self.wtp.parse(example_wiki_text).templates[k].normal_name() in self.constants['t_ex']):
+                res = self.wtp.parse(example_wiki_text).templates[0].arguments[-1].value
+                isEx = 1
+            k += 1
+        if isEx == 0:
+            res = self.wtp.parse(example_wiki_text).plain_text().strip()
+        return res
+
+    def process_definition(self, definition, sub_items, def_level = True):
+        if def_level:
+            newSense = Sense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language)
+            pattern_ex = self.constants['sense_pattern'][0]["ex"]
+            pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"]
+        else:
+            newSense = SubSense(self.entry_language, self.wtp.parse(item).plain_text().strip(), self.wiki_language)
+            pattern_subdef = None
+            pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"]
+        #Process examples
+        a = 0
+        #print(newSense, sub_items)# DEBUG:
+        for item_list in sub_items:
+            if item_list.pattern == pattern_ex:
+                for item in item_list.items:
+                    newSense.add_example(self.process_example(item))
+                    #If we want to process sub-items (e.g. translations), we can use
+                    #item_list.sublists(a)
+            if def_level and item_list.pattern == pattern_subdef:
+                for item in item_list.items:
+                    newSense.add_subsense(self.process_definition(item, item_list.sublists(a), False))
+            a += 1
+        return newSense
+
     def process_senses(self, sensesContent):
-        l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
-        i = 0
+        l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"]))
         senses = []
-        while i < len(l):
-            if l[i].pattern == '\\# ':
-                newSense = Sense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
-                #Enzo's version adds several defs (for i in l[i].items)
-            elif l[i].pattern == '\\#:':
-                for j in l[i].items:
-                    k = 0
-                    isEx = 0
-                    while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-                        if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-                            newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-                            isEx = 1
-                        k += 1
-                    if isEx == 0:
-                        newSense.add_example(self.wtp.parse(j).plain_text().strip())
-            if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-                senses.append(newSense)
-                cnt = 0
-                while i < len(l) and l[i].level == 3 :
-                    cnt +=1
-                    if l[i].pattern == '\\## ':
-                        newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
-                    elif l[i].pattern == '\\##:':
-                        for j in l[i].items:
-                            k = 0
-                            isEx = 0
-                            while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-                                if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-                                    newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-                                    isEx = 1
-                                k += 1
-                            if isEx == 0:
-                                newSense2.add_example(self.wtp.parse(j).plain_text().strip())
-                    if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-                        newSense.add_subsense(newSense2)
+        if len(l) > 1:
+            self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================")
+        l = l[0] #l now contains a list of list items
+        if l.pattern == self.constants['sense_pattern'][0]["def"]:
+            i = 0
+            for item in l.items:
+                senses.append(self.process_definition(item, l.sublists(i)))
                 i += 1
-            if cnt > 0:
-                i -= 1
-            i += 1
         return senses
 
 if __name__ == "__main__":
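The three new methods above replace the old monolithic loop: process_senses() now fetches the single top-level definition list, process_definition() builds a Sense (or SubSense) per item and walks its sublists for examples and sub-definitions, and process_example() keeps the last argument of an example template, falling back to the plain text of the line. (One caveat worth noting: the SubSense branch parses `item`, a name that is not defined at that point; `definition` looks like the intended variable.) Below is a stand-alone sketch of the example-extraction fallback, for illustration only; the template name "exemple" is an assumption here, the real list lives in self.constants['t_ex'].

import wikitextparser as wtp

T_EX = ["exemple"]  # assumed template name; the actual value comes from the parser's constants

def extract_example(example_wiki_text):
    parsed = wtp.parse(example_wiki_text)
    # If an example template is present, keep its last argument...
    for template in parsed.templates:
        if template.normal_name() in T_EX:
            return template.arguments[-1].value
    # ...otherwise fall back to the plain text of the wikitext line.
    return parsed.plain_text().strip()

print(extract_example("{{exemple|The sky is blue.}}"))   # -> The sky is blue.
print(extract_example("''The sky is blue.''"))           # -> The sky is blue.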
diff --git a/wikstraktor.py b/wikstraktor.py
index d7d8d1e..bf2317c 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -391,9 +391,13 @@ class ParserContext:
 
     def pop(self, testNewEntry = True):
         if testNewEntry:
-            self.create_entry()
+            self.create_entries()
         return self.context.pop()
 
+    def flush(self):
+        while len(self.context) > 0:
+            self.pop(True)
+
     def set_top_wiki(self, wiki_context):
         if len(self.context) == 0:
             self.push(wiki_context)
@@ -406,26 +410,32 @@ class ParserContext:
         else:
             self.context[-1][key] = entry_context
         if testNewEntry:
-            self.create_entry()
+            self.create_entries()
 
-#Problem somewhere in here
-    def create_entry(self):
-        #The keys dict never contains senses or POS
-        res = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
+    def create_entries(self):
+        #In the key dict there are traits that describe everything (ety, pro) and different entities (POS:senses)
+        tmp = {}
+        res = 0
+        pro = None
         for l in self.context:
-            if "pro" in l.keys():
-                res.set_pronunciations(l['pro'])
-            if "ety" in l.keys():
-                pass #We ignore etymology for now
-            if "POS" in l.keys():
-                res.set_pos(l['POS'])
-            if "senses" in l.keys():
-                res.set_senses(l['senses'])
-        # TODO: Add the other types
-        if res.is_valid() and res not in self.entries:
-            self.entries.append(res)
-        else:
-            res = None
+            for k,v in l.items():
+                if k == "pro":
+                    pro = v
+                elif k == "ety" or k == "wiki":
+                    #wiki context is not necessary
+                    pass #We ignore etymology for now
+                else:
+                    tmp[k]=v
+        if(pro!=None and len(tmp)>0):
+            for pos,senses in tmp.items():
+                e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
+                e.set_pronunciations(pro)
+                e.set_pos(pos)
+                e.set_senses(senses)
+                #an improvement would be to remove that sense from context, but we test not to add doubles
+                if e.is_valid() and e not in self.entries:
+                    res += 1
+                    self.entries.append(e)
         return res
 
     def debug_top(self):
@@ -442,6 +452,20 @@ class ParserContext:
         res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
         return res
 
+    def __str__(self):
+        res = ""
+        i=0
+        for c in self.context:
+            res += f"====={i}======\n"
+            for k,v in c.items():
+                if k!= "wiki":
+                    res+=f" {k}→{v}\n"
+                else:
+                    res+=f" {k}→{len(v)}\n"
+            i+=1
+        return res+f"nb of entries: {len(self.entries)}"
+
+
 class Wikstraktor:
 
     @classmethod
@@ -500,8 +524,9 @@ class Wikstraktor:
                     self.parserContext.push(s)
                 else:
                     while self.parserContext.get_level() > s.level:
-                        self.parserContext.pop()
+                        self.parserContext.pop(True)
                     self.parserContext.set_top_wiki(s)
+                #get section title
                 stitle = self.wtp.parse(s.title).templates
                 if stitle == []:
                     stitle = s.title
@@ -511,14 +536,12 @@ class Wikstraktor:
                     self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
                 elif self.isEty(stitle):
                     self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
-#                elif stitle in self.constants['POS'].keys():
                 else:
+                    #Edit to process other types of sections
                     pos = self.process_POS(stitle)
                     if pos != None :
-                        self.parserContext.set_top_entry_info('POS', pos, False)
-                        self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #this line is the problem
-                        # self.parserContext.entries does not grow even when we enter the if above.
-
+                        self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents)))
+            self.parserContext.flush()
             res = len(self.parserContext.entries)
             if res > 0:
                 for e in self.parserContext.entries:
@@ -539,6 +562,7 @@ class Wikstraktor:
             res = title in self.constants['ety']
         return res
 
+    #recognizes POS and returns None if it can't
     def process_POS(self, parsedwikitext):
         pass#in subclass
 
@@ -548,6 +572,12 @@ class Wikstraktor:
     def process_etymology(self, parsedwikitext):
         pass#in subclass
 
+    def process_example(self, example_wiki_text):
+        pass#in subclass
+
+    def process_definition(self, definition, sub_items, def_level = True):
+        pass#in subclass
+
     def process_senses(self, parsedwikitext):
         pass#in subclass
 
diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite
new file mode 100644
index 0000000000000000000000000000000000000000..b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90
GIT binary patch
literal 8192
zcmeI#K}*9h6bJBR2-1P#&1>i`3gX4HU~QvJx~g5uyp_meE$bTTq8~u~V16`Tl)^+*
z@F@RBLz}$!O8DL0^g6Ab5qGv*YA24^4&$7i2*DWRv<9^L@cWH^LvO$TEjT-zU2IVm
z-+pEEARqt%2tWV=5P$##AOHafKmY=N5_mr5o4cbCe~FyFZOp7*tX!uTjjh%`2XQVV
zC54JkQz^o=Oepq-L6~S~!XTj%HJ->Ik!MknDiIfXE;Cj7!>NiUR|izvH#^&vPo`Vd
zwhaR%Z<NR`X%%VuUS+#^+32=<)D`{uYrbC8KjwYEAIiV$eK0hzx2APvJr9F%rsTQI
zKV2c#O=G%pY23Y~5`P7Mv)Hzd+CC13`+hd3p&tPO2tWV=5P$##AOHafKmY;|fWSW%
Fcmp61Xm<br

literal 0
HcmV?d00001

--
GitLab
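On the wikstraktor.py side, create_entry() becomes create_entries(): the parser context is no longer squeezed into a single hard-coded 'POS'/'senses' pair. Instead, 'pro' (and 'ety'/'wiki') are treated as traits shared by the whole section stack, every other key is read as a part of speech mapped to its senses, and one entry is emitted per POS; the new flush() drains the context at the end of a page so the last sections are not lost. Below is a rough, self-contained illustration of that grouping logic — plain dicts and invented sample data, not the real Entry/ParserContext classes.

# Invented sample context: one pronunciation level plus two POS levels (=== Adj ===, === Noun ===).
context = [
    {"wiki": "<language section>", "pro": ["/bluː/"]},
    {"wiki": "<Adjective section>", "adjective": ["sense 1"]},
    {"wiki": "<Noun section>", "noun": ["sense A", "sense B"]},
]

pro = None
pos_senses = {}
for level in context:
    for key, value in level.items():
        if key == "pro":
            pro = value                 # shared trait, applies to every entry
        elif key in ("ety", "wiki"):
            continue                    # etymology and wiki context are skipped
        else:
            pos_senses[key] = value     # everything else is a POS -> senses pair

entries = []
if pro is not None and pos_senses:
    for pos, senses in pos_senses.items():
        entries.append({"pos": pos, "pronunciations": pro, "senses": senses})

print(len(entries), "entries created")  # -> 2 entries created

With the pronunciation shared across the two POS levels, the sketch yields one entry for the adjective and one for the noun, which matches what the reworked create_entries() now reports through its count-valued return.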