#!/usr/bin/env python3
"""Wikstraktor parser for English entries read from the French Wiktionary."""
from wikstraktor import Wikstraktor, Pronunciation, Sense

from parsers.fr_constants import string_values

# Number of times process_etymology() has been called; only used to build the
# placeholder value that method returns.
debugEty = 0


class Fr_en_straktor(Wikstraktor):
    """Extractor configured with wiki language "fr" and entry language "en".

    Helpers such as ``self.pwb``, ``self.wtp`` and ``self.get_file_url`` are
    not defined here; they are presumably provided by ``Wikstraktor``
    (NOTE(review): confirm against the base class).
    """

    def __init__(self):
        super().__init__()
        self.wiki_language = "fr"
        self.entry_language = "en"
        self.constants = string_values
        # Resolves to 'wiktionary:fr', the same value as before.
        self.site = self.pwb.Site(f'wiktionary:{self.wiki_language}')

    def process_pronunciation(self, proContent):
        """Build ``Pronunciation`` objects from the first list of a section.

        Only templates whose normalised name equals ``constants['t_snd']`` are
        used: the value of their last argument is resolved with
        ``get_file_url`` and stored as a sound, and their first argument that
        contains no ``=`` is stored as the accent.

        Args:
            proContent: parsed pronunciation section exposing ``get_lists()``.

        Returns:
            A list of ``Pronunciation`` objects that have both an accent and
            at least one sound. Empty when the section contains no list.
        """
        # TODO: only works for two-level lists; see "water" for three levels.
        # TODO: transcriptions are not extracted yet (an earlier draft looked
        # at constants['t_acc'] templates for that).
        lists = proContent.get_lists()
        if not lists:
            return []
        sound_template = self.constants['t_snd']
        pronunciations = []
        for full_item in lists[0].fullitems:
            p = Pronunciation()
            for t in self.wtp.parse(full_item).templates:
                if t.normal_name() != sound_template:
                    continue
                p.add_sound(self.get_file_url(t.arguments[-1].value))
                # First positional argument, i.e. the first one without "=".
                # Using a default avoids running past the end of the argument
                # list when every argument is named.
                accent = next(
                    (arg for arg in t.arguments if "=" not in arg), None
                )
                if accent is not None:
                    # NOTE(review): the argument object itself is stored, not
                    # its ``.value`` -- confirm what set_accent() expects.
                    p.set_accent(accent)
                if p.accent is not None and p.sounds != []:
                    pronunciations.append(p)
                    p = Pronunciation()
        return pronunciations

    def process_etymology(self, etyContent):
        """Return a placeholder string; ``etyContent`` is currently ignored.

        The result is "Etymology" followed by the number of calls made so far
        (tracked in the module-level ``debugEty`` counter).
        """
        global debugEty
        debugEty += 1
        return "Etymology" + str(debugEty)

    def _plain(self, wikitext):
        """Return ``wikitext`` parsed, converted to plain text and stripped."""
        return self.wtp.parse(wikitext).plain_text().strip()

    def _add_examples(self, sense, example_list):
        """Add one example to ``sense`` for every item of ``example_list``.

        When an item contains a template whose normalised name is listed in
        ``constants['t_ex']``, the value of that template's last argument is
        used; otherwise the plain text of the whole item is used.
        """
        example_templates = self.constants['t_ex']
        for item in example_list.items:
            parsed = self.wtp.parse(item)  # parse once per item
            template = next(
                (t for t in parsed.templates
                 if t.normal_name() in example_templates),
                None,
            )
            if template is not None:
                # Read the template that actually matched, not templates[0].
                sense.add_example(template.arguments[-1].value)
            else:
                sense.add_example(parsed.plain_text().strip())

    def process_senses(self, entry, pos, sensesContent):
        """Build the senses (and sub-senses) of one part-of-speech section.

        Lists are dispatched on their pattern:

        * ``'# '``  -- each item starts a sense with id ``{entry}_{pos}_{n}``;
        * ``'#:'``  -- examples for the current sense;
        * ``'## '`` -- each item starts a sub-sense with id
          ``{entry}_{pos}_{n}_{m}``;
        * ``'##:'`` -- examples for the current sub-sense.

        Args:
            entry: entry identifier used as the first part of each sense id.
            pos: part-of-speech identifier used as the second part of the id.
            sensesContent: parsed section exposing ``get_lists(patterns)``.

        Returns:
            The list of ``Sense`` objects in the order their definitions were
            encountered, each one appearing exactly once.
        """
        baseId = f"{entry}_{pos}_"
        lists = sensesContent.get_lists(('\\# ', '\\#:', '\\## ', '\\##:'))
        senses = []
        nombreDef = 0
        nombreSousDef = 0
        sense = None      # sense that '#:' examples are attached to
        sub_sense = None  # sub-sense that '##:' examples are attached to
        for wiki_list in lists:
            pattern = wiki_list.pattern
            if pattern == '\\# ':
                # Consecutive definitions may be grouped in a single list, so
                # every item gets its own sense.
                for item in wiki_list.items:
                    nombreDef += 1
                    nombreSousDef = 0
                    sub_sense = None
                    sense = Sense(f"{baseId}{nombreDef}")
                    sense.add_def(self.wiki_language, self._plain(item))
                    # Appended at creation: examples added later mutate the
                    # same object, and no sense is ever appended twice.
                    senses.append(sense)
            elif pattern == '\\#:':
                # Examples preceding any definition have no sense to go to.
                if sense is not None:
                    self._add_examples(sense, wiki_list)
            elif pattern == '\\## ':
                for item in wiki_list.items:
                    nombreSousDef += 1
                    sub_sense = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
                    sub_sense.add_def(self.wiki_language, self._plain(item))
                    senses.append(sub_sense)
            elif pattern == '\\##:':
                # sub_sense is reset on every new definition so that examples
                # are never attached to a sub-sense of a previous definition.
                if sub_sense is not None:
                    self._add_examples(sub_sense, wiki_list)
        return senses


if __name__ == "__main__":
    ensk = Fr_en_straktor()
    print(ensk.fetch("test"), "entries added")