#!/usr/bin/env python3
"""Wikstraktor parser for English entries of the English Wiktionary."""
from wikstraktor import Wikstraktor, Pronunciation, Sense

# NOTE(review): the patch this module was recovered from targets
# parsers/en_constants.py, i.e. the very module imported here. If this code
# really lives in that file the import is circular -- confirm the location.
from parsers.en_constants import string_values

# Counter used by the placeholder etymology processing to number its results.
debugEty = 0


class En_en_straktor(Wikstraktor):
    """Extract pronunciations and senses from English Wiktionary wikitext.

    Template and section names are looked up in ``self.constants`` (the
    ``string_values`` mapping), wikitext is parsed through ``self.wtp``.
    """

    def __init__(self):
        """Configure the parser for the ``en`` wiki and ``en`` entries."""
        super().__init__()
        self.wiki_language = "en"
        self.entry_language = "en"
        self.constants = string_values
        # Same value as the former hard-coded 'wiktionary:en'.
        self.site = self.pwb.Site(f"wiktionary:{self.wiki_language}")

    def process_pronunciation(self, proContent):
        """Build ``Pronunciation`` objects from the first list of a section.

        Each list item is scanned template by template: accent templates set
        the accent applied to what follows, IPA templates set the
        transcription, audio templates add a sound. A pronunciation is stored
        at the end of the item or just before a new accent template, provided
        it has a transcription or an accent.

        Args:
            proContent: parsed wikitext of the pronunciation section.

        Returns:
            The list of pronunciations found; empty when the section contains
            no list.
        """
        # TODO: only works for 2-level lists; see "water" for 3 levels.
        lists = proContent.get_lists()
        if not lists:
            return []
        pronunciations = []
        for fullitem in lists[0].fullitems:
            current = Pronunciation()
            templates = self.wtp.parse(fullitem).templates
            accent = None
            last = len(templates) - 1
            for j, tpl in enumerate(templates):
                name = tpl.normal_name()
                # Bounds-checked look-ahead: an accent template may be the
                # last template of the item.
                next_is_accent = (
                    j < last
                    and templates[j + 1].normal_name()
                    == self.constants['t_acc']
                )
                if name == self.constants['t_acc'] and not next_is_accent:
                    # Only the last accent of a run of accents is kept.
                    accent = tpl.arguments[0].value
                elif name == self.constants['t_ipa']:
                    current.set_transcription(tpl.arguments[1].value)
                    current.set_accent(accent)
                elif name == self.constants['t_snd']:
                    current.add_sound(
                        self.get_file_url(tpl.arguments[1].value), accent
                    )
                if j == last or next_is_accent:
                    if current.ipa is not None or current.accent is not None:
                        pronunciations.append(current)
                        # NOTE(review): the original indentation was lost;
                        # this assumes a fresh Pronunciation is started only
                        # after one was stored -- confirm.
                        current = Pronunciation()
        return pronunciations

    def process_etymology(self, etyContent) -> str:
        """Return a numbered placeholder; ``etyContent`` is not read yet."""
        global debugEty
        debugEty += 1
        return "Etymology" + str(debugEty)

    def process_senses(self, entry, pos, sensesContent):
        """Build ``Sense`` objects from the first list of a senses section.

        Every top-level item yields a sense with id ``{entry}_{pos}_{i}``.
        Items of its first ``##`` sub-list yield additional senses with id
        ``{entry}_{pos}_{i}{cnt}``, but only when their first template is the
        definition-label template.

        Args:
            entry: entry name, used as id prefix.
            pos: part of speech, used as id prefix.
            sensesContent: parsed wikitext of the senses section.

        Returns:
            The list of senses, sub-senses following their parent; empty when
            the section contains no list.
        """
        baseId = f"{entry}_{pos}_"
        # Only the first list of the section is read.
        lists = sensesContent.get_lists()
        if not lists:
            return []
        senses = []
        for i, fullitem in enumerate(lists[0].fullitems):
            sense = Sense(f"{baseId}{i}")
            item = self.wtp.parse(fullitem)
            templates = item.templates

            # Position of the first definition-label template, if any.
            label_idx = None
            for idx, tpl in enumerate(templates):
                if tpl.normal_name() == self.constants['t_deflabel']:
                    label_idx = idx
                    break
            if label_idx is not None:
                # The second parameter could be used for a comment.
                sense.set_domain(templates[label_idx].arguments[-1].value)
            definition = self.wtp.parse(item.get_lists()[0].items[0])
            sense.add_def(self.wiki_language, definition.plain_text().strip())
            if label_idx is not None:
                # Examples are the example templates that directly follow
                # the label, up to the first template of another kind.
                for tpl in templates[label_idx + 1:]:
                    if tpl.normal_name() != self.constants['t_ex']:
                        break
                    sense.add_example(tpl.arguments[1].value)
            senses.append(sense)

            sublists = item.get_lists(pattern='##')
            if sublists:
                sub = sublists[0]
                pairs = zip(sub.items, sub.fullitems)
                for cnt, (sub_item, sub_fullitem) in enumerate(pairs):
                    parsed = self.wtp.parse(sub_item)
                    sub_templates = parsed.templates
                    # Sub-items without a leading label are skipped; this
                    # now includes sub-items that have no template at all.
                    if (
                        not sub_templates
                        or sub_templates[0].normal_name()
                        != self.constants['t_deflabel']
                    ):
                        continue
                    sub_sense = Sense(f"{baseId}{i}{cnt}")
                    # The second parameter could be used for a comment.
                    sub_sense.set_domain(sub_templates[0].arguments[-1].value)
                    sub_sense.add_def(
                        self.wiki_language, parsed.plain_text().strip()
                    )
                    for tpl in self.wtp.parse(sub_fullitem).templates:
                        if tpl.normal_name() == self.constants['t_ex']:
                            sub_sense.add_example(tpl.arguments[-1].value)
                    senses.append(sub_sense)
            # TODO: process examples (only example templates are handled).
        return senses


if __name__ == "__main__":
    ensk = En_en_straktor()
    print(ensk.fetch("test"), "entries added")