"""String constants for the English Wiktionary parser.

``string_values`` is read by the parser through ``self.constants[...]`` to
recognise template names; it also carries section/language names and a
part-of-speech abbreviation table.
"""

string_values = {
    # Presumably the titles of the corresponding page sections -- TODO confirm
    # against the caller; their use is not visible from this module.
    "ety": "Etymology",
    "pro": "Pronunciation",
    # Presumably language names as they appear on the wiki -- TODO confirm.
    "en": "English",
    "fr": "French",
    # Template names, compared against the normalised name of each template
    # found in the parsed wikitext.
    "t_ipa": "IPA",  # template for transcription
    "t_snd": "audio",  # template for audio
    "t_acc": "a",  # template for accents
    "t_deflabel": "lb",  # label template; its last argument is used as the sense domain
    "t_ex": "ux",  # usage-example template attached to a sense
    # Part-of-speech names mapped to short tags.
    # See https://en.wiktionary.org/wiki/Wiktionary:POS
    "POS": {
        "Adjective": "Adj",
        "Adverb": "Adv",
        "Ambiposition": "Ambip",
        "Article": "Art",
        "Circumposition": "Circump",
        "Classifier": "Class",
        "Conjunction": "Conj",
        "Contraction": "Cont",
        "Counter": "Count",
        "Determiner": "Det",
        "Ideophone": "Ideophone",
        "Interjection": "Interj",
        "Noun": "N",
        "Numeral": "Num",
        "Participle": "Part",
        "Particle": "Particle",
        "Postposition": "Postp",
        "Preposition": "Prep",
        "Pronoun": "Pro",
        "Proper noun": "NP",
        "Verb": "V",  # TODO: complete this list
    },
}