Skip to content
Snippets Groups Projects
Commit 1f3789ad authored by Enzo Simonnet
Browse files

Replace en_constants.py

parent 86daa52a
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
from wikstraktor import Wikstraktor, Pronunciation, Sense
from parsers.en_constants import string_values
# Module-level counter; process_etymology increments it to number the
# placeholder etymology labels it returns.
debugEty = 0
class En_en_straktor(Wikstraktor):
    """Wikstraktor parser for English entries on the English Wiktionary.

    Template and section names are looked up in ``self.constants``
    (the ``string_values`` mapping).
    """

    def __init__(self):
        """Set the wiki/entry languages, the constants and the wiki site."""
        super().__init__()
        self.wiki_language = "en"
        self.entry_language = "en"
        self.constants = string_values
        self.site = self.pwb.Site(f"wiktionary:{self.wiki_language}")

    def process_pronunciation(self, proContent):
        """Extract pronunciations from a parsed pronunciation section.

        Only the first wiki list of ``proContent`` is read. Within each list
        item, templates are scanned in order: accent templates set the
        current accent, IPA templates set the transcription, audio templates
        add a sound file.

        Returns:
            A list of ``Pronunciation`` objects; empty when the section
            contains no list.
        """
        # TODO: only works for 2-level lists; see "water" for a 3-level case
        lists = proContent.get_lists()
        if not lists:
            return []

        t_acc = self.constants['t_acc']
        t_ipa = self.constants['t_ipa']
        t_snd = self.constants['t_snd']

        pronunciations = []
        for fullitem in lists[0].fullitems:
            p = Pronunciation()
            templates = self.wtp.parse(fullitem).templates
            names = [t.normal_name() for t in templates]
            accent = None
            for j, (t, name) in enumerate(zip(templates, names)):
                is_last = j == len(templates) - 1
                # Bounds-safe lookahead: an accent template in last position
                # used to raise IndexError on templates[j+1].
                next_name = None if is_last else names[j + 1]

                if name == t_acc and next_name != t_acc:
                    accent = t.arguments[0].value
                elif name == t_ipa:
                    p.set_transcription(t.arguments[1].value)
                    p.set_accent(accent)
                elif name == t_snd:
                    p.add_sound(self.get_file_url(t.arguments[1].value), accent)

                # A pronunciation ends at the last template of the item or
                # right before the next accent template.
                if is_last or next_name == t_acc:
                    if p.ipa is not None or p.accent is not None:
                        pronunciations.append(p)
                        # NOTE(review): nesting of this reset was
                        # reconstructed from flattened source -- confirm it
                        # belongs inside the inner condition.
                        p = Pronunciation()
        return pronunciations

    def process_etymology(self, etyContent):
        """Return a placeholder label ("Etymology<n>") for an etymology section.

        ``etyContent`` is not inspected; the module-level ``debugEty``
        counter is incremented on every call and appended to the label.
        """
        global debugEty
        debugEty += 1
        return "Etymology" + str(debugEty)

    def process_senses(self, entry, pos, sensesContent):
        """Extract senses (and labelled sub-senses) from a definition list.

        Args:
            entry: entry identifier used as the first part of each sense id.
            pos: part-of-speech tag used as the second part of each sense id.
            sensesContent: parsed wikitext; only its first list is read.

        Returns:
            A list of ``Sense`` objects; empty when there is no list.
        """
        lists = sensesContent.get_lists()
        if not lists:
            return []

        base_id = f"{entry}_{pos}_"
        t_deflabel = self.constants['t_deflabel']
        t_ex = self.constants['t_ex']

        senses = []
        for i, fullitem in enumerate(lists[0].fullitems):
            sense = Sense(f"{base_id}{i}")
            li = self.wtp.parse(fullitem)
            templates = li.templates

            # Locate the first label template; j == len(templates) if none.
            j = 0
            while j < len(templates) and templates[j].normal_name() != t_deflabel:
                j += 1
            if j < len(templates):
                # We could use the second parameter for a comment
                sense.set_domain(templates[j].arguments[-1].value)

            definition = self.wtp.parse(li.get_lists()[0].items[0])
            sense.add_def(self.wiki_language, definition.plain_text().strip())

            # Example templates are only collected when they directly follow
            # the label template found above.
            while j < len(templates) - 1 and templates[j + 1].normal_name() == t_ex:
                sense.add_example(templates[j + 1].arguments[1].value)
                j += 1
            senses.append(sense)

            sublists = li.get_lists(pattern='##')
            if sublists:
                sublist = sublists[0]
                for cnt, item in enumerate(sublist.items):
                    parsed_item = self.wtp.parse(item)
                    item_templates = parsed_item.templates
                    # Skip sub-items that do not start with a label template;
                    # an item without any template used to raise IndexError.
                    if not item_templates or item_templates[0].normal_name() != t_deflabel:
                        continue
                    # NOTE(review): ids concatenate i and cnt without a
                    # separator, so e.g. (1, 11) and (11, 1) collide.
                    sub_sense = Sense(f"{base_id}{i}{cnt}")
                    # We could use the second parameter for a comment
                    sub_sense.set_domain(item_templates[0].arguments[-1].value)
                    sub_sense.add_def(self.wiki_language, parsed_item.plain_text().strip())
                    for tpl in self.wtp.parse(sublist.fullitems[cnt]).templates:
                        if tpl.normal_name() == t_ex:
                            sub_sense.add_example(tpl.arguments[-1].value)
                    senses.append(sub_sense)
            # TODO: process examples
        return senses
# Script entry point: fetch the "test" entry and report what fetch() returned.
if __name__ == "__main__":
    ensk = En_en_straktor()
    print(ensk.fetch("test"), "entries added")
# Section titles, template names and part-of-speech abbreviations used to
# parse the English Wiktionary.
string_values = {
    # Section titles and language names
    "ety": "Etymology",
    "pro": "Pronunciation",
    "en": "English",
    "fr": "French",
    # Template names
    "t_ipa": "IPA",  # template for transcription
    "t_snd": "audio",  # template for audio
    "t_acc": "a",  # template for accents
    "t_deflabel": "lb",
    "t_ex": "ux",
    # Part-of-speech heading -> abbreviation
    # https://en.wiktionary.org/wiki/Wiktionary:POS
    "POS": {
        "Adjective": "Adj",
        "Adverb": "Adv",
        "Ambiposition": "Ambip",
        "Article": "Art",
        "Circumposition": "Circump",
        "Classifier": "Class",
        "Conjunction": "Conj",
        "Contraction": "Cont",
        "Counter": "Count",
        "Determiner": "Det",
        "Ideophone": "Ideophone",
        "Interjection": "Interj",
        "Noun": "N",
        "Numeral": "Num",
        "Participle": "Part",
        "Particle": "Particle",
        "Postposition": "Postp",
        "Preposition": "Prep",
        "Pronoun": "Pro",
        "Proper noun": "NP",
        "Verb": "V",  # TODO: complete this list
    },
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment