From dbf616628a20fdbba07ce08873605f8b21edec93 Mon Sep 17 00:00:00 2001
From: Enzo Simonnet <enzosim@laposte.net>
Date: Thu, 20 Oct 2022 08:32:25 +0000
Subject: [PATCH] def + exemples ok

---
 parsers/en_constants.py | 113 ++++++++++++++++++++++++++++------------
 1 file changed, 80 insertions(+), 33 deletions(-)

diff --git a/parsers/en_constants.py b/parsers/en_constants.py
index 7b161f5..32c9392 100644
--- a/parsers/en_constants.py
+++ b/parsers/en_constants.py
@@ -1,33 +1,80 @@
-string_values = {
-	"ety":"Etymology",
-	"pro":"Pronunciation",
-	"en":"English",
-	"fr":"French",
-	"t_ipa":"IPA", #template for transcription
-	"t_snd":"audio", #template for audio
-	"t_acc":"a", #template for accents
-	"t_deflabel":"lb",
-	"POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS
-		"Adjective":"Adj",
-		"Adverb":"Adv",
-		"Ambiposition":"Ambip",
-		"Article":"Art",
-		"Circumposition":"Circump",
-		"Classifier":"Class",
-		"Conjunction":"Conj",
-		"Contraction":"Cont",
-		"Counter":"Count",
-		"Determiner":"Det",
-		"Ideophone":"Ideophone",
-		"Interjection":"Interj",
-		"Noun":"N",
-		"Numeral":"Num",
-		"Participle":"Part",
-		"Particle":"Particle",
-		"Postposition":"Postp",
-		"Preposition":"Prep",
-		"Pronoun":"Pro",
-		"Proper noun":"NP",
-		"Verb":"V" # TODO: compléter
-	}
-}
+#!/usr/bin/env python3
+from wikstraktor import Wikstraktor, Pronunciation, Sense
+
+from parsers.en_constants import string_values
+
+debugEty = 0 # debug counter, incremented by process_etymology on every call
+
+class En_en_straktor(Wikstraktor):
+	def __init__(self):
+		"""Set both languages to "en", attach the English constants table and open the wiktionary:en site."""
+		super().__init__()
+		self.wiki_language = self.entry_language = "en"
+		self.constants = string_values
+		self.site = self.pwb.Site('wiktionary:en')
+
+	def process_pronunciation(self, proContent):
+		"""Return the Pronunciation objects built from the templates of proContent's first list (empty if none)."""
+		# TODO: only works for 2-level lists, see "water" for 3 levels
+		lists = proContent.get_lists()
+		pronunciations = []
+		for item in (lists[0].fullitems if lists else []):
+			p = Pronunciation()
+			templates = self.wtp.parse(item).templates
+			a = None
+			for j, t in enumerate(templates):
+				nxt = templates[j+1].normal_name() if j+1 < len(templates) else None # None after the last template (no IndexError)
+				if t.normal_name() == self.constants['t_acc'] and nxt != self.constants['t_acc']:
+					a = t.arguments[0].value # only the last of consecutive accent templates is kept
+				elif t.normal_name() == self.constants['t_ipa']:
+					p.set_transcription(t.arguments[1].value)
+					p.set_accent(a)
+				elif t.normal_name() == self.constants['t_snd']:
+					p.add_sound(self.get_file_url(t.arguments[1].value), a)
+				if nxt is None or nxt == self.constants['t_acc']: # end of the current group of templates
+					if p.ipa is not None or p.accent is not None:
+						pronunciations.append(p)
+						p = Pronunciation()
+		return pronunciations
+
+	def process_etymology(self, etyContent): # placeholder: etyContent is not parsed yet
+		global debugEty
+		debugEty = debugEty + 1 # module-level call counter
+		return f"Etymology{debugEty}" # "Etymology" followed by the updated counter
+
+	def process_senses(self, entry, pos, sensesContent):
+		"""Return Sense objects for the labelled items of sensesContent's first list and their labelled "##" sub-items."""
+		baseId = f"{entry}_{pos}_" # entry and pos are only used to build the sense ids
+		lists = sensesContent.get_lists()
+		senses = []
+		for i, fullitem in enumerate(lists[0].fullitems if lists else []):
+			li = self.wtp.parse(fullitem)
+			templates = li.templates
+			j = next((n for n, t in enumerate(templates) if t.normal_name() == self.constants['t_deflabel']), None) # first label template
+			if j is None: # items without a label template are skipped
+				continue
+			newSense = Sense(f"{baseId}{i}")
+			newSense.set_domain(templates[j].arguments[-1].value)#We could use the second parameter for a comment
+			newSense.add_def(self.wiki_language, self.wtp.parse(li.get_lists()[0].items[0]).plain_text().strip())
+			while j < len(templates)-1 and templates[j+1].normal_name() == self.constants['t_ex']:
+				newSense.add_example(templates[j+1].arguments[1].value)
+				j += 1
+			senses.append(newSense)
+			sublists = li.get_lists(pattern = '##')
+			for cnt, k in enumerate(sublists[0].items if sublists else []):
+				subTemplates = self.wtp.parse(k).templates
+				# Skip unlabelled sub-items (as at top level) so no stale or unbound sub-sense is ever appended.
+				if not subTemplates or subTemplates[0].normal_name() != self.constants['t_deflabel']:
+					continue
+				newSense2 = Sense(f"{baseId}{i}{cnt}")
+				newSense2.set_domain(subTemplates[0].arguments[-1].value)#We could use the second parameter for a comment
+				newSense2.add_def(self.wiki_language, self.wtp.parse(k).plain_text().strip())
+				for a in self.wtp.parse(sublists[0].fullitems[cnt]).templates:
+					if a.normal_name() == self.constants['t_ex']:
+						newSense2.add_example(a.arguments[-1].value)
+				senses.append(newSense2)
+		return senses
+
+if __name__ == "__main__": # manual run: fetch the entry "test" and report the result
+	straktor = En_en_straktor()
+	print(straktor.fetch("test"), "entries added")
-- 
GitLab