Merge branch 'main' of https://gitlab.liris.cnrs.fr/lex-game/wikstraktor

ca4c97d8 · Mathieu Loiseau · 22577fb7 · 7b7eed90 · ca4c97d8 · ca4c97d8
Commit ca4c97d8 authored 2 years ago by Mathieu Loiseau
--- a/README.md
+++ b/README.md
 wikstraktor
 ===========
-A python tool to query the [wiktionary](https://wiktionary.org) and extract structured lexical data.
+A python tool to query the [wiktionary](https://wiktionary.org) and extract [structured lexical data](https://gitlab.liris.cnrs.fr/lex-game/wikstraktor/-/wikis/Entry-structure).
 ## Dependencies
 This project does depend on python packages.

--- a/parsers/en_en.py
+++ b/parsers/en_en.py
@@ -17,6 +17,7 @@ class En_en_straktor(Wikstraktor):
 		# TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux
 		l = proContent.get_lists()[0]
 		i = 0
+		cpt = 0
 		pronunciations = []
 		while i < len(l.fullitems):
 			p = Pronunciation()
@@ -32,6 +33,8 @@ class En_en_straktor(Wikstraktor):
 					p.add_sound(self.get_file_url(t.arguments[1].value), a)
 				if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] :
 					if p.ipa != None or p.accent != None:
+						cpt += 1
+						p.id= f"p_{cpt}"
 						pronunciations.append(p)
 						p = Pronunciation()
 			i += 1
@@ -57,19 +60,27 @@ class En_en_straktor(Wikstraktor):
 		while i < len(l):
 			if l[i].pattern == '\\# ':
 				nombreDef += 1
+<<<<<<< HEAD
 				newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
 				newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+=======
+				newSense = Sense(f"{baseId}{nombreDef}")
+				newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}")
+				newSense.add_translation(f"t_{nombreDef}_0")
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 			elif l[i].pattern == '\\#:':
+				cptEx=0
 				for j in l[i].items:
 					k = 0
 					isEx = 0
 					while k < len(self.wtp.parse(j).templates) and isEx == 0 :
 						if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-							newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+							cptEx +=1
+							newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}")
 							isEx = 1
 						k += 1
 					if isEx == 0:
-						newSense.add_example(self.wtp.parse(j).plain_text().strip())
+						newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}")
 			if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
 				senses.append(newSense)
 			cnt = 0
@@ -78,19 +89,27 @@ class En_en_straktor(Wikstraktor):
 				cnt +=1
 				if l[i].pattern == '\\## ':
 					nombreSousDef += 1
+<<<<<<< HEAD
 					newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
 					newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+=======
+					newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
+					newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}")
+					newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0")
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 				elif l[i].pattern == '\\##:':
+					cptex2 = 0
 					for j in l[i].items:
 						k = 0
 						isEx = 0
 						while k < len(self.wtp.parse(j).templates) and isEx == 0 :
 							if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-								newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
+								cptex2 +=1
+								newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
 								isEx = 1
 							k += 1
 						if isEx == 0:
-							newSense2.add_example(self.wtp.parse(j).plain_text().strip())
+							newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
 				if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
 					newSense.add_subsense(newSense2)
 				i += 1

--- a/parsers/fr_en.py
+++ b/parsers/fr_en.py
@@ -53,7 +53,7 @@ class Fr_en_straktor(Wikstraktor):
 				keys = list(self.constants['POS'].keys())
 				pos = keys[ik]
 			ik += 1
-# 		print(pos)
+		print(pos)
 		return pos
 	def process_senses(self, entry, pos, sensesContent):
@@ -64,9 +64,17 @@ class Fr_en_straktor(Wikstraktor):
 		nombreDef = 0
 		while i < len(l):
 			if l[i].pattern == '\\# ':
+<<<<<<< HEAD
 				nombreDef += 1
 				newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
 				newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
+=======
+				#A revoir ça, très douteux
+				for nbDef in l[i].items :
+					nombreDef += 1
+					newSense = Sense(f"{baseId}{nombreDef}")
+					newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip())
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 			elif l[i].pattern == '\\#:':
 				for j in l[i].items:
 					k = 0

--- a/test_wikstraktor.py
+++ b/test_wikstraktor.py
@@ -5,7 +5,7 @@ if __name__ == "__main__":
 	# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
 	# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
 	#e.fetch("water")
-	f.fetch("blue")
+	f.fetch("water")
 	# print(e.fetch("test"), "entries added")
 	#print(e)
 	file_path = 'test.json'
@@ -22,3 +22,5 @@ if __name__ == "__main__":
 	# print(p.get_file_url())
 	#print(e)
 	#Entry("test", wtp.parse(page.text)))
+	# PRENDS PAS LE FICHIER AUDIO POUR "LIVE" EN_EN
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -19,7 +19,7 @@ class Sound:
 		if self.accent == None:
 			res = {"url":self.url}
 		else:
-			res = {"accent":self.accent, "url":self.url}
+			res = { "accent":self.accent, "url":self.url}
 		return res
 class Pronunciation:
@@ -27,6 +27,7 @@ class Pronunciation:
 		self.ipa = None
 		self.sounds = []
 		self.accent = None
+		self.id = None
 	def set_transcription(self, tscpt):
 		self.ipa = tscpt
@@ -42,9 +43,9 @@ class Pronunciation:
 		for s in self.sounds:
 			snds.append(s.serializable())
 		if self.accent == None:
-			res = {"transcript":self.ipa, "sounds":snds}
+			res = {"ID":self.id, "transcript":self.ipa, "sounds":snds}
 		else:
-			res = {"accent":self.accent, "transcript":self.ipa, "sounds":snds}
+			res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds}
 		return res
 	def __str__(self):
@@ -70,7 +71,12 @@ class Pronunciation:
 #######
 class Definition:
+<<<<<<< HEAD
 	def __init__(self, lang, text, id=None):
+=======
+	def __init__(self, lang, text, id):
+		self.id = id
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 		self.lang = lang
 		self.text = text
 		self.id = id
@@ -81,6 +87,7 @@ class Definition:
 	def __eq__(self, other):
 		return self.lang == other.lang and self.text == other.text
+<<<<<<< HEAD
 	def serializable(self, id = True):
 		res = {}
 		if id and self.id != None:
@@ -100,6 +107,21 @@ class Translation(Definition):
 class Example:
 	def __init__(self, transcript, id=None, source=None, url=None):
+=======
+	def serializable(self):
+		return {"ID":self.id, "lang":self.lang, "definition":self.text}
+class Translation():
+	def __init__(self, id, lang=None, text=None):
+		self.id = id
+		self.lang = lang
+		self.text = text
+	def serializable(self):
+		return {"ID:" : self.id, "lang":self.lang, "translation":self.text}
+class Example:
+	def __init__(self, transcript, id, source=None, url=None):
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 		self.text = transcript
 		self.source = source
 		self.url = url
@@ -108,6 +130,7 @@ class Example:
 	def __eq__(self, other):
 		return self.text==other.text and self.source==other.source and self.url==other.url
+<<<<<<< HEAD
 	def set_id(self, id):
 		self.id = id
@@ -116,6 +139,10 @@ class Example:
 			res = {"id":self.id, "example":self.text}
 		else:
 			res = {"example":self.text}
+=======
+	def serializable(self):
+		res = {"ID":self.id, "example":self.text}
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 		if self.source != None:
 			res["source"] = self.source
 		if self.url != None:
@@ -135,21 +162,30 @@ class Sense:
 	def set_domain(self, d):
 		self.domain = d
+<<<<<<< HEAD
 	def add_def(self, lang, definition):
 		theDef = Definition(lang, definition)
 		if theDef not in self.definitions:
 			theDef.set_id(f"{self.label}_def{len(self.definitions)}")
 			self.definitions.append(theDef)
+=======
-	def add_example(self, transcript, src=None, url=None):
+	def add_def(self, lang, definition, id):
-		theEx = Example(transcript, src, url)
+		theDef = Definition(lang, definition, id)
+		if self.definition == None:
+			self.definition = theDef
+		elif self.definition != theDef:
+			raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}")
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
+	def add_example(self, transcript, id, src=None, url=None):
+		theEx = Example(transcript, id, src, url)
 		if theEx not in self.examples:
 			theEx.set_id(f"{self.label}_ex{len(self.examples)}")
 			self.examples.append(theEx)
-	def add_translation(self, lang, translation):
+	def add_translation(self, id, lang=None, translation=None):
-		theTranslation = Translation(lang, translation)
+		theTranslation = Translation(id, lang, translation)
 		if theTranslation not in self.translations:
 			theTranslation.set_id(f"{self.label}_trad{len(self.translations)}")
 			self.translations.append(theTranslation)
@@ -180,6 +216,7 @@ class Sense:
 	def serializable(self, key = False):
 		res = {}
+<<<<<<< HEAD
 		if self.domain != None:
 			res["Domain"] = self.domain
 		if len(self.definitions) > 0:
@@ -198,6 +235,41 @@ class Sense:
 			res["Translations"] = []
 			for t in self.translations:
 				res["Translations"].append(t.serializable(key))
+=======
+		if key:
+			res[self.label]={}
+			if self.domain != None:
+				res[self.label]["Domain"] = self.domain
+			res[self.label]["Definition"] = self.definition.serializable()
+			if len(self.subsenses) > 0:
+				res[self.label]["Subsenses"] = []
+				for t in self.subsenses:
+					res[self.label]["Subsenses"].append(t.serializable())
+			if len(self.examples) > 0 :
+				res[self.label]["Examples"] = []
+				for e in self.examples:
+					res[self.label]["Examples"].append(e.serializable())
+			#if len(self.translations) > 0:
+			res[self.label]["Translations"] = []
+			for t in self.translations:
+				res[self.label]["Translations"].append(t.serializable())
+		else:
+			if self.domain != None:
+				res["Domain"] = self.domain
+			res["Definition"] = self.definition.serializable()
+			if len(self.subsenses) > 0:
+				res["Subsenses"] = {}
+				for t in self.subsenses:
+					res["Subsenses"][t.label]= t.serializable(key)
+			if len(self.examples) > 0 :
+				res["Examples"] = []
+				for e in self.examples:
+					res["Examples"].append(e.serializable())
+			#if len(self.translations) > 0:
+			res["Translations"] = []
+			for t in self.translations:
+				res["Translations"].append(t.serializable())
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 		return res
@@ -298,9 +370,14 @@ class ParserContext:
 			if testNewEntry:
 				self.create_entry()
+#Pb là dedans
 	def create_entry(self):
+<<<<<<< HEAD
 		#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS
 		res = Entry(self.lemma, self.lang)
+=======
+		res = Entry(self.lemma)
+>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
 		for l in self.context:
 			#print(l.keys())
 			if "pro" in l.keys():
@@ -403,7 +480,8 @@ class Wikstraktor:
 					pos = self.process_POS(stitle)
 					if pos != None :
 						self.parserContext.set_top_entry_info('POS', pos, False)
-						self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents)))
+						self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob
+		# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
 		res = len(self.parserContext.entries)
 		if res > 0:
 			for e in self.parserContext.entries: