From da8f8f5c5d9ac82c2ede2cc9f08c81a0f6f8efff Mon Sep 17 00:00:00 2001
From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>
Date: Sat, 25 Mar 2023 15:47:55 +0100
Subject: [PATCH] Improve stack to handle === Noun === === Adj === ===
 Pronunciations ===

---
 blue.json               |   0
 parsers/fr_constants.py |   6 ++-
 parsers/fr_en.py        |  86 +++++++++++++++++++++-------------------
 wikstraktor.py          |  80 +++++++++++++++++++++++++------------
 wikstraktor.sqlite      | Bin 0 -> 8192 bytes
 5 files changed, 105 insertions(+), 67 deletions(-)
 delete mode 100644 blue.json
 create mode 100644 wikstraktor.sqlite

diff --git a/blue.json b/blue.json
deleted file mode 100644
index e69de29..0000000
diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py
index cbf8212..73d6812 100644
--- a/parsers/fr_constants.py
+++ b/parsers/fr_constants.py
@@ -9,7 +9,9 @@ string_values = {
 "t_ipa":"pron", #template for transcription
 "t_snd":"écouter", #template for audio
 "t_acc":["US", "UK"], #template for accents
-		
+"sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns
+	{"def":"\\#", "ex":"\\#\\*", "add_subdef":"\\#"}
+],
 "POS":{
 	"adjectif":["adjectif","adjectif qualificatif","adj"],
 	"adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
@@ -77,5 +79,5 @@ string_values = {
 	"variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
 	"verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
 	"verbe":["verbe","verb"]
-     }
+	}
 }
diff --git a/parsers/fr_en.py b/parsers/fr_en.py
index fcaa931..6a18ac5 100644
--- a/parsers/fr_en.py
+++ b/parsers/fr_en.py
@@ -55,49 +55,55 @@ class Fr_en_straktor(Wikstraktor):
 			ik += 1
 		return pos
 
+	def process_example(self, example_wiki_text):
+		"""Return the text of one example list item.
+
+		If the item holds a template whose name is in constants['t_ex'], the value
+		of that template's last argument is returned; otherwise the stripped plain
+		text of the whole item is returned.
+		"""
+		parsed = self.wtp.parse(example_wiki_text) #parse once instead of at every access
+		for template in parsed.templates:
+			if template.normal_name() in self.constants['t_ex']:
+				#read the template that matched, not systematically the first one of the item
+				return template.arguments[-1].value
+		return parsed.plain_text().strip()
+
+	def process_definition(self, definition, sub_items, def_level = True):
+		"""Build a Sense (def_level True) or a SubSense (def_level False) from one definition item.
+		sub_items holds the lists nested under that item: examples, plus sub-definitions when def_level."""
+		patterns = self.constants['sense_pattern'][0]
+		text = self.wtp.parse(definition).plain_text().strip()
+		if def_level:
+			newSense = Sense(self.entry_language, text, self.wiki_language)
+			pattern_ex = patterns["ex"]
+			pattern_subdef = patterns["add_subdef"] + patterns["def"]
+		else:
+			#the text comes from the definition argument: item is only bound by the loops below
+			newSense = SubSense(self.entry_language, text, self.wiki_language)
+			pattern_subdef = None
+			pattern_ex = patterns["add_subdef"] + patterns["ex"]
+		for item_list in sub_items:
+			if item_list.pattern == pattern_ex:
+				for item in item_list.items:
+					#nested items (e.g. translations) could be read with item_list.sublists(<index of item>)
+					newSense.add_example(self.process_example(item))
+			if def_level and item_list.pattern == pattern_subdef:
+				for idx, item in enumerate(item_list.items):
+					newSense.add_subsense(self.process_definition(item, item_list.sublists(idx), False))
+		return newSense
+
 	def process_senses(self, sensesContent):
-		l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
-		i = 0
+		l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"]))
 		senses = []
-		while i < len(l):
-			if l[i].pattern == '\\# ':
-				newSense = Sense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
-				#la version d'enzo ajoute +ieurs defs (for i in l[i].items)
-			elif l[i].pattern == '\\#:':
-				for j in l[i].items:
-					k = 0
-					isEx = 0
-					while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-						if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-							newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-							isEx = 1
-						k += 1
-					if isEx == 0:
-						newSense.add_example(self.wtp.parse(j).plain_text().strip())
-			if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-				senses.append(newSense)
-			cnt = 0
-			while i < len(l) and l[i].level == 3 :
-				cnt +=1
-				if l[i].pattern == '\\## ':
-					newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
-				elif l[i].pattern == '\\##:':
-					for j in l[i].items:
-						k = 0
-						isEx = 0
-						while k < len(self.wtp.parse(j).templates) and isEx == 0 :
-							if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
-								newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
-								isEx = 1
-							k += 1
-						if isEx == 0:
-							newSense2.add_example(self.wtp.parse(j).plain_text().strip())
-				if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
-					newSense.add_subsense(newSense2)
+		if len(l) > 1:
+			self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================")
+		#l is empty when the section has no definition list: return no sense instead of failing on l[0]
+		if len(l) > 0 and l[0].pattern == self.constants['sense_pattern'][0]["def"]:
+			i = 0
+			for item in l[0].items:
+				senses.append(self.process_definition(item, l[0].sublists(i)))
 				i += 1
-			if cnt > 0:
-				i -= 1
-			i += 1
 		return senses
 
 if __name__ == "__main__":
diff --git a/wikstraktor.py b/wikstraktor.py
index d7d8d1e..bf2317c 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -391,9 +391,13 @@ class ParserContext:
 
 	def pop(self, testNewEntry = True):
 		if testNewEntry:
-			self.create_entry()
+			self.create_entries()
 		return self.context.pop()
 
+	def flush(self):
+		while self.context: #pop(True) runs create_entries() before removing the top level
+			self.pop(True)
+
 	def set_top_wiki(self, wiki_context):
 		if len(self.context) == 0:
 			self.push(wiki_context)
@@ -406,26 +410,32 @@ class ParserContext:
 		else:
 			self.context[-1][key] = entry_context
 			if testNewEntry:
-				self.create_entry()
+				self.create_entries()
 
-#Pb là dedans
-	def create_entry(self):
-		#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS
-		res = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
+	def create_entries(self):
+		"""Create one Entry per POS stored in the context stack and return how many were added to self.entries."""
+		tmp = {}
+		res = 0
+		pro = None
 		for l in self.context:
-			if "pro" in l.keys():
-				res.set_pronunciations(l['pro'])
-			if "ety" in l.keys():
-				pass #On ignore l'étymologie pour le moment
-			if "POS" in l.keys():
-				res.set_pos(l['POS'])
-			if "senses" in l.keys():
-				res.set_senses(l['senses'])
-			# TODO: Ajouter les autres types
-		if res.is_valid() and res not in self.entries:
-			self.entries.append(res)
-		else:
-			res = None
+			for key, value in l.items():
+				if key == "pro":
+					pro = value
+				elif key not in ("ety", "wiki"):
+					#the wiki context is not needed and etymology is ignored for now;
+					#every other key is a POS and its value holds the senses of that POS
+					tmp[key] = value
+		#entries are only built once a pronunciation and at least one POS are known
+		if pro is not None and tmp:
+			for pos, senses in tmp.items():
+				e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
+				e.set_pronunciations(pro)
+				e.set_pos(pos)
+				e.set_senses(senses)
+				#senses are not removed from the context, so entries already created are skipped
+				if e.is_valid() and e not in self.entries:
+					res += 1
+					self.entries.append(e)
 		return res
 
 	def debug_top(self):
@@ -442,6 +452,20 @@ class ParserContext:
 			res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
 		return res
 
+	def __str__(self):
+		"""Describe each context level (its index, then one "key→value" line per
+		item, the 'wiki' value being shown as its length) and the number of entries."""
+		chunks = []
+		for depth, level in enumerate(self.context):
+			chunks.append(f"====={depth}======\n")
+			for key, value in level.items():
+				#the 'wiki' value is only summarised by its length
+				shown = len(value) if key == "wiki" else value
+				chunks.append(f"  {key}→{shown}\n")
+		chunks.append(f"nb of entries: {len(self.entries)}")
+		return "".join(chunks)
+
+
 
 class Wikstraktor:
 	@classmethod
@@ -500,8 +524,9 @@ class Wikstraktor:
 					self.parserContext.push(s)
 				else:
 					while self.parserContext.get_level() > s.level:
-						self.parserContext.pop()
+						self.parserContext.pop(True)
 					self.parserContext.set_top_wiki(s)
+				#get section title
 				stitle = self.wtp.parse(s.title).templates
 				if stitle == []:
 					stitle = s.title
@@ -511,14 +536,12 @@ class Wikstraktor:
 					self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
 				elif self.isEty(stitle):
 					self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
-# 				elif stitle in self.constants['POS'].keys():
 				else:
+					#Edit to process other types of sections
 					pos = self.process_POS(stitle)
 					if pos != None :
-						self.parserContext.set_top_entry_info('POS', pos, False)
-						self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob
-		# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
-
+						self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents)))
+		self.parserContext.flush()
 		res = len(self.parserContext.entries)
 		if res > 0:
 			for e in self.parserContext.entries:
@@ -539,6 +562,7 @@ class Wikstraktor:
 			res = title in self.constants['ety']
 		return res
 
+	#recognizes POS and returns None if it can't
 	def process_POS(self, parsedwikitext):
 		pass#in subclass
 
@@ -548,6 +572,12 @@ class Wikstraktor:
 	def process_etymology(self, parsedwikitext):
 		pass#in subclass
 
+	def process_example(self, example_wiki_text):
+		pass#in subclass: returns the text extracted from one example list item
+
+	def process_definition(self, definition, sub_items, def_level = True):
+		pass#in subclass: returns the Sense (or SubSense when def_level is False) built from one definition item
+
 	def process_senses(self, parsedwikitext):
 		pass#in subclass
 
diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite
new file mode 100644
index 0000000000000000000000000000000000000000..b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90
GIT binary patch
literal 8192
zcmeI#K}*9h6bJBR2-1P#&1>i`3gX4HU~QvJx~g5uyp_meE$bTTq8~u~V16`Tl)^+*
z@F@RBLz}$!O8DL0^g6Ab5qGv*YA24^4&$7i2*DWRv<9^L@cWH^LvO$TEjT-zU2IVm
z-+pEEARqt%2tWV=5P$##AOHafKmY=N5_mr5o4cbCe~FyFZOp7*tX!uTjjh%`2XQVV
zC54JkQz^o=Oepq-L6~S~!XTj%HJ->Ik!MknDiIfXE;Cj7!>NiUR|izvH#^&vPo`Vd
zwhaR%Z<NR`X%%VuUS+#^+32=<)D`{uYrbC8KjwYEAIiV$eK0hzx2APvJr9F%rsTQI
zKV2c#O=G%pY23Y~5`P7Mv)Hzd+CC13`+hd3p&tPO2tWV=5P$##AOHafKmY;|fWSW%
Fcmp61Xm<br

literal 0
HcmV?d00001

-- 
GitLab