From 90eac4c67d49bab482deb5a202d94edee73bab4c Mon Sep 17 00:00:00 2001
From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>
Date: Tue, 30 May 2023 12:24:12 +0200
Subject: [PATCH] process 1 and lbl templates in English

---
 parsers/en_constants.py |  32 +++++++++++++++++++++++++++
 parsers/en_en.py        |  48 +++++++++++++++++++++++++++++++++++++++-
 wikstraktor.py          |  28 +++++++++++++++++++----
 wikstraktor.sqlite      | Bin 8192 -> 16384 bytes
 4 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/parsers/en_constants.py b/parsers/en_constants.py
index de100b7..c13e689 100644
--- a/parsers/en_constants.py
+++ b/parsers/en_constants.py
@@ -8,6 +8,38 @@ string_values = {
 	"t_acc":"a", #template for accents
 	"t_deflabel":"lb",
 	"t_ex":["ux", "usex"],
+	"t_lbl":["lb","lbl", "label"], #template for labels
+	"regions":{
+		"UK":"United Kingdom",
+		"United Kingdom":"United Kingdom",
+		"British":"Great Britain",
+		"GB":"Great Britain",
+		"Great Britain":"Great Britain",
+		"Scot":"Scotland",
+		"Scottish":"Scotland",
+		"Scotland":"Scotland",
+		"Irl":"Ireland",
+		"Irish":"Ireland",
+		"Ireland":"Ireland",
+		"Ulst":"Northern Ireland",
+		"Ulster":"Northern Ireland",
+		"Northern Ireland":"Northern Ireland",
+		"Wls":"Wales",
+		"Welsh":"Wales",
+		"Wales":"Wales",
+		"English":"England",
+		"Eng":"England",
+		"En":"England",
+		"England":"England",
+		"US":"United States of America",
+		"USA":"United States of America",
+		"United States":"United States of America",
+		"United States of America":"United States of America",
+		"NZ":"New Zealand",
+		"New Zealand":"New Zealand",
+		"Au":"Australia",
+		"AU":"Australia",
+		"Australia":"Australia"},
 	"sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns
 		{"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"}
 	],
diff --git a/parsers/en_en.py b/parsers/en_en.py
index 1a2fab1..db91422 100644
--- a/parsers/en_en.py
+++ b/parsers/en_en.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense
+from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense, Definition
 
 from parsers.en_constants import string_values
 
@@ -42,6 +42,52 @@ class En_en_straktor(Wikstraktor):
 		debugEty += 1
 		return "Etymology" + str(debugEty)
 
+	def parse_template_1(self, templates):
+		"""Return a Definition built from the first {{1}} template that has an argument, or None if there is none."""
+		for t in templates:
+			# A bare {{1}} has no argument to quote: skip it rather than raise IndexError on arguments[0].
+			if t.normal_name() == "1" and t.arguments:
+				return Definition(self.entry_language, f"Other wording of “{t.arguments[0].value}”")
+		return None
+
+	def parse_labels(self, a_def, templates):
+		"""Record the arguments of each label template (names listed in constants['t_lbl']) as metadata of a_def."""
+		key, desc, num = "labels", "language", 0
+		for t in templates:
+			# A label template without arguments has nothing to record; arguments[0] would raise IndexError.
+			if t.normal_name() in self.constants['t_lbl'] and t.arguments:
+				while a_def.metadata_exists(f"{key}_{num}_{desc}"): # first index not taken by an earlier template
+					num+=1
+				a_def.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value)
+				complete_previous = False
+				for a in t.arguments[1:]:
+					if a.value == "_": # the next plain label is glued onto the previous one
+						complete_previous = True
+					elif a.value == "and": # connective between labels, never stored
+						pass
+					elif a.value in self.constants['regions']:
+						a_def.add_to_metadata("region", self.constants['regions'][a.value])
+					elif complete_previous:
+						a_def.extend_metadata(f"{key}_{num}", a.value, " ")
+						complete_previous = False
+					else:
+						a_def.add_to_metadata(f"{key}_{num}", a.value)
+
+	def parse_definition(self, def_string):
+		"""Build the Definition for def_string with its label metadata; raise ValueError if no definition is found."""
+		parsed_def = self.wtp.parse(def_string)
+		def_text = parsed_def.plain_text().strip()
+		templates = parsed_def.templates
+		if def_text != "":
+			the_def = Definition(self.entry_language, def_text)
+		else:
+			the_def = self.parse_template_1(templates) # no plain text: fall back on a {{1}} template
+		# Identity test on purpose: comparing to None with == / != would go through the object's __eq__.
+		if the_def is None:
+			raise ValueError(f"En_en_straktor.parse_definition with empty definition\n\t{def_string}")
+		self.parse_labels(the_def, templates)
+		return the_def
+
 	def process_POS(self,parsedwikitext):
 		pos = None
 		if parsedwikitext in self.constants['POS'].keys():
diff --git a/wikstraktor.py b/wikstraktor.py
index 6339324..20391a0 100755
--- a/wikstraktor.py
+++ b/wikstraktor.py
@@ -122,9 +122,26 @@ class Definition(SubInfo):
 			raise ValueError(f"Definition.__init__: “{text}” empty definition.")
 
 	def add_metadata(self, key, value):
-		if key in self.metadata.keys():
-			self.log.add_log("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata['key']}” by {key}:“{value}”")
-		self.metadata["key"]=value
+		if self.metadata_exists(key): # overwriting is allowed, but the replaced value is reported on stdout
+			print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}” by {key}:“{value}”")
+		self.metadata[key]=value # stored under the given key (the removed lines used the literal "key")
+
+	def add_to_metadata(self, key, value):
+		"""Append value to the list stored under key; a missing key starts a list, a scalar is promoted to one."""
+		held = self.metadata[key] if key in self.metadata else []
+		self.metadata[key] = held + [value] if isinstance(held, list) else [held, value]
+
+	def extend_metadata(self, key, value, separator=""):
+		"""Concatenate separator+value onto the value stored under key, or onto its last item if it is a list."""
+		if not self.metadata_exists(key):
+			self.add_metadata(key, value) # nothing to extend yet: stored as a plain value (not a list), separator unused
+		elif isinstance(self.metadata[key], list):
+			self.metadata[key][-1] += separator+value
+		else:
+			self.metadata[key] += separator+value
+
+	def metadata_exists(self, key):
+		return key in self.metadata # True when key is already set; membership is tested on the mapping itself
 
 	def __eq__(self, other):
 		return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text
@@ -204,7 +221,10 @@ class Sense(SubInfo):
 		self.domain = d
 
 	def add_def(self, lang, definition):
-		theDef = Definition(lang, definition)
+		if isinstance(definition, Definition):
+			theDef = definition
+		else:
+			theDef = Definition(lang, definition)
 		if theDef != None and theDef not in self.definitions:
 			theDef.set_id(self.set_id())
 			self.definitions.append(theDef)
diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite
index b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90..72c4e0fbff96b7a894f0b3423fdf8ec50dc36376 100644
GIT binary patch
literal 16384
zcmeI2&rj1(9Kc%`KQ<UYL;;n=mvPa|k<#vGjfn%QK?mw6;z6=>>jN4}yL6Sruq9x;
zn0VHUUi42e-u2?mzrY(8{{cOD@fGA{D}l&LafffSu0P&;&8MGz+rHQD79Qp_8>1D|
z+E8pHFar$BGPe+77-o#j0Wu#<ELm}HACw)n9AhSy?uJPz+xM9vADF-Z7ytuc01SWu
zFaQR?02lxRU;qp>W#IWuwrga3oPCkCm7<Q9wDpQ@DeJnqTDRIZotLt*gyig0PC}8o
zk_fsQ3PxrW8%KgOq>4N{Ck1Du+u6mOjHVa!d1+oQI6D{Q?A(Lvr0zju+q4Rs*s5rz
z5edrDV;RlgC-Y)1w^z%w)Pk-Ut520xa_Gb5+PZdZxzpYmlpoc$jLCrAz=mDW)JSl4
zUY71i`38h(Mc1)a*ueIhNvb#q4%xJ6*reJ1j?lFZVx8^(?YIx@FPi!ppb;1V17H9Q
zfB`T72EYIq00UqE41fVJ(6R>Fhn?K~F!Pb=ztOiD{u&<bUFi7``W2FbZ@PbUXS!Z?
ze(#KS)B@iESK7_CFKvTtiB$f-(@tzSBrFdO4h3d+dU%c(Vq797aA=Yfxr{iOR5-Pi
zN+r`7uE<TQLIxLcG8q^61o`5e?L;x2Or_$Oygsq88d!d`UH4LFqHLKZT&WaPyrLPJ
zP2TV9ZofraTY9mIjjCdm))cK&tC2m}h?SL!jm(uQIS#JXcDLW5zs<3VlnTNdW&7Dq
z^aADWa`C3c1n(2CqTARq6kFTGs8Ytdt{JNiDme*4s~$-kJ5Oodc73GvV0Vb}9=6N3
zoZWMj_OM;wXg$~+q`dWZ$%U54h#Ys)*nRTq)4yD6O~>v4rLDK?Gpz@^{gl^jH!X5P
ztJ&?Nv~IgT(t5BPro3*uDN$%SyS<dwZP!Oy4|aPfuiI`?<WBq)_WRBdqO@+iKGJ%y
z8>GB$yKym|X|+2;H>Gvk^^w+t-7d=OvdeQ~qUB<@lhV5E`bpco-Kh@Bo2cvWlBcl$
zyt5W2X;-znc&}Q-BtJ{iua&B1?ESx4-|MwcI^AES;|<eLj_b4s=t(q$NLq}iL_XeX
z7hpT3Z3q#6X+4BU8|7^Xk+a}EwGf$Nk35OE$R~UXk$vf>brR0_zoYjx)1U8qAO0Dh
zA*(Qf0WbgtzyKHk17H9QfB`T72F}pHPV_PzTGfxfNs5mbnn*kRV~Vfow8JGzTR-~t
onYQ`Scl09Vb=f7zt(TA-rL-=)e$sldJ3@I~cKu9GHyOKs0C0TLEC2ui

delta 37
rcmZo@U~F)hAkE6iz`(#XQNf;(abv;)el}hpmqB2&puk)Hi2=L-nEVI`

-- 
GitLab