Skip to content
Snippets Groups Projects
Commit 1c005715 authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

region + labels at sense level

parent 90eac4c6
No related branches found
No related tags found
No related merge requests found
......@@ -31,6 +31,10 @@ string_values = {
"Eng":"England",
"En":"England",
"England":"England",
"Canada":"Canada",
"Canadian":"Canada",
'North American':'North America',
'North America':"North America",
"US":"United States of America",
"USA":"United States of America",
"United States":"United States of America",
......@@ -39,7 +43,11 @@ string_values = {
"New Zealand":"New Zealand",
"Au":"Australia",
"AU":"Australia",
"Australia":"Australia"},
"Australia":"Australia",
"India":"India",
"Indian":"India",
"Nigeria":"Nigeria",
"Nigerian":"Nigeria"},
"sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns
{"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"}
],
......
......@@ -50,15 +50,15 @@ class En_en_straktor(Wikstraktor):
break
return the_def
def parse_labels(self, a_def, templates):
def parse_labels(self, a_sense, templates):
key = "labels"
desc = "language"
num = 0
for t in templates:
if t.normal_name() in self.constants['t_lbl']:
while a_def.metadata_exists(f"{key}_{num}_{desc}"):
while a_sense.metadata_exists(f"{key}_{num}_{desc}"):
num+=1
a_def.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value)
a_sense.add_metadata(f"{key}_{num}_{desc}", t.arguments[0].value)
complete_previous = False
for a in t.arguments[1:]:
if a.value == "_":
......@@ -66,28 +66,32 @@ class En_en_straktor(Wikstraktor):
elif a.value == "and":
pass
elif a.value in self.constants['regions'].keys():
a_def.add_to_metadata("region", self.constants['regions'][a.value])
a_sense.add_region(self.constants['regions'][a.value])
elif complete_previous:
a_def.extend_metadata(f"{key}_{num}", a.value, " ")
a_sense.extend_metadata(f"{key}_{num}", a.value, " ")
complete_previous = False
else:
a_def.add_to_metadata(f"{key}_{num}", a.value)
a_sense.add_to_metadata(f"{key}_{num}", a.value)
def parse_definition(self, def_string):
the_def = None
parsed_def = self.wtp.parse(def_string)
def parse_definition(self, parsed_def):
if not isinstance(parsed_def, self.wtp.WikiText):
parsed_def = self.wtp.parse(parsed_def)
def_text = parsed_def.plain_text().strip()
templates = parsed_def.templates
if def_text != "":
the_def = self.parse_template_1(templates)
if the_def == None:
the_def = Definition(self.entry_language, def_text)
else:
the_def = self.parse_template_1(templates)
if the_def != None:
self.parse_labels(the_def, templates)
else:
if the_def == None:
raise ValueError(f"En_en_straktor.parse_definition with empty definition\n\t{def_string}")
return the_def
def get_sense_metadata(self, sense, parsed_def):
    """Collect label metadata from a definition's templates onto *sense*.

    *parsed_def* is used as is when it is already a ``self.wtp.WikiText``
    instance; anything else is first handed to ``self.wtp.parse``. The
    resulting templates are forwarded to ``self.parse_labels``.
    """
    wikitext = parsed_def if isinstance(parsed_def, self.wtp.WikiText) else self.wtp.parse(parsed_def)
    self.parse_labels(sense, wikitext.templates)
def process_POS(self,parsedwikitext):
pos = None
if parsedwikitext in self.constants['POS'].keys():
......
......@@ -117,32 +117,9 @@ class Definition(SubInfo):
if text != "":
self.lang = lang
self.text = text
self.metadata = {}
else:
raise ValueError(f"Definition.__init__: “{text}” empty definition.")
def add_metadata(self, key, value):
if self.metadata_exists(key):
print("Definition.add_metadata", f"for {self.text} replaced {key}:“{self.metadata[key]}” by {key}:“{value}")
self.metadata[key]=value
def add_to_metadata(self, key, value):
if not self.metadata_exists(key):
self.metadata[key] = []
self.metadata[key].append(value)
#to add at the end of the metadata, if empty add_metadata not add_to_metadata
def extend_metadata(self, key, value, separator=""):
if not self.metadata_exists(key):
self.add_metadata(key, value)
elif type(self.metadata[key]) == list:
self.metadata[key][-1] += separator+value
else:
self.metadata[key] += separator+value
def metadata_exists(self, key):
return key in self.metadata.keys()
def __eq__(self, other):
return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text
......@@ -150,8 +127,6 @@ class Definition(SubInfo):
res = super().serializable(prefix)
res["lang"] = self.lang
res[self.__class__.key] = self.text
if len(self.metadata.keys()) > 0 :
res["metadata"] = self.metadata
return res
class Translation(Definition):
......@@ -202,12 +177,39 @@ class Sense(SubInfo):
self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels)
self.translations = [] #liste des traductions dans d'autres langues
self.domain = None #domaine d'usage du mot dans ce sens
self.metadata = {}
self.regions = set()
if definition != None:
try:
self.add_def(wiki_lang, definition)
except ValueError as err:
raise ValueError(f"Sense.__init__() with empty definition\n{err}")
def add_metadata(self, key, value):
    """Store *value* under *key* in ``self.metadata``, reporting overwrites.

    When *key* is already present, the previous and new values are printed
    before the entry is replaced.
    """
    if self.metadata_exists(key):
        # The message used to read ``self.text``, an attribute that only
        # Definition sets in the code visible here; on a Sense the lookup
        # would fail, so identify the sense by its label when it has one.
        sense_id = getattr(self, "label", None)
        print("Sense.add_metadata", f"for {sense_id} replaced {key}:“{self.metadata[key]}” by {key}:“{value}”")
    self.metadata[key] = value
def add_to_metadata(self, key, value):
    """Append *value* to the list kept under *key*.

    A fresh list is created first when *key* is not yet in
    ``self.metadata``.
    """
    if self.metadata_exists(key):
        bucket = self.metadata[key]
    else:
        bucket = self.metadata[key] = []
    bucket.append(value)
def add_region(self, region):
    """Record *region* in the ``self.regions`` set (duplicates collapse)."""
    self.regions.update((region,))
#to add at the end of the metadata, if empty add_metadata not add_to_metadata
def extend_metadata(self, key, value, separator=""):
    """Concatenate *value* onto the existing metadata stored under *key*.

    - *key* absent: behaves like ``add_metadata(key, value)``.
    - entry is a list: ``separator + value`` is added to its last element.
    - otherwise: ``separator + value`` is added to the entry itself.
    """
    if not self.metadata_exists(key):
        self.add_metadata(key, value)
        return
    current = self.metadata[key]
    if type(current) is list:
        current[-1] += separator + value
    else:
        self.metadata[key] += separator + value
def metadata_exists(self, key):
    """Tell whether *key* already has an entry in ``self.metadata``."""
    return key in self.metadata
def set_id(self, prefix=None):
if prefix != None and self.label == None:
self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens
......@@ -251,7 +253,7 @@ class Sense(SubInfo):
self.subsenses.append(subsense)
def __eq__(self, other):
res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain
res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions
i = 0
while res and i < len(self.examples):
res = self.examples[i] in other.examples
......@@ -268,20 +270,30 @@ class Sense(SubInfo):
while res and i < len(self.subsenses):
res = self.subsenses[i] in other.subsenses
i+=1
i = 0
l = list(self.metadata.keys())
while res and i < len(l):
res = l[i] in other.metadata.keys() and type(self.metadata[l[i]]) == type(other.metadata[l[i]])
if res and type(self.metadata[l[i]]) == list and len(self.metadata[l[i]]) == len(other.metadata[l[i]]):
j = 0
while res and j < len(self.metadata[l[i]]):
res = self.metadata[l[i]][j] in other.metadata[l[i]]
j+=1
i+=1
return res
def serializable(self, prefix = None):
res = {}
if self.domain != None:
res["Domain"] = self.domain
if len(self.regions) > 0:
res['Regions'] = list(self.regions)
if len(self.definitions) > 0:
res["Definitions"] = []
for d in self.definitions:
res["Definitions"].append(d.serializable(prefix))
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
if len(self.metadata.keys()) > 0 :
res["Metadata"] = self.metadata
if len(self.examples) > 0 :
res["Examples"] = []
for e in self.examples:
......@@ -290,6 +302,10 @@ class Sense(SubInfo):
res["Translations"] = []
for t in self.translations:
res["Translations"].append(t.serializable(prefix))
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
return res
def __str__(self):
......@@ -616,19 +632,30 @@ class Wikstraktor:
return res
#can be overloaded
def parse_definition(self, definition):
return self.wtp.parse(definition).plain_text().strip()
def parse_definition(self, definition_wikitext):
    """Return the stripped plain text of a definition.

    Args:
        definition_wikitext: raw wikitext as ``str`` (parsed with
            ``self.wtp.parse``) or an already parsed ``WikiText`` object.

    Returns:
        The result of ``plain_text().strip()`` on the parsed definition.

    Raises:
        TypeError: if the argument is neither of the accepted types.
            Previously this case fell through both branches and failed
            on an unbound local variable.
    """
    if isinstance(definition_wikitext, str):
        parsed = self.wtp.parse(definition_wikitext)
    # Same check En_en_straktor performs in this file; assumes self.wtp is
    # the wikitextparser module -- TODO confirm.
    elif isinstance(definition_wikitext, self.wtp.WikiText):
        parsed = definition_wikitext
    else:
        raise TypeError(
            "parse_definition expects str or WikiText, got "
            f"{type(definition_wikitext).__name__}"
        )
    return parsed.plain_text().strip()
#can be overloaded
def get_sense_metadata(self, sense, definition_wikitext):
    """Extension hook: the base implementation does nothing.

    Language-specific subclasses may override it to read metadata for
    *sense* out of *definition_wikitext*.
    """
    return None
#can be overloaded
def process_definition(self, definition, sub_items, def_level = True):
#does not process wk_en quotations
try:
parsed_def = self.wtp.parse(definition)
if def_level:
newSense = Sense(self.entry_language, self.parse_definition(definition), self.wiki_language)
newSense = Sense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language)
self.get_sense_metadata(newSense, parsed_def)
pattern_ex = self.constants['sense_pattern'][0]["ex"]
pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"]
else:
newSense = SubSense(self.entry_language, self.parse_definition(definition), self.wiki_language)
newSense = SubSense(self.entry_language, self.parse_definition(parsed_def), self.wiki_language)
self.get_sense_metadata(newSense, parsed_def)
pattern_subdef = None
pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"]
#Process examples
......
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment