Skip to content
Snippets Groups Projects
Commit da8f8f5c authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

Improve stack to handle

=== Noun ===
=== Adj ===
=== Pronunciations ===
parent 8767d00e
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,9 @@ string_values = {
"t_ipa":"pron", #template for transcription
"t_snd":"écouter", #template for audio
"t_acc":["US", "UK"], #template for accents
"sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns
{"def":"\\#", "ex":"\\#\\*", "add_subdef":"\\#"}
],
"POS":{
"adjectif":["adjectif","adjectif qualificatif","adj"],
"adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"],
......@@ -77,5 +79,5 @@ string_values = {
"variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"],
"verbe pronominal":["verbe pronominal","verb-pr","verbe pr"],
"verbe":["verbe","verb"]
}
}
}
......@@ -55,49 +55,55 @@ class Fr_en_straktor(Wikstraktor):
ik += 1
return pos
def process_example(self, example_wiki_text):
    """Extract the text of one example from its wikitext.

    The wikitext is parsed once. The templates are scanned in order and the
    first one whose normalised name is listed in ``self.constants['t_ex']``
    supplies the result: the value of its last argument. When no template
    qualifies, the stripped plain text of the whole item is returned.

    Args:
        example_wiki_text: wikitext of a single example list item.

    Returns:
        The example text (last argument of the matching example template,
        or the plain text of the item).
    """
    parsed = self.wtp.parse(example_wiki_text)
    example_templates = self.constants['t_ex']
    for template in parsed.templates:
        # Read the argument from the template that actually matched (the
        # previous code tested templates[k] but read templates[0]); a
        # matching template without arguments is skipped instead of
        # raising IndexError.
        if template.normal_name() in example_templates and template.arguments:
            return template.arguments[-1].value
    return parsed.plain_text().strip()
def process_definition(self, definition, sub_items, def_level = True):
    """Build a Sense (or SubSense) from one definition item and its nested lists.

    Args:
        definition: wikitext of the definition list item.
        sub_items: the lists nested under that item; each one is read
            through its ``pattern``, ``items`` and ``sublists(i)`` members.
        def_level: True for a top-level definition (a ``Sense`` is created
            and sub-definitions are processed recursively); False for a
            sub-definition (a ``SubSense`` is created, no further nesting).

    Returns:
        The populated ``Sense`` or ``SubSense``.
    """
    sense_pattern = self.constants['sense_pattern'][0]
    # The label always comes from `definition`; the sub-definition branch
    # used to parse `item`, a name that is not bound at that point.
    label = self.wtp.parse(definition).plain_text().strip()
    if def_level:
        newSense = Sense(self.entry_language, label, self.wiki_language)
        pattern_ex = sense_pattern["ex"]
        pattern_subdef = sense_pattern["add_subdef"] + sense_pattern["def"]
    else:
        newSense = SubSense(self.entry_language, label, self.wiki_language)
        pattern_subdef = None
        pattern_ex = sense_pattern["add_subdef"] + sense_pattern["ex"]
    for item_list in sub_items:
        # Process examples
        if item_list.pattern == pattern_ex:
            for item in item_list.items:
                newSense.add_example(self.process_example(item))
                # To process the sub-items of an example (e.g. translations),
                # item_list.sublists(<index of item>) could be used.
        # Process sub-definitions (only one level of nesting is handled)
        if def_level and item_list.pattern == pattern_subdef:
            # sublists() is given the position of the item inside its own
            # list, so the index restarts for every list.
            for index, item in enumerate(item_list.items):
                newSense.add_subsense(self.process_definition(item, item_list.sublists(index), False))
    return newSense
# Collect the senses found in sensesContent and return them as a list.
# NOTE(review): `l` is assigned twice below (once from four hard-coded patterns,
# once from self.constants['sense_pattern'][0]["def"]), and an index-driven
# while loop coexists with a process_definition-based for loop. This looks like
# a removed and an added implementation shown together — confirm which lines
# are current before relying on this text.
def process_senses(self, sensesContent):
l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
i = 0
l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"]))
senses = []
while i < len(l):
if l[i].pattern == '\\# ':
newSense = Sense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
#enzo's version adds several defs (for i in l[i].items)
elif l[i].pattern == '\\#:':
for j in l[i].items:
k = 0
isEx = 0
# look for a template whose name is listed in self.constants['t_ex']; stop at the first hit
while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
isEx = 1
k += 1
# no such template: the plain text of the item is used as the example
if isEx == 0:
newSense.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense)
cnt = 0
while i < len(l) and l[i].level == 3 :
cnt +=1
if l[i].pattern == '\\## ':
newSense2 = SubSense(self.entry_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), self.wiki_language)
elif l[i].pattern == '\\##:':
for j in l[i].items:
k = 0
isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
isEx = 1
k += 1
if isEx == 0:
newSense2.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
newSense.add_subsense(newSense2)
# Only the first list is processed below; any further lists are reported through self.log
if len(l) > 1:
self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================")
l = l[0] #l now contains a list of list items
if l.pattern == self.constants['sense_pattern'][0]["def"]:
i = 0
for item in l.items:
# sublists(i) receives the index of the current item; presumably it yields the lists nested under that item — confirm
senses.append(self.process_definition(item, l.sublists(i)))
i += 1
if cnt > 0:
i -= 1
i += 1
return senses
if __name__ == "__main__":
......
......@@ -391,9 +391,13 @@ class ParserContext:
# Remove the last element of self.context and return it.
# testNewEntry: when true, entry creation is attempted before the element is removed.
# NOTE(review): both create_entry() and create_entries() appear under the
# testNewEntry check; this looks like two revisions of the same line shown
# together — confirm which call is current.
def pop(self, testNewEntry = True):
if testNewEntry:
self.create_entry()
self.create_entries()
return self.context.pop()
def flush(self):
    """Empty the context stack.

    Every element is removed through ``self.pop(True)``, so the new-entry
    check is requested for each removal. Returns None.
    """
    # An empty context is falsy, so the loop ends once everything is popped.
    while self.context:
        self.pop(True)
def set_top_wiki(self, wiki_context):
if len(self.context) == 0:
self.push(wiki_context)
......@@ -406,26 +410,32 @@ class ParserContext:
else:
self.context[-1][key] = entry_context
if testNewEntry:
self.create_entry()
self.create_entries()
#Problem in here
# NOTE(review): the lines below mix create_entry (a single Entry filled through
# the "pro"/"POS"/"senses" keys) with create_entries (one Entry per remaining
# key of the context). This looks like a removed and an added implementation
# shown together — confirm which lines are current.
def create_entry(self):
#In the keys dictionary, there are never senses or POS
res = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
def create_entries(self):
#In the key dict there are traits that describe every thing (ety, pro) and different entities (POS:senses)
# tmp gathers every key other than "pro", "ety" and "wiki"; res counts the entries appended to self.entries
tmp = {}
res = 0
pro = None
for l in self.context:
if "pro" in l.keys():
res.set_pronunciations(l['pro'])
if "ety" in l.keys():
pass #Etymology is ignored for now
if "POS" in l.keys():
res.set_pos(l['POS'])
if "senses" in l.keys():
res.set_senses(l['senses'])
# TODO: Add the other types
if res.is_valid() and res not in self.entries:
self.entries.append(res)
else:
res = None
for k,v in l.items():
if k == "pro":
pro = v
elif k == "ety" or k == "wiki":
#wiki context is not necessary
pass #Etymology is ignored for now
else:
tmp[k]=v
# entries are only built when a pronunciation was found and at least one other key was collected
if(pro!=None and len(tmp)>0):
for pos,senses in tmp.items():
e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version)
e.set_pronunciations(pro)
e.set_pos(pos)
e.set_senses(senses)
#an improvement would be to remove that sense from context, but we test not to add doubles
if e.is_valid() and e not in self.entries:
res += 1
self.entries.append(e)
return res
def debug_top(self):
......@@ -442,6 +452,20 @@ class ParserContext:
res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
return res
def __str__(self):
    """Return a multi-line dump of the context stack.

    Each element of ``self.context`` yields a ``====={index}======`` header
    line followed by one line per key. The value is printed as is, except
    for the ``"wiki"`` key, where only ``len(value)`` is shown. The last
    line gives the number of items in ``self.entries``.
    """
    parts = []
    for index, level in enumerate(self.context):
        parts.append(f"====={index}======\n")
        for k, v in level.items():
            # the wiki value is summarised by its length
            shown = v if k != "wiki" else len(v)
            parts.append(f" {k}{shown}\n")
    parts.append(f"nb of entries: {len(self.entries)}")
    return "".join(parts)
class Wikstraktor:
@classmethod
......@@ -500,8 +524,9 @@ class Wikstraktor:
self.parserContext.push(s)
else:
while self.parserContext.get_level() > s.level:
self.parserContext.pop()
self.parserContext.pop(True)
self.parserContext.set_top_wiki(s)
#get section title
stitle = self.wtp.parse(s.title).templates
if stitle == []:
stitle = s.title
......@@ -511,14 +536,12 @@ class Wikstraktor:
self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
elif self.isEty(stitle):
self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
# elif stitle in self.constants['POS'].keys():
else:
#Edit to process other types of sections
pos = self.process_POS(stitle)
if pos != None :
self.parserContext.set_top_entry_info('POS', pos, False)
self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob
# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents)))
self.parserContext.flush()
res = len(self.parserContext.entries)
if res > 0:
for e in self.parserContext.entries:
......@@ -539,6 +562,7 @@ class Wikstraktor:
res = title in self.constants['ety']
return res
#recognizes POS and returns None if it can't
# Placeholder: the body is only `pass`, so this version always returns None;
# the actual implementation is left to a subclass.
def process_POS(self, parsedwikitext):
pass#in subclass
......@@ -548,6 +572,12 @@ class Wikstraktor:
# Placeholder for etymology processing: the body is only `pass` (returns None);
# the actual implementation is left to a subclass.
def process_etymology(self, parsedwikitext):
pass#in subclass
# Placeholder for example processing: the body is only `pass` (returns None);
# the actual implementation is left to a subclass.
def process_example(self, example_wiki_text):
pass#in subclass
# Placeholder for definition processing: the body is only `pass` (returns None);
# the actual implementation is left to a subclass.
# def_level defaults to True.
def process_definition(self, definition, sub_items, def_level = True):
pass#in subclass
# Placeholder for sense processing: the body is only `pass` (returns None);
# the actual implementation is left to a subclass.
def process_senses(self, parsedwikitext):
pass#in subclass
......
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment