Skip to content
Snippets Groups Projects
Commit 27514d48 authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

approximative POS processing

parent 46ab0653
No related branches found
No related tags found
No related merge requests found
# String constants for the English parser (imported as
# ``parsers.en_constants.string_values``): section titles, language names
# and template names.
string_values = {
    "ety": "Etymology",
    "pro": "Pronunciation",
    "en": "English",
    "fr": "French",
    "t_ipa": "IPA",  # template for transcription
    "t_snd": "audio",  # template for audio
    "t_acc": "a",  # template for accents
    # Part-of-speech section titles mapped to short tags, see
    # https://en.wiktionary.org/wiki/Wiktionary:POS
    "POS": {
        "Adjective": "Adj",
        "Adverb": "Adv",
        "Ambiposition": "Ambip",
        "Article": "Art",
        "Circumposition": "Circump",
        "Classifier": "Class",
        "Conjunction": "Conj",
        "Contraction": "Cont",
        "Counter": "Count",
        "Determiner": "Det",
        "Ideophone": "Ideophone",
        "Interjection": "Interj",
        "Noun": "N",
        "Numeral": "Num",
        "Participle": "Part",
        "Particle": "Particle",
        "Postposition": "Postp",
        "Preposition": "Prep",
        "Pronoun": "Pro",
        "Proper noun": "NP",
        "Verb": "V",  # TODO: complete this list
    },
}
...@@ -4,6 +4,8 @@ from pronunciation import Pronunciation ...@@ -4,6 +4,8 @@ from pronunciation import Pronunciation
from parsers.en_constants import string_values from parsers.en_constants import string_values
debugEty = 0
class En_en_straktor(Wikstraktor): class En_en_straktor(Wikstraktor):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -40,6 +42,15 @@ class En_en_straktor(Wikstraktor): ...@@ -40,6 +42,15 @@ class En_en_straktor(Wikstraktor):
print(pronunciations[0], pronunciations[1]) print(pronunciations[0], pronunciations[1])
return pronunciations return pronunciations
def process_etymology(self, etyContent):
    """Placeholder etymology processing.

    ``etyContent`` is not used: the module-level ``debugEty`` counter is
    incremented and the string "Etymology" followed by the new counter
    value is returned.
    """
    global debugEty
    debugEty = debugEty + 1
    return f"Etymology{debugEty}"
def process_senses(self, sensesContent):
    """Placeholder sense processing.

    ``sensesContent`` is not used: the result is "Cool" followed by one
    letter drawn at random from a to g.
    """
    import random as r
    letter = r.choice("abcdefg")
    return f"Cool{letter}"
if __name__ == "__main__": if __name__ == "__main__":
ensk = En_en_straktor() ensk = En_en_straktor()
print(ensk.fetch("test"), "entries added") print(ensk.fetch("test"), "entries added")
...@@ -9,14 +9,20 @@ class Entry: ...@@ -9,14 +9,20 @@ class Entry:
def __init__(self, lemma):
    """Create an entry for ``lemma``.

    ``pos`` and ``pronunciations`` start out empty so that the entry can
    be printed before any setter has been called (``__str__`` reads both
    attributes).
    """
    self.lemma = lemma
    self.pos = None
    self.pronunciations = []
def set_pronunciations(self, pron):
    """Store the pronunciations of this entry as a list.

    Args:
        pron: a single ``Pronunciation`` or an iterable of
            ``Pronunciation`` objects.

    Raises:
        ValueError: if ``pron`` is neither a ``Pronunciation`` nor an
            iterable made only of ``Pronunciation`` objects.
    """
    if isinstance(pron, Pronunciation):
        # Wrapped in a list because __str__ iterates self.pronunciations.
        self.pronunciations = [pron]
        return
    # NOTE(review): the parser's process_pronunciation appears to return an
    # indexable collection, hence iterables are accepted here -- confirm.
    try:
        candidates = list(pron)
    except TypeError:
        candidates = None
    if candidates is None or not all(isinstance(p, Pronunciation) for p in candidates):
        raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object or a collection of them ({pron.__class__.__name__}).")
    self.pronunciations = candidates
def set_POS(self, pos):
    """Record the part of speech of this entry in ``self.pos``."""
    self.pos = pos

# Lower-case alias so that callers using either spelling work.
set_pos = set_POS
def __str__(self):
    """Return the lemma and part of speech, then one pronunciation per line.

    Attributes that were never set fall back to ``None`` (part of speech)
    and to no pronunciation lines, instead of raising AttributeError.
    """
    pos = getattr(self, "pos", None)
    pronunciations = getattr(self, "pronunciations", ())
    header = f"{self.lemma} ({pos})\n"
    return header + "".join(f"{p}\n" for p in pronunciations)
class ParserContext: class ParserContext:
def __init__(self, entry): def __init__(self, entry):
...@@ -30,8 +36,8 @@ class ParserContext: ...@@ -30,8 +36,8 @@ class ParserContext:
res = self.context[-1]["wiki"].level res = self.context[-1]["wiki"].level
return res return res
def push(self, wiki_context):
    """Add a new level on top of the context stack, holding ``wiki_context`` under the "wiki" key."""
    level = {"wiki": wiki_context}
    self.context.append(level)
def pop(self):
    """Remove the top level from the context stack and return it."""
    top = self.context.pop()
    return top
...@@ -42,22 +48,22 @@ class ParserContext: ...@@ -42,22 +48,22 @@ class ParserContext:
else: else:
self.context[-1]['wiki'] = wiki_context self.context[-1]['wiki'] = wiki_context
def set_top_entry_info(self, key, entry_context):
    """Store ``entry_context`` under ``key`` on the top level of the context stack.

    Raises:
        ValueError: if the context stack is empty.
    """
    if not len(self.context):
        raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
    top = self.context[-1]
    top[key] = entry_context
def create_entry(self):
    """Build an ``Entry`` for ``self.lemma`` from the context stack.

    Each level of the stack is a dict; the optional keys 'pro' and 'POS'
    are copied to the entry when present and not ``None``. Levels created
    by ``push`` only carry the 'wiki' key, so missing keys are skipped
    rather than raising KeyError.

    Returns:
        The newly created ``Entry``.
    """
    res = Entry(self.lemma)
    for level in self.context:
        pron = level.get('pro')
        if pron is not None:
            res.set_pronunciations(pron)
        # The etymology ('ety') is ignored for now.
        pos = level.get('POS')
        if pos is not None:
            res.set_POS(pos)
        # TODO: handle the other kinds of information (e.g. 'senses')
    return res
def debug_top(self): def debug_top(self):
...@@ -65,7 +71,13 @@ class ParserContext: ...@@ -65,7 +71,13 @@ class ParserContext:
if len(self.context) == 0 : if len(self.context) == 0 :
res += "0" res += "0"
else: else:
res += f"{len(self.context)}, {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {str(self.context[-1]['entry_info'])}" info = ""
for k,v in self.context[-1].items():
if k != 'wiki':
if info != "":
info += "\n\t\t\t"
info += f"{k}{str(v)}"
res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
return res return res
...@@ -125,8 +137,13 @@ class Wikstraktor: ...@@ -125,8 +137,13 @@ class Wikstraktor:
while self.parserContext.get_level() > s.level: while self.parserContext.get_level() > s.level:
self.parserContext.pop() self.parserContext.pop()
self.parserContext.set_top_wiki(s) self.parserContext.set_top_wiki(s)
if s.title == self.constants['ipa']: if s.title == self.constants['pro']:
self.parserContext.set_top_entry_info(self.process_pronunciation(self.wtp.parse(s.contents))) self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents)))
elif self.constants['ety'] in s.title:
self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents)))
elif s.title in self.constants['POS'].keys():
self.parserContext.set_top_entry_info('POS', self.constants['POS'][s.title])
self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents)))
print(self.parserContext.debug_top()) print(self.parserContext.debug_top())
print("ok") print("ok")
...@@ -139,8 +156,8 @@ class Wikstraktor: ...@@ -139,8 +156,8 @@ class Wikstraktor:
if __name__ == "__main__": if __name__ == "__main__":
e = Wikstraktor.get_instance('en', "en") e = Wikstraktor.get_instance('en', "en")
print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
print(e.fetch("test"), "entries added") print(e.fetch("test"), "entries added")
# site = pywikibot.Site(f'wiktionary:en') # site = pywikibot.Site(f'wiktionary:en')
# p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav") # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment