Skip to content
Snippets Groups Projects
Commit ca4c97d8 authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files
parents 22577fb7 7b7eed90
No related branches found
No related tags found
No related merge requests found
wikstraktor wikstraktor
=========== ===========
A Python tool to query the [Wiktionary](https://wiktionary.org) and extract structured lexical data. A Python tool to query the [Wiktionary](https://wiktionary.org) and extract [structured lexical data](https://gitlab.liris.cnrs.fr/lex-game/wikstraktor/-/wikis/Entry-structure).
## Dependencies ## Dependencies
This project depends on Python packages. This project depends on Python packages.
......
...@@ -17,6 +17,7 @@ class En_en_straktor(Wikstraktor): ...@@ -17,6 +17,7 @@ class En_en_straktor(Wikstraktor):
# TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux
l = proContent.get_lists()[0] l = proContent.get_lists()[0]
i = 0 i = 0
cpt = 0
pronunciations = [] pronunciations = []
while i < len(l.fullitems): while i < len(l.fullitems):
p = Pronunciation() p = Pronunciation()
...@@ -32,6 +33,8 @@ class En_en_straktor(Wikstraktor): ...@@ -32,6 +33,8 @@ class En_en_straktor(Wikstraktor):
p.add_sound(self.get_file_url(t.arguments[1].value), a) p.add_sound(self.get_file_url(t.arguments[1].value), a)
if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] :
if p.ipa != None or p.accent != None: if p.ipa != None or p.accent != None:
cpt += 1
p.id= f"p_{cpt}"
pronunciations.append(p) pronunciations.append(p)
p = Pronunciation() p = Pronunciation()
i += 1 i += 1
...@@ -57,19 +60,27 @@ class En_en_straktor(Wikstraktor): ...@@ -57,19 +60,27 @@ class En_en_straktor(Wikstraktor):
while i < len(l): while i < len(l):
if l[i].pattern == '\\# ': if l[i].pattern == '\\# ':
nombreDef += 1 nombreDef += 1
<<<<<<< HEAD
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}")
newSense.add_translation(f"t_{nombreDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\#:': elif l[i].pattern == '\\#:':
cptEx=0
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
isEx = 0 isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 : while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) cptEx +=1
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}")
isEx = 1 isEx = 1
k += 1 k += 1
if isEx == 0: if isEx == 0:
newSense.add_example(self.wtp.parse(j).plain_text().strip()) newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}")
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense) senses.append(newSense)
cnt = 0 cnt = 0
...@@ -78,19 +89,27 @@ class En_en_straktor(Wikstraktor): ...@@ -78,19 +89,27 @@ class En_en_straktor(Wikstraktor):
cnt +=1 cnt +=1
if l[i].pattern == '\\## ': if l[i].pattern == '\\## ':
nombreSousDef += 1 nombreSousDef += 1
<<<<<<< HEAD
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language) newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}")
newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\##:': elif l[i].pattern == '\\##:':
cptex2 = 0
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
isEx = 0 isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 : while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) cptex2 +=1
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
isEx = 1 isEx = 1
k += 1 k += 1
if isEx == 0: if isEx == 0:
newSense2.add_example(self.wtp.parse(j).plain_text().strip()) newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
newSense.add_subsense(newSense2) newSense.add_subsense(newSense2)
i += 1 i += 1
......
...@@ -53,7 +53,7 @@ class Fr_en_straktor(Wikstraktor): ...@@ -53,7 +53,7 @@ class Fr_en_straktor(Wikstraktor):
keys = list(self.constants['POS'].keys()) keys = list(self.constants['POS'].keys())
pos = keys[ik] pos = keys[ik]
ik += 1 ik += 1
# print(pos) print(pos)
return pos return pos
def process_senses(self, entry, pos, sensesContent): def process_senses(self, entry, pos, sensesContent):
...@@ -64,9 +64,17 @@ class Fr_en_straktor(Wikstraktor): ...@@ -64,9 +64,17 @@ class Fr_en_straktor(Wikstraktor):
nombreDef = 0 nombreDef = 0
while i < len(l): while i < len(l):
if l[i].pattern == '\\# ': if l[i].pattern == '\\# ':
<<<<<<< HEAD
nombreDef += 1 nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language) newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
#A revoir ça, très douteux
for nbDef in l[i].items :
nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\#:': elif l[i].pattern == '\\#:':
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
......
...@@ -5,7 +5,7 @@ if __name__ == "__main__": ...@@ -5,7 +5,7 @@ if __name__ == "__main__":
# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
# print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
#e.fetch("water") #e.fetch("water")
f.fetch("blue") f.fetch("water")
# print(e.fetch("test"), "entries added") # print(e.fetch("test"), "entries added")
#print(e) #print(e)
file_path = 'test.json' file_path = 'test.json'
...@@ -22,3 +22,5 @@ if __name__ == "__main__": ...@@ -22,3 +22,5 @@ if __name__ == "__main__":
# print(p.get_file_url()) # print(p.get_file_url())
#print(e) #print(e)
#Entry("test", wtp.parse(page.text))) #Entry("test", wtp.parse(page.text)))
# PRENDS PAS LE FICHIER AUDIO POUR "LIVE" EN_EN
...@@ -19,7 +19,7 @@ class Sound: ...@@ -19,7 +19,7 @@ class Sound:
if self.accent == None: if self.accent == None:
res = {"url":self.url} res = {"url":self.url}
else: else:
res = {"accent":self.accent, "url":self.url} res = { "accent":self.accent, "url":self.url}
return res return res
class Pronunciation: class Pronunciation:
...@@ -27,6 +27,7 @@ class Pronunciation: ...@@ -27,6 +27,7 @@ class Pronunciation:
self.ipa = None self.ipa = None
self.sounds = [] self.sounds = []
self.accent = None self.accent = None
self.id = None
def set_transcription(self, tscpt): def set_transcription(self, tscpt):
self.ipa = tscpt self.ipa = tscpt
...@@ -42,9 +43,9 @@ class Pronunciation: ...@@ -42,9 +43,9 @@ class Pronunciation:
for s in self.sounds: for s in self.sounds:
snds.append(s.serializable()) snds.append(s.serializable())
if self.accent == None: if self.accent == None:
res = {"transcript":self.ipa, "sounds":snds} res = {"ID":self.id, "transcript":self.ipa, "sounds":snds}
else: else:
res = {"accent":self.accent, "transcript":self.ipa, "sounds":snds} res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds}
return res return res
def __str__(self): def __str__(self):
...@@ -70,7 +71,12 @@ class Pronunciation: ...@@ -70,7 +71,12 @@ class Pronunciation:
####### #######
class Definition: class Definition:
<<<<<<< HEAD
def __init__(self, lang, text, id=None): def __init__(self, lang, text, id=None):
=======
def __init__(self, lang, text, id):
self.id = id
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
self.lang = lang self.lang = lang
self.text = text self.text = text
self.id = id self.id = id
...@@ -81,6 +87,7 @@ class Definition: ...@@ -81,6 +87,7 @@ class Definition:
def __eq__(self, other): def __eq__(self, other):
return self.lang == other.lang and self.text == other.text return self.lang == other.lang and self.text == other.text
<<<<<<< HEAD
def serializable(self, id = True): def serializable(self, id = True):
res = {} res = {}
if id and self.id != None: if id and self.id != None:
...@@ -100,6 +107,21 @@ class Translation(Definition): ...@@ -100,6 +107,21 @@ class Translation(Definition):
class Example: class Example:
def __init__(self, transcript, id=None, source=None, url=None): def __init__(self, transcript, id=None, source=None, url=None):
=======
def serializable(self):
return {"ID":self.id, "lang":self.lang, "definition":self.text}
class Translation():
    """A translation attached to a sense.

    Holds an identifier, a language and the translated text. The language
    and text are optional so that a placeholder translation carrying only
    an identifier can be created.
    """

    def __init__(self, id, lang=None, text=None):
        """Store the identifier, language and text of the translation.

        Args:
            id: identifier of the translation (e.g. ``"t_1_0"``).
            lang: language of the translation, or None when not known yet.
            text: translated text, or None when not known yet.
        """
        self.id = id
        self.lang = lang
        self.text = text

    def set_id(self, id):
        """Replace the identifier of the translation.

        Sense.add_translation calls ``set_id`` on the translations it
        creates; without this method that call raises AttributeError.
        """
        self.id = id

    def serializable(self):
        """Return a dict representation of the translation.

        The identifier is stored under the key ``"ID"``, the same key used
        by the other ``serializable`` methods of this file (the previous
        ``"ID:"`` key carried a stray colon).
        """
        return {"ID": self.id, "lang": self.lang, "translation": self.text}
class Example:
def __init__(self, transcript, id, source=None, url=None):
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
self.text = transcript self.text = transcript
self.source = source self.source = source
self.url = url self.url = url
...@@ -108,6 +130,7 @@ class Example: ...@@ -108,6 +130,7 @@ class Example:
def __eq__(self, other): def __eq__(self, other):
return self.text==other.text and self.source==other.source and self.url==other.url return self.text==other.text and self.source==other.source and self.url==other.url
<<<<<<< HEAD
def set_id(self, id): def set_id(self, id):
self.id = id self.id = id
...@@ -116,6 +139,10 @@ class Example: ...@@ -116,6 +139,10 @@ class Example:
res = {"id":self.id, "example":self.text} res = {"id":self.id, "example":self.text}
else: else:
res = {"example":self.text} res = {"example":self.text}
=======
def serializable(self):
res = {"ID":self.id, "example":self.text}
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
if self.source != None: if self.source != None:
res["source"] = self.source res["source"] = self.source
if self.url != None: if self.url != None:
...@@ -135,21 +162,30 @@ class Sense: ...@@ -135,21 +162,30 @@ class Sense:
def set_domain(self, d): def set_domain(self, d):
self.domain = d self.domain = d
<<<<<<< HEAD
def add_def(self, lang, definition): def add_def(self, lang, definition):
theDef = Definition(lang, definition) theDef = Definition(lang, definition)
if theDef not in self.definitions: if theDef not in self.definitions:
theDef.set_id(f"{self.label}_def{len(self.definitions)}") theDef.set_id(f"{self.label}_def{len(self.definitions)}")
self.definitions.append(theDef) self.definitions.append(theDef)
=======
def add_example(self, transcript, src=None, url=None): def add_def(self, lang, definition, id):
theEx = Example(transcript, src, url) theDef = Definition(lang, definition, id)
if self.definition == None:
self.definition = theDef
elif self.definition != theDef:
raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
def add_example(self, transcript, id, src=None, url=None):
theEx = Example(transcript, id, src, url)
if theEx not in self.examples: if theEx not in self.examples:
theEx.set_id(f"{self.label}_ex{len(self.examples)}") theEx.set_id(f"{self.label}_ex{len(self.examples)}")
self.examples.append(theEx) self.examples.append(theEx)
def add_translation(self, lang, translation): def add_translation(self, id, lang=None, translation=None):
theTranslation = Translation(lang, translation) theTranslation = Translation(id, lang, translation)
if theTranslation not in self.translations: if theTranslation not in self.translations:
theTranslation.set_id(f"{self.label}_trad{len(self.translations)}") theTranslation.set_id(f"{self.label}_trad{len(self.translations)}")
self.translations.append(theTranslation) self.translations.append(theTranslation)
...@@ -180,6 +216,7 @@ class Sense: ...@@ -180,6 +216,7 @@ class Sense:
def serializable(self, key = False): def serializable(self, key = False):
res = {} res = {}
<<<<<<< HEAD
if self.domain != None: if self.domain != None:
res["Domain"] = self.domain res["Domain"] = self.domain
if len(self.definitions) > 0: if len(self.definitions) > 0:
...@@ -198,6 +235,41 @@ class Sense: ...@@ -198,6 +235,41 @@ class Sense:
res["Translations"] = [] res["Translations"] = []
for t in self.translations: for t in self.translations:
res["Translations"].append(t.serializable(key)) res["Translations"].append(t.serializable(key))
=======
if key:
res[self.label]={}
if self.domain != None:
res[self.label]["Domain"] = self.domain
res[self.label]["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res[self.label]["Subsenses"] = []
for t in self.subsenses:
res[self.label]["Subsenses"].append(t.serializable())
if len(self.examples) > 0 :
res[self.label]["Examples"] = []
for e in self.examples:
res[self.label]["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res[self.label]["Translations"] = []
for t in self.translations:
res[self.label]["Translations"].append(t.serializable())
else:
if self.domain != None:
res["Domain"] = self.domain
res["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.label]= t.serializable(key)
if len(self.examples) > 0 :
res["Examples"] = []
for e in self.examples:
res["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res["Translations"] = []
for t in self.translations:
res["Translations"].append(t.serializable())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
return res return res
...@@ -298,9 +370,14 @@ class ParserContext: ...@@ -298,9 +370,14 @@ class ParserContext:
if testNewEntry: if testNewEntry:
self.create_entry() self.create_entry()
#Pb là dedans
def create_entry(self): def create_entry(self):
<<<<<<< HEAD
#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS
res = Entry(self.lemma, self.lang) res = Entry(self.lemma, self.lang)
=======
res = Entry(self.lemma)
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
for l in self.context: for l in self.context:
#print(l.keys()) #print(l.keys())
if "pro" in l.keys(): if "pro" in l.keys():
...@@ -403,7 +480,8 @@ class Wikstraktor: ...@@ -403,7 +480,8 @@ class Wikstraktor:
pos = self.process_POS(stitle) pos = self.process_POS(stitle)
if pos != None : if pos != None :
self.parserContext.set_top_entry_info('POS', pos, False) self.parserContext.set_top_entry_info('POS', pos, False)
self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob
# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
res = len(self.parserContext.entries) res = len(self.parserContext.entries)
if res > 0: if res > 0:
for e in self.parserContext.entries: for e in self.parserContext.entries:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment