Skip to content
Snippets Groups Projects
Commit dd4fe91f authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

commit tout bugué

parent ca4c97d8
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
from wikstraktor import Wikstraktor, Pronunciation, Sense
from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense
from parsers.en_constants import string_values
......@@ -17,10 +17,9 @@ class En_en_straktor(Wikstraktor):
# TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux
l = proContent.get_lists()[0]
i = 0
cpt = 0
pronunciations = []
while i < len(l.fullitems):
p = Pronunciation()
p = Pronunciation(len(pronunciations))
templates = self.wtp.parse(l.fullitems[i]).templates
a = None
for j, t in enumerate(templates):
......@@ -33,8 +32,6 @@ class En_en_straktor(Wikstraktor):
p.add_sound(self.get_file_url(t.arguments[1].value), a)
if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] :
if p.ipa != None or p.accent != None:
cpt += 1
p.id= f"p_{cpt}"
pronunciations.append(p)
p = Pronunciation()
i += 1
......@@ -51,71 +48,54 @@ class En_en_straktor(Wikstraktor):
pos = self.constants['POS'][parsedwikitext]
return pos
def process_senses(self, entry, pos, sensesContent):
baseId = f"{entry}_{pos}_"
def process_senses(self, sensesContent):
print("process_senses")
l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
i = 0
senses = []
nombreDef = 0
while i < len(l):
if l[i].pattern == '\\# ':
nombreDef += 1
<<<<<<< HEAD
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense = Sense(lang=self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}")
newSense.add_translation(f"t_{nombreDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
#newSence.add_translation()
elif l[i].pattern == '\\#:':
cptEx=0
for j in l[i].items:
k = 0
isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
cptEx +=1
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}")
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
isEx = 1
k += 1
if isEx == 0:
newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}")
newSense.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense)
cnt = 0
nombreSousDef = 0
while i < len(l) and l[i].level == 3 :
cnt +=1
if l[i].pattern == '\\## ':
nombreSousDef += 1
<<<<<<< HEAD
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
newSense2 = SubSense(lang=self.entry_language)
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}")
newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
#newSense2.add_translation()
elif l[i].pattern == '\\##:':
cptex2 = 0
for j in l[i].items:
k = 0
isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
cptex2 +=1
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
isEx = 1
k += 1
if isEx == 0:
newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
newSense2.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
newSense.add_subsense(newSense2)
i += 1
if cnt > 0:
i -= 1
i += 1
print(f"process_senses done {senses[-1].serializable()}")
return senses
if __name__ == "__main__":
......
......@@ -56,25 +56,15 @@ class Fr_en_straktor(Wikstraktor):
print(pos)
return pos
def process_senses(self, entry, pos, sensesContent):
baseId = f"{entry}_{pos}_"
def process_senses(self, sensesContent):
l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
i = 0
senses = []
nombreDef = 0
while i < len(l):
if l[i].pattern == '\\# ':
<<<<<<< HEAD
nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense = Sense(lang=self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
=======
#A revoir ça, très douteux
for nbDef in l[i].items :
nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
#la version d'enzo ajoute +ieurs defs (for i in l[i].items)
elif l[i].pattern == '\\#:':
for j in l[i].items:
k = 0
......@@ -89,12 +79,10 @@ class Fr_en_straktor(Wikstraktor):
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense)
cnt = 0
nombreSousDef = 0
while i < len(l) and l[i].level == 3 :
cnt +=1
if l[i].pattern == '\\## ':
nombreSousDef += 1
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
newSense2 = SubSense(lang=self.entry_language)
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
elif l[i].pattern == '\\##:':
for j in l[i].items:
......
......@@ -4,6 +4,37 @@ import wikitextparser
import importlib
import json
class SubInfo:
    """Base class for sub-elements that carry a generated identifier.

    Identifiers have the form ``<prefix>_<prfx><n>``, where ``prfx`` is a
    class-level tag and ``n`` is the current value of the class-level
    counter ``next_id``.
    """

    # Number used for the next generated identifier.
    # NOTE(review): the counter starts at 1 here, but reset() rewinds it
    # to 0 -- confirm which numbering base is intended.
    next_id = 1
    # Tag inserted between the prefix and the number.
    prfx = "err"

    @classmethod
    def inc_n_id(cls):
        """Advance the identifier counter of ``cls`` by one."""
        # The augmented assignment binds ``next_id`` on ``cls`` itself, so
        # a subclass gets its own counter from its first increment on.
        cls.next_id += 1

    @classmethod
    def reset(cls):
        """Rewind the identifier counter of ``cls`` to 0."""
        cls.next_id = 0

    def __init__(self, prefix=None):
        """Create the element; an id is generated only if ``prefix`` is given."""
        self.id = None
        self.set_id(prefix)

    def set_id(self, prefix):
        """Generate the identifier once, then keep it.

        Nothing happens when ``prefix`` is None or when an id was already
        assigned; the counter is only consumed when an id is created.

        Returns the current id (possibly None).
        """
        if self.id is None and prefix is not None:
            cls = self.__class__
            self.id = f"{prefix}_{cls.prfx}{cls.next_id}"
            cls.inc_n_id()
        return self.id

    def serializable(self, prefix=None):
        """Return a dict holding the id under ``"id"``, or an empty dict.

        If no id exists yet and ``prefix`` is given, the id is generated
        as a side effect of this call.
        """
        res = {}
        if self.set_id(prefix) is not None:
            res["id"] = self.id
        return res
#######
# Oral
#######
......@@ -19,15 +50,17 @@ class Sound:
if self.accent == None:
res = {"url":self.url}
else:
res = { "accent":self.accent, "url":self.url}
res = {"accent":self.accent, "url":self.url}
return res
class Pronunciation:
def __init__(self):
class Pronunciation(SubInfo):
prfx = "prn"
def __init__(self, prefix = None):
super().__init__(prefix)
self.ipa = None
self.sounds = []
self.accent = None
self.id = None
def set_transcription(self, tscpt):
self.ipa = tscpt
......@@ -38,18 +71,19 @@ class Pronunciation:
def add_sound(self, url, accent=None):
self.sounds.append(Sound(url,accent))
def serializable(self):
def serializable(self, prefix = None):
snds = []
for s in self.sounds:
snds.append(s.serializable())
if self.accent == None:
res = {"ID":self.id, "transcript":self.ipa, "sounds":snds}
else:
res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds}
res = super().serializable(prefix)
res['transcript'] = self.ipa
if self.accent != None:
res['accent'] = self.accent
res['sounds'] = snds
return res
def __str__(self):
return f"{self.serializable()}"
return f"{self.serializable('')}"
def __eq__(self, other):
res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds)
......@@ -70,127 +104,99 @@ class Pronunciation:
# TODO: créer une classe Translations
#######
class Definition:
<<<<<<< HEAD
def __init__(self, lang, text, id=None):
=======
def __init__(self, lang, text, id):
self.id = id
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
class Definition(SubInfo):
prfx = "def"
key = "definition"
def __init__(self, lang, text, prefix=None):
super().__init__(prefix)
self.lang = lang
self.text = text
self.id = id
def set_id(self, id):
self.id = id
def __eq__(self, other):
return self.lang == other.lang and self.text == other.text
<<<<<<< HEAD
def serializable(self, id = True):
res = {}
if id and self.id != None:
res["id"] = self.id
def serializable(self, prefix = None):
res = super().serializable(prefix)
res["lang"] = self.lang
res["definition"] = self.text
res[self.__class__.key] = self.text
return res
class Translation(Definition):
def serializable(self, id = True):
res = {}
if id and self.id != None:
res["id"] = self.id
res["lang"] = self.lang
res["translation"] = self.text
return res
prfx = "trad"
key = "translation"
class Example:
def __init__(self, transcript, id=None, source=None, url=None):
=======
def serializable(self):
return {"ID":self.id, "lang":self.lang, "definition":self.text}
class Example(SubInfo):
prfx = "ex"
class Translation():
def __init__(self, id, lang=None, text=None):
self.id = id
self.lang = lang
self.text = text
def serializable(self):
return {"ID:" : self.id, "lang":self.lang, "translation":self.text}
class Example:
def __init__(self, transcript, id, source=None, url=None):
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
def __init__(self, transcript, source=None, url=None, prefix=None):
super().__init__(prefix)
self.text = transcript
self.source = source
self.url = url
self.id = id
def __eq__(self, other):
return self.text==other.text and self.source==other.source and self.url==other.url
<<<<<<< HEAD
def set_id(self, id):
self.id = id
def serializable(self, id = True):
if id:
res = {"id":self.id, "example":self.text}
else:
res = {"example":self.text}
=======
def serializable(self):
res = {"ID":self.id, "example":self.text}
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
def serializable(self, prefix = None):
res = super().serializable(prefix)
res["example"]=self.text
if self.source != None:
res["source"] = self.source
if self.url != None:
res["url"] = self.url
return res
class Sense:
def __init__(self, label, lang):
class Sense(SubInfo):
prfx = ""
def __init__(self, prefix=None, lang=None):
self.lang = lang
self.label = lang+"."+label #l'identifiant du sens
self.label = None
self.set_id(prefix)
#On réinitialise les identifiants des sous-éléments
if not isinstance(self, SubSense):
Definition.reset()
Example.reset()
Translation.reset()
SubSense.reset()
self.definitions = [] #liste des définitions (elles auront une langue et un texte)
self.subsenses = [] #liste des sous-définitions (récursif…)
self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels)
self.translations = [] #liste des traductions dans d'autres langues
self.domain = None #domaine d'usage du mot dans ce sens
def set_id(self, prefix):
if prefix != None and self.label == None:
self.label = f"{self.lang}.{prefix}_{self.__class__.next_id}" #l'identifiant du sens
self.__class__.inc_n_id()
return self.label
def set_domain(self, d):
self.domain = d
<<<<<<< HEAD
def add_def(self, lang, definition):
theDef = Definition(lang, definition)
if theDef not in self.definitions:
theDef.set_id(f"{self.label}_def{len(self.definitions)}")
theDef.set_id(self.set_id())
self.definitions.append(theDef)
=======
def add_def(self, lang, definition, id):
theDef = Definition(lang, definition, id)
if self.definition == None:
self.definition = theDef
elif self.definition != theDef:
raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
def add_example(self, transcript, id, src=None, url=None):
theEx = Example(transcript, id, src, url)
def add_example(self, transcript, src=None, url=None, prefix=None):
theEx = Example(transcript, src, url, prefix)
if theEx not in self.examples:
theEx.set_id(f"{self.label}_ex{len(self.examples)}")
theEx.set_id(self.set_id())
self.examples.append(theEx)
def add_translation(self, id, lang=None, translation=None):
theTranslation = Translation(id, lang, translation)
def add_translation(self, lang=None, translation=None):
theTranslation = Translation(lang, translation)
if theTranslation not in self.translations:
theTranslation.set_id(f"{self.label}_trad{len(self.translations)}")
theTranslation.set_id(self.set_id())
self.translations.append(theTranslation)
def add_subsense(self, subsense):
if self.label!=None:
subsense.set_id(self.set_id())
if subsense not in self.subsenses:
self.subsenses.append(subsense)
......@@ -214,90 +220,88 @@ class Sense:
i+=1
return res
def serializable(self, key = False):
def serializable(self, prefix = None):
res = {}
<<<<<<< HEAD
if self.domain != None:
res["Domain"] = self.domain
if len(self.definitions) > 0:
res["Definitions"] = []
for d in self.definitions:
res["Definitions"].append(d.serializable(key))
res["Definitions"].append(d.serializable(prefix))
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.label]= t.serializable(key)
res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
if len(self.examples) > 0 :
res["Examples"] = []
for e in self.examples:
res["Examples"].append(e.serializable(key))
res["Examples"].append(e.serializable(prefix))
if len(self.translations) > 0:
res["Translations"] = []
for t in self.translations:
res["Translations"].append(t.serializable(key))
=======
if key:
res[self.label]={}
if self.domain != None:
res[self.label]["Domain"] = self.domain
res[self.label]["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res[self.label]["Subsenses"] = []
for t in self.subsenses:
res[self.label]["Subsenses"].append(t.serializable())
if len(self.examples) > 0 :
res[self.label]["Examples"] = []
for e in self.examples:
res[self.label]["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res[self.label]["Translations"] = []
for t in self.translations:
res[self.label]["Translations"].append(t.serializable())
else:
if self.domain != None:
res["Domain"] = self.domain
res["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.label]= t.serializable(key)
if len(self.examples) > 0 :
res["Examples"] = []
for e in self.examples:
res["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res["Translations"] = []
for t in self.translations:
res["Translations"].append(t.serializable())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
res["Translations"].append(t.serializable(prefix))
return res
class SubSense(Sense):
    """Sense variant whose label has the form ``<prefix>.<n>``."""

    def set_id(self, prefix):
        """Assign the label once, then keep it.

        Nothing happens when ``prefix`` is None or when a label was
        already assigned; the class counter is only consumed when a label
        is created.

        Returns the current label (possibly None).
        """
        if prefix is not None and self.label is None:
            cls = self.__class__
            self.label = f"{prefix}.{cls.next_id}"  # the sense identifier
            cls.inc_n_id()
        return self.label
class Entry:
def __init__(self, lemma, lang):
#version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id)
def __init__(self, lemma, lang, wiki_lang, version_id):
self.lemma = lemma
self.lang = lang
#Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile
self.sources = []
self.sources.append({wiki_lang:version_id})
self.current_source = 0
self.pronunciations = []
self.pos = None
self.senses = []
#l'identifiant unique de la version de la page du wiktionnaire
Sense.reset()
def set_pos(self, pos):
self.pos = pos
def get_id(self, source_id=0):
    """Build the identifier string of this entry.

    The result is ``<lang>-<source_id>.<lemma>``, followed by ``_<pos>``
    when a part of speech has been set.

    source_id: value inserted after the language (defaults to 0).
    """
    pos = "_" + self.pos if self.pos is not None else ""
    return f"{self.lang}-{source_id}.{self.lemma}{pos}"
def set_pronunciations(self, pron):
if isinstance(pron, Pronunciation):
self.pronunciations.append(pron)
self.add_pronunciation(pron)
elif type(pron) == list:
for p in pron:
if isinstance(p, Pronunciation):
self.pronunciations.append(p)
self.add_pronunciation(p)
else:
raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).")
raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).")
else:
raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")
raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")
def set_pos(self, pos):
self.pos = pos
def add_pronunciation(self, p):
if p not in self.pronunciations:
p.set_id(self.get_id())
self.pronunciations.append(p)
def set_senses(self, senses):
    """Replace the senses of this entry with the elements of ``senses``.

    Each element goes through add_sense().

    Raises ValueError if an element is not a Sense object.
    """
    # Start from an empty list and iterate over the argument: add_sense()
    # skips anything already present in self.senses, so assigning the
    # list first and then looping over it would never add (nor identify)
    # anything.
    self.senses = []
    for s in senses:
        if not isinstance(s, Sense):
            # The message must describe ``s``: no ``p`` exists in this scope.
            raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({s.__class__.__name__}).")
        self.add_sense(s)
def add_sense(self, s):
if s not in self.senses:
s.set_id(self.get_id())
self.senses.append(s)##ICITE
def is_valid(self):
return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0
......@@ -314,16 +318,22 @@ class Entry:
i += 1
return res
def serializable(self, id = False):
def serializable(self, id=True):
res = {}
res['id'] = self.lemma+"_"+self.pos+"."+self.lang
res['sources'] = self.sources
if id:
id = self.get_id()
res['id'] = id
else:
id == None
res[self.lemma] = {"pos":self.pos}
res[self.lemma]["pronunciations"] = []
for p in self.pronunciations:
res[self.lemma]["pronunciations"].append(p.serializable())
res[self.lemma]["pronunciations"].append(p.serializable(id))
res[self.lemma]["senses"] = {}
for s in self.senses:
res[self.lemma]["senses"][s.label]=s.serializable(id)
res[self.lemma]["senses"][s.get_id()]=s.serializable(id)
print(f"Entry:{res}")##
return res
def __str__(self):
......@@ -335,9 +345,11 @@ class Entry:
return res
class ParserContext:
def __init__(self, entry, lang):
def __init__(self, entry, lang, wiki_lang, version_id):
self.lemma = entry
self.lang = lang
self.wiki_lang = wiki_lang
self.version_id = version_id
self.context = []
self.entries = []
......@@ -372,12 +384,8 @@ class ParserContext:
#Pb là dedans
def create_entry(self):
<<<<<<< HEAD
#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS
res = Entry(self.lemma, self.lang)
=======
res = Entry(self.lemma)
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
res = Entry(self.lemma, self.lang, self.wiki_lang, self.version_id)
for l in self.context:
#print(l.keys())
if "pro" in l.keys():
......@@ -389,6 +397,7 @@ class ParserContext:
if "senses" in l.keys():
res.set_senses(l['senses'])
# TODO: Ajouter les autres types
print(res)
if res.is_valid() and res not in self.entries:
self.entries.append(res)
else:
......@@ -452,11 +461,11 @@ class Wikstraktor:
if not found:
i += 1
if found:
nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections)
nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)#self.wtp.parse(s.contents).sections)
return nb_entries_added
def parse(self, entry, sections):
self.parserContext = ParserContext(entry, self.entry_language)
def parse(self, entry, v_id, sections):
self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id)
for s in sections:
if s.title != None :
#handle wiki context
......@@ -480,7 +489,7 @@ class Wikstraktor:
pos = self.process_POS(stitle)
if pos != None :
self.parserContext.set_top_entry_info('POS', pos, False)
self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob
self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob
# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
res = len(self.parserContext.entries)
if res > 0:
......@@ -512,7 +521,7 @@ class Wikstraktor:
def process_etymology(self, parsedwikitext):
pass#in subclass
def process_senses(self, entry, pos, parsedwikitext):
def process_senses(self, parsedwikitext):
pass#in subclass
def __str__(self):
......@@ -522,6 +531,7 @@ class Wikstraktor:
res = []
for e in self.entries:
res.append(e.serializable(id))
print(f"Export{res}")##
if compact:
return json.dumps(res, ensure_ascii=ascii)
else:
......@@ -548,6 +558,8 @@ if __name__ == "__main__":
resp = None
if w.fetch(args.mot) > 0:
resp = w.export(not args.no_id, args.force_ascii, args.compact)
print(w)##
print(resp)##
if args.destination_file != None:
f = open(args.destination_file, "w")
f.write(resp)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment