Skip to content
Snippets Groups Projects
Commit dd4fe91f authored by Mathieu Loiseau's avatar Mathieu Loiseau
Browse files

commit tout bugué

parent ca4c97d8
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3 #!/usr/bin/env python3
from wikstraktor import Wikstraktor, Pronunciation, Sense from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense
from parsers.en_constants import string_values from parsers.en_constants import string_values
...@@ -17,10 +17,9 @@ class En_en_straktor(Wikstraktor): ...@@ -17,10 +17,9 @@ class En_en_straktor(Wikstraktor):
# TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux
l = proContent.get_lists()[0] l = proContent.get_lists()[0]
i = 0 i = 0
cpt = 0
pronunciations = [] pronunciations = []
while i < len(l.fullitems): while i < len(l.fullitems):
p = Pronunciation() p = Pronunciation(len(pronunciations))
templates = self.wtp.parse(l.fullitems[i]).templates templates = self.wtp.parse(l.fullitems[i]).templates
a = None a = None
for j, t in enumerate(templates): for j, t in enumerate(templates):
...@@ -33,8 +32,6 @@ class En_en_straktor(Wikstraktor): ...@@ -33,8 +32,6 @@ class En_en_straktor(Wikstraktor):
p.add_sound(self.get_file_url(t.arguments[1].value), a) p.add_sound(self.get_file_url(t.arguments[1].value), a)
if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] :
if p.ipa != None or p.accent != None: if p.ipa != None or p.accent != None:
cpt += 1
p.id= f"p_{cpt}"
pronunciations.append(p) pronunciations.append(p)
p = Pronunciation() p = Pronunciation()
i += 1 i += 1
...@@ -51,71 +48,54 @@ class En_en_straktor(Wikstraktor): ...@@ -51,71 +48,54 @@ class En_en_straktor(Wikstraktor):
pos = self.constants['POS'][parsedwikitext] pos = self.constants['POS'][parsedwikitext]
return pos return pos
def process_senses(self, entry, pos, sensesContent): def process_senses(self, sensesContent):
baseId = f"{entry}_{pos}_" print("process_senses")
l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
i = 0 i = 0
senses = [] senses = []
nombreDef = 0
while i < len(l): while i < len(l):
if l[i].pattern == '\\# ': if l[i].pattern == '\\# ':
nombreDef += 1 newSense = Sense(lang=self.entry_language)
<<<<<<< HEAD
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
======= #newSence.add_translation()
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"d_{nombreDef}")
newSense.add_translation(f"t_{nombreDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\#:': elif l[i].pattern == '\\#:':
cptEx=0
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
isEx = 0 isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 : while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
cptEx +=1 newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"e_{nombreDef}_{cptEx}")
isEx = 1 isEx = 1
k += 1 k += 1
if isEx == 0: if isEx == 0:
newSense.add_example(self.wtp.parse(j).plain_text().strip(), f"e_{nombreDef}_{cptEx}") newSense.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense) senses.append(newSense)
cnt = 0 cnt = 0
nombreSousDef = 0
while i < len(l) and l[i].level == 3 : while i < len(l) and l[i].level == 3 :
cnt +=1 cnt +=1
if l[i].pattern == '\\## ': if l[i].pattern == '\\## ':
nombreSousDef += 1 newSense2 = SubSense(lang=self.entry_language)
<<<<<<< HEAD
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
======= #newSense2.add_translation()
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}")
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip(), f"sd_{nombreDef}_{nombreSousDef}")
newSense2.add_translation(f"t_{nombreDef}_{nombreSousDef}_0")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\##:': elif l[i].pattern == '\\##:':
cptex2 = 0
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
isEx = 0 isEx = 0
while k < len(self.wtp.parse(j).templates) and isEx == 0 : while k < len(self.wtp.parse(j).templates) and isEx == 0 :
if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']):
cptex2 +=1 newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value)
newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value, f"se_{nombreDef}_{nombreSousDef}_{cptex2}")
isEx = 1 isEx = 1
k += 1 k += 1
if isEx == 0: if isEx == 0:
newSense2.add_example(self.wtp.parse(j).plain_text().strip(), f"se_{nombreDef}_{nombreSousDef}_{cptex2}") newSense2.add_example(self.wtp.parse(j).plain_text().strip())
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
newSense.add_subsense(newSense2) newSense.add_subsense(newSense2)
i += 1 i += 1
if cnt > 0: if cnt > 0:
i -= 1 i -= 1
i += 1 i += 1
print(f"process_senses done {senses[-1].serializable()}")
return senses return senses
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -56,25 +56,15 @@ class Fr_en_straktor(Wikstraktor): ...@@ -56,25 +56,15 @@ class Fr_en_straktor(Wikstraktor):
print(pos) print(pos)
return pos return pos
def process_senses(self, entry, pos, sensesContent): def process_senses(self, sensesContent):
baseId = f"{entry}_{pos}_"
l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' ))
i = 0 i = 0
senses = [] senses = []
nombreDef = 0
while i < len(l): while i < len(l):
if l[i].pattern == '\\# ': if l[i].pattern == '\\# ':
<<<<<<< HEAD newSense = Sense(lang=self.entry_language)
nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}", self.entry_language)
newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
======= #la version d'enzo ajoute +ieurs defs (for i in l[i].items)
#A revoir ça, très douteux
for nbDef in l[i].items :
nombreDef += 1
newSense = Sense(f"{baseId}{nombreDef}")
newSense.add_def(self.wiki_language, self.wtp.parse(nbDef).plain_text().strip())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
elif l[i].pattern == '\\#:': elif l[i].pattern == '\\#:':
for j in l[i].items: for j in l[i].items:
k = 0 k = 0
...@@ -89,12 +79,10 @@ class Fr_en_straktor(Wikstraktor): ...@@ -89,12 +79,10 @@ class Fr_en_straktor(Wikstraktor):
if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ':
senses.append(newSense) senses.append(newSense)
cnt = 0 cnt = 0
nombreSousDef = 0
while i < len(l) and l[i].level == 3 : while i < len(l) and l[i].level == 3 :
cnt +=1 cnt +=1
if l[i].pattern == '\\## ': if l[i].pattern == '\\## ':
nombreSousDef += 1 newSense2 = SubSense(lang=self.entry_language)
newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}", self.entry_language)
newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip())
elif l[i].pattern == '\\##:': elif l[i].pattern == '\\##:':
for j in l[i].items: for j in l[i].items:
......
...@@ -4,6 +4,37 @@ import wikitextparser ...@@ -4,6 +4,37 @@ import wikitextparser
import importlib import importlib
import json import json
class SubInfo:
    """Base class for pieces of information that carry a generated identifier.

    Identifiers have the form ``<prefix>_<prfx><n>`` where ``prfx`` is a
    class-level tag and ``n`` is a class-level counter (``next_id``).
    """

    # Counter used for the next identifier generated by this class.
    # NOTE(review): the declared start value is 1 but reset() restarts at 0,
    # so numbering differs before and after the first reset — confirm which
    # one is intended.
    next_id = 1
    # Tag inserted between the prefix and the counter; subclasses override it.
    prfx = "err"

    @classmethod
    def inc_n_id(cls):
        """Advance the counter of the class the method is called on."""
        # Augmented assignment on ``cls`` binds the attribute on that class,
        # so a subclass gets its own counter from its first increment on.
        cls.next_id += 1

    @classmethod
    def reset(cls):
        """Restart the counter of the class the method is called on at 0."""
        cls.next_id = 0

    def __init__(self, prefix=None):
        """Create the object and, when ``prefix`` is given, assign its id."""
        self.id = None
        self.set_id(prefix)

    def set_id(self, prefix):
        """Assign the identifier once and return the current identifier.

        The id is only generated when none is set yet and ``prefix`` is not
        ``None``; in every other case the existing value (possibly ``None``)
        is returned unchanged.
        """
        if self.id is None and prefix is not None:
            owner = type(self)
            self.id = f"{prefix}_{owner.prfx}{owner.next_id}"
            owner.inc_n_id()
        return self.id

    def serializable(self, prefix=None):
        """Return a dict holding the id under ``"id"`` when one is available.

        Calling this with a non-``None`` ``prefix`` assigns the id as a side
        effect if the object does not have one yet.
        """
        res = {}
        if self.set_id(prefix) is not None:
            res["id"] = self.id
        return res
####### #######
# Oral # Oral
####### #######
...@@ -19,15 +50,17 @@ class Sound: ...@@ -19,15 +50,17 @@ class Sound:
if self.accent == None: if self.accent == None:
res = {"url":self.url} res = {"url":self.url}
else: else:
res = { "accent":self.accent, "url":self.url} res = {"accent":self.accent, "url":self.url}
return res return res
class Pronunciation: class Pronunciation(SubInfo):
def __init__(self): prfx = "prn"
def __init__(self, prefix = None):
super().__init__(prefix)
self.ipa = None self.ipa = None
self.sounds = [] self.sounds = []
self.accent = None self.accent = None
self.id = None
def set_transcription(self, tscpt): def set_transcription(self, tscpt):
self.ipa = tscpt self.ipa = tscpt
...@@ -38,18 +71,19 @@ class Pronunciation: ...@@ -38,18 +71,19 @@ class Pronunciation:
def add_sound(self, url, accent=None): def add_sound(self, url, accent=None):
self.sounds.append(Sound(url,accent)) self.sounds.append(Sound(url,accent))
def serializable(self): def serializable(self, prefix = None):
snds = [] snds = []
for s in self.sounds: for s in self.sounds:
snds.append(s.serializable()) snds.append(s.serializable())
if self.accent == None: res = super().serializable(prefix)
res = {"ID":self.id, "transcript":self.ipa, "sounds":snds} res['transcript'] = self.ipa
else: if self.accent != None:
res = {"ID":self.id,"accent":self.accent, "transcript":self.ipa, "sounds":snds} res['accent'] = self.accent
res['sounds'] = snds
return res return res
def __str__(self): def __str__(self):
return f"{self.serializable()}" return f"{self.serializable('')}"
def __eq__(self, other): def __eq__(self, other):
res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds)
...@@ -70,127 +104,99 @@ class Pronunciation: ...@@ -70,127 +104,99 @@ class Pronunciation:
# TODO: créer une classe Translations # TODO: créer une classe Translations
####### #######
class Definition: class Definition(SubInfo):
<<<<<<< HEAD prfx = "def"
def __init__(self, lang, text, id=None): key = "definition"
=======
def __init__(self, lang, text, id): def __init__(self, lang, text, prefix=None):
self.id = id super().__init__(prefix)
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
self.lang = lang self.lang = lang
self.text = text self.text = text
self.id = id
def set_id(self, id):
self.id = id
def __eq__(self, other): def __eq__(self, other):
return self.lang == other.lang and self.text == other.text return self.lang == other.lang and self.text == other.text
<<<<<<< HEAD def serializable(self, prefix = None):
def serializable(self, id = True): res = super().serializable(prefix)
res = {}
if id and self.id != None:
res["id"] = self.id
res["lang"] = self.lang res["lang"] = self.lang
res["definition"] = self.text res[self.__class__.key] = self.text
return res return res
class Translation(Definition): class Translation(Definition):
def serializable(self, id = True): prfx = "trad"
res = {} key = "translation"
if id and self.id != None:
res["id"] = self.id
res["lang"] = self.lang
res["translation"] = self.text
return res
class Example: class Example(SubInfo):
def __init__(self, transcript, id=None, source=None, url=None): prfx = "ex"
=======
def serializable(self):
return {"ID":self.id, "lang":self.lang, "definition":self.text}
class Translation(): def __init__(self, transcript, source=None, url=None, prefix=None):
def __init__(self, id, lang=None, text=None): super().__init__(prefix)
self.id = id
self.lang = lang
self.text = text
def serializable(self):
return {"ID:" : self.id, "lang":self.lang, "translation":self.text}
class Example:
def __init__(self, transcript, id, source=None, url=None):
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
self.text = transcript self.text = transcript
self.source = source self.source = source
self.url = url self.url = url
self.id = id
def __eq__(self, other): def __eq__(self, other):
return self.text==other.text and self.source==other.source and self.url==other.url return self.text==other.text and self.source==other.source and self.url==other.url
<<<<<<< HEAD def serializable(self, prefix = None):
def set_id(self, id): res = super().serializable(prefix)
self.id = id res["example"]=self.text
def serializable(self, id = True):
if id:
res = {"id":self.id, "example":self.text}
else:
res = {"example":self.text}
=======
def serializable(self):
res = {"ID":self.id, "example":self.text}
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
if self.source != None: if self.source != None:
res["source"] = self.source res["source"] = self.source
if self.url != None: if self.url != None:
res["url"] = self.url res["url"] = self.url
return res return res
class Sense: class Sense(SubInfo):
def __init__(self, label, lang): prfx = ""
def __init__(self, prefix=None, lang=None):
self.lang = lang self.lang = lang
self.label = lang+"."+label #l'identifiant du sens self.label = None
self.set_id(prefix)
#On réinitialise les identifiants des sous-éléments
if not isinstance(self, SubSense):
Definition.reset()
Example.reset()
Translation.reset()
SubSense.reset()
self.definitions = [] #liste des définitions (elles auront une langue et un texte) self.definitions = [] #liste des définitions (elles auront une langue et un texte)
self.subsenses = [] #liste des sous-définitions (récursif…) self.subsenses = [] #liste des sous-définitions (récursif…)
self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels)
self.translations = [] #liste des traductions dans d'autres langues self.translations = [] #liste des traductions dans d'autres langues
self.domain = None #domaine d'usage du mot dans ce sens self.domain = None #domaine d'usage du mot dans ce sens
def set_id(self, prefix=None):
    """Assign the sense label once and return the current label.

    The label has the form ``<lang>.<prefix>_<n>`` where ``n`` is the class
    counter. It is only generated when ``prefix`` is not ``None`` and no
    label is set yet. ``prefix`` is optional so that the method can also be
    called without argument to read the current label.
    """
    if prefix is not None and self.label is None:
        owner = type(self)
        # The label is the identifier of the sense.
        self.label = f"{self.lang}.{prefix}_{owner.next_id}"
        owner.inc_n_id()
    return self.label
def set_domain(self, d): def set_domain(self, d):
self.domain = d self.domain = d
<<<<<<< HEAD
def add_def(self, lang, definition): def add_def(self, lang, definition):
theDef = Definition(lang, definition) theDef = Definition(lang, definition)
if theDef not in self.definitions: if theDef not in self.definitions:
theDef.set_id(f"{self.label}_def{len(self.definitions)}") theDef.set_id(self.set_id())
self.definitions.append(theDef) self.definitions.append(theDef)
======= def add_example(self, transcript, src=None, url=None, prefix=None):
def add_def(self, lang, definition, id): theEx = Example(transcript, src, url, prefix)
theDef = Definition(lang, definition, id)
if self.definition == None:
self.definition = theDef
elif self.definition != theDef:
raise ValueError(f"Superposition de deux définitions:\n\t{self.definition}\nremplacée par\n\t{theDef}")
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
def add_example(self, transcript, id, src=None, url=None):
theEx = Example(transcript, id, src, url)
if theEx not in self.examples: if theEx not in self.examples:
theEx.set_id(f"{self.label}_ex{len(self.examples)}") theEx.set_id(self.set_id())
self.examples.append(theEx) self.examples.append(theEx)
def add_translation(self, id, lang=None, translation=None): def add_translation(self, lang=None, translation=None):
theTranslation = Translation(id, lang, translation) theTranslation = Translation(lang, translation)
if theTranslation not in self.translations: if theTranslation not in self.translations:
theTranslation.set_id(f"{self.label}_trad{len(self.translations)}") theTranslation.set_id(self.set_id())
self.translations.append(theTranslation) self.translations.append(theTranslation)
def add_subsense(self, subsense): def add_subsense(self, subsense):
if self.label!=None:
subsense.set_id(self.set_id())
if subsense not in self.subsenses: if subsense not in self.subsenses:
self.subsenses.append(subsense) self.subsenses.append(subsense)
...@@ -214,90 +220,88 @@ class Sense: ...@@ -214,90 +220,88 @@ class Sense:
i+=1 i+=1
return res return res
def serializable(self, key = False): def serializable(self, prefix = None):
res = {} res = {}
<<<<<<< HEAD
if self.domain != None: if self.domain != None:
res["Domain"] = self.domain res["Domain"] = self.domain
if len(self.definitions) > 0: if len(self.definitions) > 0:
res["Definitions"] = [] res["Definitions"] = []
for d in self.definitions: for d in self.definitions:
res["Definitions"].append(d.serializable(key)) res["Definitions"].append(d.serializable(prefix))
if len(self.subsenses) > 0: if len(self.subsenses) > 0:
res["Subsenses"] = {} res["Subsenses"] = {}
for t in self.subsenses: for t in self.subsenses:
res["Subsenses"][t.label]= t.serializable(key) res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
if len(self.examples) > 0 : if len(self.examples) > 0 :
res["Examples"] = [] res["Examples"] = []
for e in self.examples: for e in self.examples:
res["Examples"].append(e.serializable(key)) res["Examples"].append(e.serializable(prefix))
if len(self.translations) > 0: if len(self.translations) > 0:
res["Translations"] = [] res["Translations"] = []
for t in self.translations: for t in self.translations:
res["Translations"].append(t.serializable(key)) res["Translations"].append(t.serializable(prefix))
=======
if key:
res[self.label]={}
if self.domain != None:
res[self.label]["Domain"] = self.domain
res[self.label]["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res[self.label]["Subsenses"] = []
for t in self.subsenses:
res[self.label]["Subsenses"].append(t.serializable())
if len(self.examples) > 0 :
res[self.label]["Examples"] = []
for e in self.examples:
res[self.label]["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res[self.label]["Translations"] = []
for t in self.translations:
res[self.label]["Translations"].append(t.serializable())
else:
if self.domain != None:
res["Domain"] = self.domain
res["Definition"] = self.definition.serializable()
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.label]= t.serializable(key)
if len(self.examples) > 0 :
res["Examples"] = []
for e in self.examples:
res["Examples"].append(e.serializable())
#if len(self.translations) > 0:
res["Translations"] = []
for t in self.translations:
res["Translations"].append(t.serializable())
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
return res return res
class SubSense(Sense):
    """A sense nested under another sense; only the label format differs."""

    def set_id(self, prefix=None):
        """Assign the sub-sense label once and return the current label.

        The label has the form ``<prefix>.<n>`` where ``n`` is the class
        counter. It is only generated when ``prefix`` is not ``None`` and no
        label is set yet. ``prefix`` is optional so that the method can also
        be called without argument to read the current label.
        """
        if prefix is not None and self.label is None:
            owner = type(self)
            # The label is the identifier of the sub-sense.
            self.label = f"{prefix}.{owner.next_id}"
            owner.inc_n_id()
        return self.label
class Entry: class Entry:
def __init__(self, lemma, lang): #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id)
def __init__(self, lemma, lang, wiki_lang, version_id):
self.lemma = lemma self.lemma = lemma
self.lang = lang self.lang = lang
#Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile
self.sources = []
self.sources.append({wiki_lang:version_id})
self.current_source = 0
self.pronunciations = [] self.pronunciations = []
self.pos = None self.pos = None
self.senses = [] self.senses = []
#l'identifiant unique de la version de la page du wiktionnaire
Sense.reset()
def set_pos(self, pos):
    """Store ``pos`` as the part of speech of this entry."""
    self.pos = pos
def get_id(self, source_id=0):
if self.pos !=None:
pos = "_"+self.pos
else:
pos = ""
return f"{self.lang}-{source_id}.{self.lemma}{pos}"
def set_pronunciations(self, pron): def set_pronunciations(self, pron):
if isinstance(pron, Pronunciation): if isinstance(pron, Pronunciation):
self.pronunciations.append(pron) self.add_pronunciation(pron)
elif type(pron) == list: elif type(pron) == list:
for p in pron: for p in pron:
if isinstance(p, Pronunciation): if isinstance(p, Pronunciation):
self.pronunciations.append(p) self.add_pronunciation(p)
else: else:
raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).") raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).")
else: else:
raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")
def set_pos(self, pos): def add_pronunciation(self, p):
self.pos = pos if p not in self.pronunciations:
p.set_id(self.get_id())
self.pronunciations.append(p)
def set_senses(self, senses):
    """Add every sense of ``senses`` to this entry through ``add_sense``.

    Raises:
        ValueError: if an item of ``senses`` is not a ``Sense`` instance.
    """
    # Iterate over the argument: looping over self.senses (initially empty)
    # would never add anything.
    for s in senses:
        if not isinstance(s, Sense):
            raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({s.__class__.__name__}).")
        self.add_sense(s)
def add_sense(self, s):
if s not in self.senses:
s.set_id(self.get_id())
self.senses.append(s)##ICITE
def is_valid(self): def is_valid(self):
return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0
...@@ -314,16 +318,22 @@ class Entry: ...@@ -314,16 +318,22 @@ class Entry:
i += 1 i += 1
return res return res
def serializable(self, id = False): def serializable(self, id=True):
res = {} res = {}
res['id'] = self.lemma+"_"+self.pos+"."+self.lang res['sources'] = self.sources
if id:
id = self.get_id()
res['id'] = id
else:
id == None
res[self.lemma] = {"pos":self.pos} res[self.lemma] = {"pos":self.pos}
res[self.lemma]["pronunciations"] = [] res[self.lemma]["pronunciations"] = []
for p in self.pronunciations: for p in self.pronunciations:
res[self.lemma]["pronunciations"].append(p.serializable()) res[self.lemma]["pronunciations"].append(p.serializable(id))
res[self.lemma]["senses"] = {} res[self.lemma]["senses"] = {}
for s in self.senses: for s in self.senses:
res[self.lemma]["senses"][s.label]=s.serializable(id) res[self.lemma]["senses"][s.get_id()]=s.serializable(id)
print(f"Entry:{res}")##
return res return res
def __str__(self): def __str__(self):
...@@ -335,9 +345,11 @@ class Entry: ...@@ -335,9 +345,11 @@ class Entry:
return res return res
class ParserContext: class ParserContext:
def __init__(self, entry, lang): def __init__(self, entry, lang, wiki_lang, version_id):
self.lemma = entry self.lemma = entry
self.lang = lang self.lang = lang
self.wiki_lang = wiki_lang
self.version_id = version_id
self.context = [] self.context = []
self.entries = [] self.entries = []
...@@ -372,12 +384,8 @@ class ParserContext: ...@@ -372,12 +384,8 @@ class ParserContext:
#Pb là dedans #Pb là dedans
def create_entry(self): def create_entry(self):
<<<<<<< HEAD
#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS
res = Entry(self.lemma, self.lang) res = Entry(self.lemma, self.lang, self.wiki_lang, self.version_id)
=======
res = Entry(self.lemma)
>>>>>>> 7b7eed90f573ec80eaf21b87b3e6bf327e516dbe
for l in self.context: for l in self.context:
#print(l.keys()) #print(l.keys())
if "pro" in l.keys(): if "pro" in l.keys():
...@@ -389,6 +397,7 @@ class ParserContext: ...@@ -389,6 +397,7 @@ class ParserContext:
if "senses" in l.keys(): if "senses" in l.keys():
res.set_senses(l['senses']) res.set_senses(l['senses'])
# TODO: Ajouter les autres types # TODO: Ajouter les autres types
print(res)
if res.is_valid() and res not in self.entries: if res.is_valid() and res not in self.entries:
self.entries.append(res) self.entries.append(res)
else: else:
...@@ -452,11 +461,11 @@ class Wikstraktor: ...@@ -452,11 +461,11 @@ class Wikstraktor:
if not found: if not found:
i += 1 i += 1
if found: if found:
nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections) nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)#self.wtp.parse(s.contents).sections)
return nb_entries_added return nb_entries_added
def parse(self, entry, sections): def parse(self, entry, v_id, sections):
self.parserContext = ParserContext(entry, self.entry_language) self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id)
for s in sections: for s in sections:
if s.title != None : if s.title != None :
#handle wiki context #handle wiki context
...@@ -480,7 +489,7 @@ class Wikstraktor: ...@@ -480,7 +489,7 @@ class Wikstraktor:
pos = self.process_POS(stitle) pos = self.process_POS(stitle)
if pos != None : if pos != None :
self.parserContext.set_top_entry_info('POS', pos, False) self.parserContext.set_top_entry_info('POS', pos, False)
self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) #cette ligne le prob self.parserContext.set_top_entry_info('senses', self.process_senses(self.wtp.parse(s.contents))) #cette ligne le prob
# self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus. # self.parserContext.entries augmente pas même lorsque qu'on entre dans le if au dessus.
res = len(self.parserContext.entries) res = len(self.parserContext.entries)
if res > 0: if res > 0:
...@@ -512,7 +521,7 @@ class Wikstraktor: ...@@ -512,7 +521,7 @@ class Wikstraktor:
def process_etymology(self, parsedwikitext): def process_etymology(self, parsedwikitext):
pass#in subclass pass#in subclass
def process_senses(self, entry, pos, parsedwikitext): def process_senses(self, parsedwikitext):
pass#in subclass pass#in subclass
def __str__(self): def __str__(self):
...@@ -522,6 +531,7 @@ class Wikstraktor: ...@@ -522,6 +531,7 @@ class Wikstraktor:
res = [] res = []
for e in self.entries: for e in self.entries:
res.append(e.serializable(id)) res.append(e.serializable(id))
print(f"Export{res}")##
if compact: if compact:
return json.dumps(res, ensure_ascii=ascii) return json.dumps(res, ensure_ascii=ascii)
else: else:
...@@ -548,6 +558,8 @@ if __name__ == "__main__": ...@@ -548,6 +558,8 @@ if __name__ == "__main__":
resp = None resp = None
if w.fetch(args.mot) > 0: if w.fetch(args.mot) > 0:
resp = w.export(not args.no_id, args.force_ascii, args.compact) resp = w.export(not args.no_id, args.force_ascii, args.compact)
print(w)##
print(resp)##
if args.destination_file != None: if args.destination_file != None:
f = open(args.destination_file, "w") f = open(args.destination_file, "w")
f.write(resp) f.write(resp)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment