Skip to content
Snippets Groups Projects
Commit 80d83b85 authored by Mathieu Loiseau
Browse files

On se rapproche

parent e1bf5ca3
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@ import importlib
import json
from wikstraktor_version import version as the_version
from wikstraklog import Wikstraklog
import re
def get_list_string_level(wikitext):
list_chars = {"*", "#", ":"}
......@@ -35,10 +36,26 @@ class SubInfo:
self.__class__.inc_n_id()
return self.id
def replace_src_in_id(self, former_src, new_src):
    """Rewrite the source index embedded in this object's identifier.

    The identifier is expected to start with a run of word characters and
    dots followed by ``-<source index>``. When that index is exactly
    ``former_src`` it is replaced by ``new_src``; otherwise the identifier
    is left untouched.

    Args:
        former_src: source index currently present in the identifier.
        new_src: source index to write in its place.

    Returns:
        The (possibly unchanged) identifier, or ``None`` when the
        identifier or either argument is ``None``.
    """
    # Caution: if identifiers are ever cleaned up so that a source appears
    # everywhere, this will have to change.
    if self.id is None or former_src is None or new_src is None:
        return None
    # The negative lookahead keeps source 1 from matching the start of
    # source 12.
    # NOTE(review): assumes the character following the source index is
    # never a digit -- confirm against SubInfo.set_id.
    pattern = r'^([\w\.]+)-' + re.escape(str(former_src)) + r'(?!\d)'
    # A callable replacement avoids any backslash/group interpretation of
    # new_src.
    self.id = re.sub(pattern, lambda m: f"{m.group(1)}-{new_src}", self.id)
    return self.id
def get_src_from_id(self):
    """Extract the source index embedded in this object's identifier.

    Returns:
        The one- or two-digit number that follows the ``-`` ending the
        leading run of word characters and dots, as an ``int``; ``None``
        when there is no identifier or it does not have that shape.
    """
    if self.id is None:
        return None
    found = re.match(r'^[\w\.]+-(\d{1,2})', self.id)
    return int(found.group(1)) if found else None
def serializable(self, prefix = None):
res = {}
if self.set_id(prefix) != None:
res["id"] = self.id
if prefix != None:
res["id"] = self.set_id(prefix)
return res
......@@ -83,10 +100,10 @@ class Pronunciation(SubInfo):
self.sounds.append(Sound(url,accent))
def serializable(self, prefix = None):
res = super().serializable(prefix)
snds = []
for s in self.sounds:
snds.append(s.serializable())
res = super().serializable(prefix)
res['transcript'] = self.ipa
if self.has_accents():
res['accents'] = list(self.accents)
......@@ -166,12 +183,12 @@ class Example(SubInfo):
return res
class Sense(SubInfo):
prfx = ""
prfx = "s"
def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None):
print(prefix)##
super().__init__(prefix)
self.lang = lang
self.label = None
self.set_id(prefix)
#On réinitialise les identifiants des sous-éléments
if not isinstance(self, SubSense):
Definition.reset()
......@@ -217,14 +234,8 @@ class Sense(SubInfo):
def metadata_exists(self, key):
    """Tell whether ``key`` is one of the keys of ``self.metadata``."""
    known_keys = self.metadata.keys()
    return key in known_keys
def set_id(self, prefix=None):
if prefix != None and self.label == None:
self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens
self.__class__.inc_n_id()
return self.label
def get_id(self):
return f"{self.lang}.{self.label}"
return self.id
def set_domain(self, d):
self.domain = d
......@@ -235,14 +246,15 @@ class Sense(SubInfo):
else:
theDef = Definition(lang, definition)
if theDef != None and theDef not in self.definitions:
theDef.set_id(self.set_id())
print("def set id", self.get_id())##
theDef.set_id(self.get_id())
self.definitions.append(theDef)
def add_example(self, transcript, src=None, url=None, prefix=None):
try:
theEx = Example(transcript, src, url, prefix)
if theEx != None and theEx not in self.examples:
theEx.set_id(self.set_id())
theEx.set_id(self.get_id())
self.examples.append(theEx)
except ValueError as e:
print(f"Skipped empty example")
......@@ -250,17 +262,17 @@ class Sense(SubInfo):
def add_translation(self, lang=None, translation=None):
theTranslation = Translation(lang, translation)
if theTranslation != None and theTranslation not in self.translations:
theTranslation.set_id(self.set_id())
theTranslation.set_id(self.get_id())
self.translations.append(theTranslation)
def add_subsense(self, subsense):
if self.label!=None:
if self.id!=None:
subsense.set_id(self.set_id())
if subsense not in self.subsenses:
self.subsenses.append(subsense)
def __eq__(self, other):
res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions
res = isinstance(other, self.__class__) and self.id == other.id and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain and len(other.metadata) == len(self.metadata) and other.regions == self.regions
i = 0
while res and i < len(self.examples):
res = self.examples[i] in other.examples
......@@ -289,8 +301,10 @@ class Sense(SubInfo):
i+=1
return res
def serializable(self, prefix = None):
def serializable(self, id = True):
res = {}
if id:
prefix = self.get_id()
if self.domain != None:
res["Domain"] = self.domain
if len(self.regions) > 0:
......@@ -312,7 +326,7 @@ class Sense(SubInfo):
if len(self.subsenses) > 0:
res["Subsenses"] = {}
for t in self.subsenses:
res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix)
res["Subsenses"][t.set_id(self.id)]= t.serializable(prefix)
return res
def __str__(self):
......@@ -320,10 +334,10 @@ class Sense(SubInfo):
class SubSense(Sense):
def set_id(self, prefix=None):
if prefix != None and self.label == None:
self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens
if prefix != None and self.id == None:
self.id = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens
self.__class__.inc_n_id()
return self.label
return self.id
class Entry:
#version_id: unique identifier of the Wiktionary page revision (pywikibot.Page.latest_revision_id)
......@@ -332,8 +346,7 @@ class Entry:
self.lang = lang
#Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile
self.sources = []
self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version})
self.current_source = 0
self.add_source(wiki_lang, version_id, wkskt_version)
self.pronunciations = []
self.pos = None
self.senses = []
......@@ -343,13 +356,24 @@ class Entry:
def set_pos(self, pos):
    """Store ``pos`` in ``self.pos`` (presumably the part of speech)."""
    self.pos = pos
def get_id(self, source_id=0):
#TODO: remplacer un jour le source id par la bonne source
def add_source(self, wiki_lang, version_id, wkskt_version):
    """Register a new source and make it the current one.

    A record holding the three arguments is appended to ``self.sources``
    and ``self.current_source`` is set to the index of that record.
    """
    record = {
        "wiktionary_language": wiki_lang,
        "permanentId": version_id,
        "wikstraktor_version": wkskt_version,
    }
    self.sources.append(record)
    last_index = len(self.sources) - 1
    self.current_source = last_index
def set_current_source(self, src):
    """Store ``src`` in ``self.current_source`` without any validation."""
    self.current_source = src
def get_prefix(self, source_id=-1):
if self.pos != None:
pos = self.pos
pos = "."+self.pos
else:
pos = ""
return f"{self.lang}-{source_id}.{self.lemma}{pos}"
if source_id == -1:
source_id = self.current_source
return f"{self.lang}.{self.lemma}{pos}-{source_id}"
def get_id(self):
    """Build the identifier ``<lang>.<lemma>.<pos>`` from this object's attributes."""
    parts = (self.lang, self.lemma, self.pos)
    return ".".join(f"{part}" for part in parts)
def set_pronunciations(self, pron):
if isinstance(pron, Pronunciation):
......@@ -365,7 +389,7 @@ class Entry:
def add_pronunciation(self, p):
if p not in self.pronunciations:
p.set_id(self.get_id())
p.set_id(self.get_prefix())
self.pronunciations.append(p)
def set_senses(self, senses):
......@@ -377,15 +401,62 @@ class Entry:
def add_sense(self, s):
if s not in self.senses:
s.set_id(self.get_id())
s.set_id(self.get_prefix())
self.senses.append(s)
def is_valid(self):
    """Tell whether the entry has a lemma, a ``pos`` and at least one sense.

    Pronunciations are deliberately not required: validation must work
    for entries that have none.
    """
    if self.lemma is None or self.pos is None:
        return False
    return len(self.senses) > 0
def same(self, other):
    """Tell whether ``other`` stands for the same lemma/language/pos.

    Returns ``False`` when ``other`` is not an instance of this object's
    class; otherwise compares ``lemma``, ``lang`` and ``pos`` in that
    order. Pronunciations and senses are not looked at.
    """
    if not isinstance(other, self.__class__):
        return False
    mine = (self.lemma, self.lang, self.pos)
    theirs = (other.lemma, other.lang, other.pos)
    return mine == theirs
def merge(self, other):
    """Merge the sources, pronunciations and senses of ``other`` into this entry.

    Every source of ``other`` is looked up in ``self.sources`` and added
    when missing; the resulting index mapping is then used to rewrite the
    source index embedded in the identifiers of the pronunciations, senses,
    subsenses, definitions, examples and translations coming from
    ``other`` before they are added to this entry.

    Merging an entry that compares equal to this one is a no-op.

    Args:
        other: the entry whose content is brought into this one.

    Raises:
        TypeError: ``other`` is not an instance of this entry's class.
        ValueError: ``other`` is an entry, but ``self.same(other)`` is
            false.
    """
    if not isinstance(other, self.__class__):
        raise TypeError(f"Entry.merge() error : {other.__class__} object cannot be merged with Entry")
    if not self.same(other):
        # The entry identifier is exposed through get_id().
        raise ValueError(f"Entry.merge() error : {self.get_id()} cannot be merged with {other.get_id()}")
    if self == other:
        # Identical content: nothing to bring over.
        return

    # Highest source index of this entry *before* the merge.
    max_id = len(self.sources) - 1
    # src_map[i] is the index, in self.sources, of other.sources[i].
    src_map = []
    for source in other.sources:
        if source in self.sources:
            src_map.append(self.sources.index(source))
        else:
            self.add_source(source["wiktionary_language"], source["permanentId"], source["wikstraktor_version"])
            src_map.append(self.current_source)

    def needs_remap(src):
        # Guard the lookup so that an index outside of src_map is left
        # alone instead of raising IndexError.
        return src is not None and src < len(src_map) and src_map[src] != src

    def remap_all(items, src):
        # Rewrite the source index of every element of ``items``.
        for item in items:
            item.replace_src_in_id(src, src_map[src])

    for p in other.pronunciations:
        src = p.get_src_from_id()
        # max_id is checked because the same object can be in several
        # places and may already have been modified.
        if needs_remap(src) and src <= max_id:
            p.replace_src_in_id(src, src_map[src])
        self.add_pronunciation(p)

    for s in other.senses:
        src = s.get_src_from_id()
        if needs_remap(src):
            s.replace_src_in_id(src, src_map[src])
            for ss in s.subsenses:
                ss.replace_src_in_id(src, src_map[src])
                remap_all(ss.definitions, src)
                remap_all(ss.examples, src)
                remap_all(ss.translations, src)
            remap_all(s.definitions, src)
            remap_all(s.examples, src)
            remap_all(s.translations, src)
        self.add_sense(s)
def __eq__(self, other):
res = isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses)
res = self.same(other) and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses)
i = 0
while res and i < len(self.senses):
res = self.senses[i] == other.senses[i]
......@@ -400,14 +471,14 @@ class Entry:
res = {}
res['sources'] = self.sources
if id:
id = self.get_id()
res['id'] = id
prefix = self.get_prefix()
res['id'] = self.get_id()
else:
id == None
prefix == None
res[self.lemma] = {"pos":self.pos}
res[self.lemma]["pronunciations"] = []
for p in self.pronunciations:
res[self.lemma]["pronunciations"].append(p.serializable(id))
res[self.lemma]["pronunciations"].append(p.serializable(prefix))
res[self.lemma]["senses"] = {}
for s in self.senses:
res[self.lemma]["senses"][s.get_id()]=s.serializable(id)
......@@ -522,10 +593,10 @@ class ParserContext:
class Wikstraktor:
@classmethod
def get_instance(cls, wiki_language, entry_language, existing_entries=None):
def get_instance(cls, wiki_language, entry_language):
try:
m_name = f"{wiki_language}_{entry_language}".capitalize()
instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")(existing_entries)
instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")()
instance.version = the_version
instance.log = Wikstraklog(the_version, entry_language, wiki_language)
except ModuleNotFoundError:
......@@ -533,11 +604,8 @@ class Wikstraktor:
instance = None
return instance
def __init__(self, existing_entries=None):
if existing_entries = None:
self.entries = []
else:
self.entries = existing_entries
def __init__(self):
self.entries = {}
self.pwb = pywikibot
self.wtp = wikitextparser
self.parserContext = None
......@@ -551,6 +619,13 @@ class Wikstraktor:
print(f"{file_page_name} does not exist in {self.site}.")
return res
def add_entry(self, e):
    """Store entry ``e`` in ``self.entries`` under its identifier.

    When an entry is already stored under ``e.get_id()``, ``e`` is merged
    into it unless the two compare equal, in which case nothing happens.
    """
    key = e.get_id()
    if key not in self.entries.keys():
        self.entries[key] = e
        return
    stored = self.entries[key]
    if e != stored:
        stored.merge(e)
#retrieves the content of a page and processes it (adding the entries to the list of entries)
#returns the number of entries added
def fetch(self, graphy):
......@@ -603,7 +678,7 @@ class Wikstraktor:
res = len(self.parserContext.entries)
if res > 0:
for e in self.parserContext.entries:
self.entries.append(e)
self.add_entry(e)
return res
def isPro(self, title):
......@@ -722,12 +797,23 @@ class Wikstraktor:
print("Skipped empty definition")
return senses
def __add__(self, other):
    """Bring the entries of another ``Wikstraktor`` into this one.

    Entries of ``other`` whose key is unknown here are stored as is.
    On a key collision the incoming entry is merged into the stored one,
    unless both compare equal: there is then nothing to merge, and
    ``Entry.merge`` treats that case as an error.

    NOTE: this operates in place -- ``self`` is modified and returned, no
    new object is built.

    Args:
        other: the ``Wikstraktor`` whose entries are added.

    Returns:
        ``self``, after the merge.

    Raises:
        TypeError: ``other`` is not a ``Wikstraktor``.
    """
    if not isinstance(other, Wikstraktor):
        raise TypeError(f"Wikstraktor '+' : {other.__class__} object cannot be added to {self.__class__}")
    for key, entry in other.entries.items():
        if key not in self.entries:
            self.entries[key] = entry
        elif entry != self.entries[key]:
            # Same guard as add_entry: only merge entries that differ.
            self.entries[key].merge(entry)
    return self
def __str__(self):
return self.export()
def serialize(self, id=True):
res = []
for e in self.entries:
for e in self.entries.values():
res.append(e.serializable(id))
return res
......@@ -737,12 +823,6 @@ class Wikstraktor:
else:
return json.dumps(self.serialize(id), ensure_ascii=ascii, indent=4)
def export_multi_wikt(serialized, ascii=False, compact=False):
if compact:
return json.dumps(serialized, ensure_ascii=ascii)
else:
return json.dumps(serialized, ensure_ascii=ascii, indent=4)
if __name__ == "__main__":
import argparse
from argparse import RawTextHelpFormatter #pour le formattage de l'aide
......@@ -762,14 +842,18 @@ if __name__ == "__main__":
wiki_languages = args.wiki_language.split("+")
languages = args.language.split("+")
if args.mot != None:
resp = []
resp = None
for w_l in wiki_languages:
for l in languages :
w = Wikstraktor.get_instance(w_l, l)
if w.fetch(args.mot) > 0:
resp += w.serialize(not args.no_id)
if len(resp) > 0 :
resp = export_multi_wikt(resp, args.force_ascii, args.compact)
if resp == None:
resp = w
else:
resp += w
print(resp != None)
if resp != None :
resp = resp.export(not args.no_id, args.force_ascii, args.compact)
if args.destination_file != None:
f = open(args.destination_file, "w")
f.write(resp)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment