From 6081dccb19d28b14a167a21430fb9255a61c3564 Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Wed, 22 Mar 2023 18:05:20 +0100 Subject: [PATCH] Store sources --- .gitignore | 1 + README.md | 2 ++ parsers/en_en.py | 1 - setup.py | 8 ++++++++ wikstraktor.py | 15 +++++++++------ 5 files changed, 20 insertions(+), 7 deletions(-) create mode 100755 setup.py diff --git a/.gitignore b/.gitignore index 69e9892..7e96d59 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ KNM.csv .~lock* *.json *.lwp +wikstraktor_version.py wikstraktorenv diff --git a/README.md b/README.md index 84975f0..f81037f 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,10 @@ This project does depend on python packages. (maybe to be replaced by an automation of some sort, using a virtual environment might be better, see [server version](#wikstraktor-server)) * [```pip install pywikibot```](https://pypi.org/project/pywikibot/) * [```pip install wikitextparser```](https://pypi.org/project/wikitextparser/) +* [```pip install gitpython```](https://gitpython.readthedocs.io/en/stable/) * [```pip install importlib```](https://pypi.org/project/importlib/) _Optional (for python 2.*, not tested)_ +* run ``./setup.py`` (required: it generates `wikstraktor_version.py`, which holds the git commit hash of wikstraktor that is recorded in the wiktionary extracts) ### Wikstraktor Server If you want wikstraktor as a server, you need to install [flask](https://flask.palletsprojects.com/en/2.0.x/installation/) and [flask-cors](https://flask-cors.readthedocs.io/en/latest/) — to allow other domains to query —, and best practice is to do so in a [virtual environment](https://docs.python.org/3/library/venv.html#module-venv). 
diff --git a/parsers/en_en.py b/parsers/en_en.py index cf93078..d840524 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -56,7 +56,6 @@ class En_en_straktor(Wikstraktor): if l[i].pattern == '\\# ': theDef = self.wtp.parse(l[i].items[0]).plain_text().strip() if theDef != "": - print(theDef)# DEBUG: newSense = Sense(self.entry_language, theDef, self.wiki_language) #newSence.add_translation() elif l[i].pattern == '\\#:': diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..cd3142b --- /dev/null +++ b/setup.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +import git +sha = git.Repo(search_parent_directories=True).head.object.hexsha + +v = open("wikstraktor_version.py", "w") +v.write(f"version = '{sha}'") +v.close() diff --git a/wikstraktor.py b/wikstraktor.py index c1e98fb..b567c94 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -274,12 +274,12 @@ class SubSense(Sense): class Entry: #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id) - def __init__(self, lemma, lang, wiki_lang, version_id): + def __init__(self, lemma, lang, wiki_lang, version_id, wkskt_version): self.lemma = lemma self.lang = lang #Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile self.sources = [] - self.sources.append({wiki_lang:version_id}) + self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version}) self.current_source = 0 self.pronunciations = [] self.pos = None @@ -368,11 +368,12 @@ class Entry: return res class ParserContext: - def __init__(self, entry, lang, wiki_lang, version_id): + def __init__(self, entry, lang, wiki_lang, wversion_id, version_id): self.lemma = entry self.lang = lang self.wiki_lang = wiki_lang - self.version_id = version_id + self.page_version_id = wversion_id + self.wikstraktor_version = version_id self.context = [] self.entries = [] @@ -408,7 +409,7 @@ class ParserContext: #Pb là dedans def create_entry(self): 
#Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS - res = Entry(self.lemma, self.lang, self.wiki_lang, self.version_id) + res = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version) for l in self.context: if "pro" in l.keys(): res.set_pronunciations(l['pro']) @@ -446,6 +447,8 @@ class Wikstraktor: try: m_name = f"{wiki_language}_{entry_language}".capitalize() instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")() + from wikstraktor_version import version as v + instance.version = v except ModuleNotFoundError: print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module") instance = None @@ -486,7 +489,7 @@ class Wikstraktor: return nb_entries_added def parse(self, entry, v_id, sections): - self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id) + self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id, self.version) for s in sections: if s.title != None : #handle wiki context -- GitLab