diff --git a/public/wikstraktor.sqlite b/public/wikstraktor.sqlite new file mode 100644 index 0000000000000000000000000000000000000000..5e5c396f63d579f490be2adce541a0c44166a3a2 Binary files /dev/null and b/public/wikstraktor.sqlite differ diff --git a/src/Manager/WiktionaryManager.php b/src/Manager/WiktionaryManager.php index 9ebdf38610634e44a79b7c3c36218a3069fff804..8eec9ecc77d5a3fc841cc265c9adebf528629dc8 100644 --- a/src/Manager/WiktionaryManager.php +++ b/src/Manager/WiktionaryManager.php @@ -104,23 +104,23 @@ class WiktionaryManager $morphologicalLabels = []; $entryData = []; - $word = array_key_first($wiktionaryData[0]); + $word = array_keys($wiktionaryData[0])[2]; $entryData['Headword'] = $word; $items = []; - foreach ($wiktionaryData as $wordForm) { - if (array_key_first($wordForm) != $word) { + foreach ($wiktionaryData as $pos) { + if (array_keys($pos)[2] != $word) { throw new \Exception(sprintf("Items se rapportant à des mots différents dans les données du wiktionnaire pour le mot %s.", $word)); } - $wordFormData = $wordForm[$word]; + $posData = $pos[$word]; $item = []; - $item['PartOfSpeech'] = $wordFormData['pos']; - $morphologicalLabels[] = $wordFormData['pos']; + $item['PartOfSpeech'] = $posData['pos']; + $morphologicalLabels[] = $posData['pos']; $sense = []; - $sense['Pronunciations'] = $this->getPronunciations($wordFormData['pronunciations']); - $sense['Definitions'] = $this->getDefinitions($wordFormData['senses']); + $sense['Pronunciations'] = $this->getPronunciations($posData['pronunciations']); + $sense['Definitions'] = $this->getDefinitions($posData['senses']); $item['Sense'] = $sense; $items[] = $item; @@ -143,6 +143,9 @@ class WiktionaryManager if (isset($wikPronunciation['accent'])) { $pronunciation['accent'] = $wikPronunciation['accent']; } + if (isset($wikPronunciation['sounds'])) { + $pronunciation['sounds'] = $wikPronunciation['sounds']; + } $result[] = $pronunciation; } @@ -154,12 +157,12 @@ class WiktionaryManager $result = []; foreach 
($wikSenses as $wikSense) { $definition = [ - 'Def' => reset($wikSense)['defs'][0]['definition'], + 'Def' => $wikSense['Definitions'][0]['definition'], ]; // Extraction des exemples $examples = []; - foreach (reset($wikSense)['exs'] as $wikExample) { + foreach ($wikSense['Examples'] ?? [] as $wikExample) { $ex = $wikExample['example']; if ($ex) { $examples[] = $wikExample; diff --git a/src/Wikstraktor/.gitignore b/src/Wikstraktor/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7e96d597ef6d0eeec4bd69d749ba93774f4bc022 --- /dev/null +++ b/src/Wikstraktor/.gitignore @@ -0,0 +1,13 @@ +user-config.py +user-password.py +__pycache__ +apicache-py3 +logs +throttle.ctrl +user_list.py +KNM.csv +.~lock* +*.json +*.lwp +wikstraktor_version.py +wikstraktorenv diff --git a/src/Wikstraktor/README.md b/src/Wikstraktor/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f5420933068aac0e652f2322e1c3a95e55c9d272 --- /dev/null +++ b/src/Wikstraktor/README.md @@ -0,0 +1,84 @@ +wikstraktor +=========== + +A python tool to query the [wiktionary](https://wiktionary.org) and extract [structured lexical data](https://gitlab.liris.cnrs.fr/lex-game/wikstraktor/-/wikis/Entry-structure). + +## Dependencies +This project does depend on python packages. 
+* [``pywikibot``](https://github.com/wikimedia/pywikibot) allows to use the mediawiki API + * [documentation](https://doc.wikimedia.org/pywikibot/stable/api_ref/pywikibot.html) + * [manual](https://www.mediawiki.org/wiki/Manual:Pywikibot) + * [configuration for the wiktionary](https://github.com/wikimedia/pywikibot/blob/master/pywikibot/families/wiktionary_family.py) +* [``wikitextparser``](https://github.com/5j9/wikitextparser) can parse mediawiki pages and extract sections, templates and links + * [documentation](https://wikitextparser.readthedocs.io/en/latest/#api-reference) +* [``importlib``](https://docs.python.org/3/library/importlib.html) : to import parser modules + +## Installation +(maybe to be replaced by an automation of some sort, using a virtual environment might be better, see [server version](#wikstraktor-server)) +* [```pip install pywikibot```](https://pypi.org/project/pywikibot/) +* [```pip install wikitextparser```](https://pypi.org/project/wikitextparser/) +* [```pip install gitpython```](https://gitpython.readthedocs.io/en/stable/) +* [```pip install sqlite3```](https://docs.python.org/3/library/sqlite3.html) Might be provided with python +* [```pip install importlib```](https://pypi.org/project/importlib/) +_Optional (for python 2.*, not tested)_ +* run ``./setup.py`` (used to store wikstraktor version in wiktionary extracts) + +### Wikstraktor Server +If you want wikstraktor as a server, you need to install [flask](https://flask.palletsprojects.com/en/2.0.x/installation/) and [flask-cors](https://flask-cors.readthedocs.io/en/latest/) — to allow other domains to query —, and best practice is to do so in a [virtual environment](https://docs.python.org/3/library/venv.html#module-venv). + +The following commands are extracted from the aforementionned documentation, it is probably more secure to click on the link and follow the modules documentation : +```bash +python3 -m venv wikstraktorenv #create wikstraktorenv environment +. 
wikstraktorenv/bin/activate #activate environment +pip install Flask #install Flask +pip install -U flask-cors #install Flask cors +``` + +## Use +### Wikstraktor +#### Python +```python +from wikstraktor import Wikstraktor +f = Wikstraktor.get_instance('fr', 'en') #create a wikstraktor, + # first parameter is the language of the wiki + # second parameter is the language of the word sought for +f.fetch("blue") #fetch an article +str(f) #convert content to json +``` + +#### Bash +``` +usage: wikstraktor.py [-h] [-l LANGUAGE] [-w WIKI_LANGUAGE] [-m MOT] + [-f DESTINATION_FILE] [-A] [-C] + +Interroger un wiktionnaire + ex : + ‣./wikstraktor.py -m blue + ‣./wikstraktor.py -m blue -f blue.json -A -C + ‣./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C + +options: + -h, --help show this help message and exit + -l LANGUAGE, --language LANGUAGE + la langue du mot + -w WIKI_LANGUAGE, --wiki_language WIKI_LANGUAGE + la langue du wiki + -m MOT, --mot MOT le mot à chercher + -f DESTINATION_FILE, --destination_file DESTINATION_FILE + le fichier dans lequel stocker le résultat + -A, --force_ascii json avec que des caractères ascii + -C, --compact json sans indentation +``` + +### Wikstraktor Server +The server runs by default on port 5000, you can change that in the ```wikstraktor_server_config.py``` file. +```bash +./wikstraktor_server.py +``` +Then there is a very simple API : +* ``GET server_url/search/<word>`` : Searches the word in the default wiktionary +* ``GET server_url/search/<wiktlang>/<wordlang>/<word>`` : Searches the word In wordlang in the wiktlang wiktionary +Both API calls return a json object. 
+ +## Licence +TODO but will be open source diff --git a/src/Wikstraktor/en_constants.py b/src/Wikstraktor/en_constants.py deleted file mode 100644 index c933d6ab9b796b86e53f6b4ad1c9b9e96ea8eeb7..0000000000000000000000000000000000000000 --- a/src/Wikstraktor/en_constants.py +++ /dev/null @@ -1,34 +0,0 @@ -string_values = { - "ety":"Etymology", - "pro":"Pronunciation", - "en":"English", - "fr":"French", - "t_ipa":"IPA", #template for transcription - "t_snd":"audio", #template for audio - "t_acc":"a", #template for accents - "t_deflabel":"lb", - "t_ex":["ux", "usex"], - "POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS - "Adjective":"Adj", - "Adverb":"Adv", - "Ambiposition":"Ambip", - "Article":"Art", - "Circumposition":"Circump", - "Classifier":"Class", - "Conjunction":"Conj", - "Contraction":"Cont", - "Counter":"Count", - "Determiner":"Det", - "Ideophone":"Ideophone", - "Interjection":"Interj", - "Noun":"N", - "Numeral":"Num", - "Participle":"Part", - "Particle":"Particle", - "Postposition":"Postp", - "Preposition":"Prep", - "Pronoun":"Pro", - "Proper noun":"NP", - "Verb":"V" # TODO: compléter - } -} diff --git a/src/Wikstraktor/en_en.py b/src/Wikstraktor/en_en.py deleted file mode 100644 index 549fbcffcc3adcec78c1ba4edc720b7aacbc15d7..0000000000000000000000000000000000000000 --- a/src/Wikstraktor/en_en.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 -from wikstraktor import Wikstraktor, Pronunciation, Sense - -from en_constants import string_values - -debugEty = 0 - -class En_en_straktor(Wikstraktor): - def __init__(self): - super().__init__() - self.wiki_language = "en" - self.entry_language = "en" - self.constants = string_values - self.site = self.pwb.Site(f'wiktionary:en') - - def process_pronunciation(self, proContent): - # TODO: ne marche que pour les listes à 2 niveaux, voir water pour 3 niveaux - l = proContent.get_lists()[0] - i = 0 - pronunciations = [] - while i < len(l.fullitems): - p = Pronunciation() - templates = 
self.wtp.parse(l.fullitems[i]).templates - a = None - for j, t in enumerate(templates): - if (t.normal_name() == self.constants['t_acc'] and templates[j+1].normal_name()!= self.constants['t_acc']): - a = t.arguments[0].value - elif t.normal_name() == self.constants['t_ipa']: - p.set_transcription(t.arguments[1].value) - p.set_accent(a) - elif t.normal_name() == self.constants['t_snd']: - p.add_sound(self.get_file_url(t.arguments[1].value), a) - if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : - if p.ipa != None or p.accent != None: - pronunciations.append(p) - p = Pronunciation() - i += 1 - return pronunciations - - def process_etymology(self, etyContent): - global debugEty - debugEty += 1 - return "Etymology" + str(debugEty) - - def process_POS(self,parsedwikitext): - pos = None - if parsedwikitext in self.constants['POS'].keys(): - pos = self.constants['POS'][parsedwikitext] - return pos - - def process_senses(self, entry, pos, sensesContent): - baseId = f"{entry}_{pos}_" - l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) - i = 0 - senses = [] - nombreDef = 0 - while i < len(l): - if l[i].pattern == '\\# ': - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif l[i].pattern == '\\#:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense) - cnt = 0 - nombreSousDef = 0 - while i < len(l) and l[i].level == 3 : - cnt +=1 - if l[i].pattern == '\\## ': - nombreSousDef += 1 - newSense2 = 
Sense(f"{baseId}{nombreDef}_{nombreSousDef}") - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif l[i].pattern == '\\##:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense2) - i += 1 - if cnt > 0: - i -= 1 - i += 1 - return senses - -if __name__ == "__main__": - ensk = En_en_straktor() - print(ensk.fetch("test"), "entries added") diff --git a/src/Wikstraktor/parsers/Structure_json.json b/src/Wikstraktor/parsers/Structure_json.json index c38503c4148023192a7ae08469ed5bf33adee85b..bb0cd9b8062f6ed235cf06ffe1295b7c73a14031 100644 --- a/src/Wikstraktor/parsers/Structure_json.json +++ b/src/Wikstraktor/parsers/Structure_json.json @@ -15,8 +15,8 @@ "url1":"https://upload.wikimedia.org/wikipedia/commons/1/19/LL-Q1860_%28eng%29-Back_ache-water.wav" } ], - "Senses":[ - { + "Senses":{ + "v1":{ "Translations":[ "translation1", "...", @@ -26,16 +26,16 @@ "Stilles Mineralwasser.jpg", "..." ], - "Definition":"blabla", + "Definition":{"lang":"fr", "definition" : "blabla"}, "Examples":[ "blabla", "blabli", "blablou" ], - "subSense":[ + "SubSenses":[ { - "subdef":"blabla", - "subex":[ + "Definition":{"lang":"en", "definition" : "whatnot"}, + "Examples":[ "subexa", "subexb", "subexz" @@ -43,7 +43,7 @@ } ] } - ] + } } ] } @@ -61,4 +61,3 @@ \"Supplementary field for devs 5\" ... 
\"Supplementary field for devs 10\ */ - diff --git a/src/Wikstraktor/parsers/en_constants.py b/src/Wikstraktor/parsers/en_constants.py index c933d6ab9b796b86e53f6b4ad1c9b9e96ea8eeb7..de100b7f77d7768038dffe54a27f6208c1224c09 100644 --- a/src/Wikstraktor/parsers/en_constants.py +++ b/src/Wikstraktor/parsers/en_constants.py @@ -8,6 +8,9 @@ string_values = { "t_acc":"a", #template for accents "t_deflabel":"lb", "t_ex":["ux", "usex"], + "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns + {"def":"\\#", "ex":"\\#[:;]", "add_subdef":"\\#"} + ], "POS": { #https://en.wiktionary.org/wiki/Wiktionary:POS "Adjective":"Adj", "Adverb":"Adv", diff --git a/src/Wikstraktor/parsers/en_en.py b/src/Wikstraktor/parsers/en_en.py index b31790825d55566011c8c7ed01217b2b85123b81..1a2fab1bad8094859a53f166b88e337f9d18e9a1 100644 --- a/src/Wikstraktor/parsers/en_en.py +++ b/src/Wikstraktor/parsers/en_en.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from wikstraktor import Wikstraktor, Pronunciation, Sense +from wikstraktor import Wikstraktor, Pronunciation, Sense, SubSense from parsers.en_constants import string_values @@ -41,64 +41,13 @@ class En_en_straktor(Wikstraktor): global debugEty debugEty += 1 return "Etymology" + str(debugEty) - + def process_POS(self,parsedwikitext): pos = None if parsedwikitext in self.constants['POS'].keys(): pos = self.constants['POS'][parsedwikitext] return pos - def process_senses(self, entry, pos, sensesContent): - baseId = f"{entry}_{pos}_" - l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) - i = 0 - senses = [] - nombreDef = 0 - while i < len(l): - if l[i].pattern == '\\# ': - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif l[i].pattern == '\\#:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() 
in self.constants['t_ex']): - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense) - cnt = 0 - nombreSousDef = 0 - while i < len(l) and l[i].level == 3 : - cnt +=1 - if l[i].pattern == '\\## ': - nombreSousDef += 1 - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif l[i].pattern == '\\##:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense2) - i += 1 - if cnt > 0: - i -= 1 - i += 1 - return senses - if __name__ == "__main__": ensk = En_en_straktor() print(ensk.fetch("test"), "entries added") diff --git a/src/Wikstraktor/parsers/fr_constants.py b/src/Wikstraktor/parsers/fr_constants.py index cbf821283a8791bf4adcc9efbed37903fec2ccae..73d68129734122ee52b397f66357b76bceb7452d 100644 --- a/src/Wikstraktor/parsers/fr_constants.py +++ b/src/Wikstraktor/parsers/fr_constants.py @@ -9,7 +9,9 @@ string_values = { "t_ipa":"pron", #template for transcription "t_snd":"écouter", #template for audio "t_acc":["US", "UK"], #template for accents - +"sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns + {"def":"\\#", "ex":"\\#\\*", "add_subdef":"\\#"} +], "POS":{ "adjectif":["adjectif","adjectif qualificatif","adj"], "adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"], @@ -77,5 +79,5 @@ 
string_values = { "variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"], "verbe pronominal":["verbe pronominal","verb-pr","verbe pr"], "verbe":["verbe","verb"] - } + } } diff --git a/src/Wikstraktor/parsers/fr_en.py b/src/Wikstraktor/parsers/fr_en.py index 175f22b6c6cc585ab887e1b3964cfb866d8a12b4..91c43743ef70931cdf9db22e27afa6ce7c9bfdd3 100644 --- a/src/Wikstraktor/parsers/fr_en.py +++ b/src/Wikstraktor/parsers/fr_en.py @@ -43,7 +43,7 @@ class Fr_en_straktor(Wikstraktor): global debugEty debugEty += 1 return "Etymology" + str(debugEty) - + def process_POS(self,parsedwikitext): pos = None ik = 0 @@ -53,62 +53,8 @@ class Fr_en_straktor(Wikstraktor): keys = list(self.constants['POS'].keys()) pos = keys[ik] ik += 1 -# print(pos) return pos - def process_senses(self, entry, pos, sensesContent): - baseId = f"{entry}_{pos}_" - l = sensesContent.get_lists(('\\# ', '\\#:','\\## ', '\\##:' )) - i = 0 - senses = [] - nombreDef = 0 - while i < len(l): - if l[i].pattern == '\\# ': - nombreDef += 1 - newSense = Sense(f"{baseId}{nombreDef}") - newSense.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif l[i].pattern == '\\#:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense) - cnt = 0 - nombreSousDef = 0 - while i < len(l) and l[i].level == 3 : - cnt +=1 - if l[i].pattern == '\\## ': - nombreSousDef += 1 - newSense2 = Sense(f"{baseId}{nombreDef}_{nombreSousDef}") - newSense2.add_def(self.wiki_language, self.wtp.parse(l[i].items[0]).plain_text().strip()) - elif 
l[i].pattern == '\\##:': - for j in l[i].items: - k = 0 - isEx = 0 - while k < len(self.wtp.parse(j).templates) and isEx == 0 : - if (self.wtp.parse(j).templates[k].normal_name() in self.constants['t_ex']): - newSense2.add_example(self.wtp.parse(j).templates[0].arguments[-1].value) - isEx = 1 - k += 1 - if isEx == 0: - newSense2.add_example(self.wtp.parse(j).plain_text().strip()) - if i == len(l)-1 or l[i+1].pattern == '\\# ' or l[i+1].pattern == '\\## ': - senses.append(newSense2) - i += 1 - if cnt > 0: - i -= 1 - i += 1 - return senses - if __name__ == "__main__": ensk = Fr_en_straktor() print(ensk.fetch("test"), "entries added") - - diff --git a/src/Wikstraktor/setup.py b/src/Wikstraktor/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..cd3142b21757a640147b51c8ad6c159f0814e1c9 --- /dev/null +++ b/src/Wikstraktor/setup.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +import git +sha = git.Repo(search_parent_directories=True).head.object.hexsha + +v = open("wikstraktor_version.py", "w") +v.write(f"version = '{sha}'") +v.close() diff --git a/src/Wikstraktor/test_wikstraktor.py b/src/Wikstraktor/test_wikstraktor.py new file mode 100644 index 0000000000000000000000000000000000000000..00d62280ee1bf3a8a0904ec8f91e8be141fb94b9 --- /dev/null +++ b/src/Wikstraktor/test_wikstraktor.py @@ -0,0 +1,26 @@ +from wikstraktor import Wikstraktor +if __name__ == "__main__": + #e = Wikstraktor.get_instance('en', "en") + f = Wikstraktor.get_instance('en', 'en') + # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) + # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) + #e.fetch("water") + f.fetch("water") + # print(e.fetch("test"), "entries added") + #print(e) + file_path = 'test.json' + fichier = open(file_path, "w") + #fichier.write(str(f)) + fichier.write(str(f)) + fichier.close() + # site = pywikibot.Site(f'wiktionary:en') + # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav") + # 
print(p) + # if not p.exists(): + # site = pywikibot.Site('commons') + # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat-parent.wav") + # print(p.get_file_url()) + #print(e) + #Entry("test", wtp.parse(page.text))) + + # PRENDS PAS LE FICHIER AUDIO POUR "LIVE" EN_EN diff --git a/src/Wikstraktor/wikstraklog.py b/src/Wikstraktor/wikstraklog.py new file mode 100644 index 0000000000000000000000000000000000000000..239ee013491ab9d840eeeecdabd293aa10750146 --- /dev/null +++ b/src/Wikstraktor/wikstraklog.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +import sqlite3 + +class Wikstraklog: + table_name = "Wikstraklog" + def __init__(self, wikstraktor_version, word_language, wiki_language, file="wikstraktor.sqlite"): + self.__connector__ = sqlite3.connect(file) + self.__cursor__ = self.__connector__.cursor() + if self.__execute__(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{Wikstraklog.table_name}';").fetchone() is None : + self.__execute__(f"""CREATE TABLE "{Wikstraklog.table_name}" ( + "Date" DATETIME DEFAULT CURRENT_TIMESTAMP, + "Wikstraktor_version" TEXT NOT NULL, + "Word_language" TEXT NOT NULL, + "Wiki_language" TEXT NOT NULL, + "Word_form" TEXT NOT NULL, + "Wiki_permanent_id" INTEGER NOT NULL, + "Caller_method" TEXT, + "Content" TEXT +);""") + self.wx_v = wikstraktor_version + self.w_l = word_language + self.wk_l = wiki_language + + def set_context(self, word, permanentId): + self.cur_w = word + self.cur_pid = permanentId + + def __execute__(self, query, params = None): + if params == None: + res = self.__cursor__.execute(query) + else: + res = self.__cursor__.execute(query, params) + self.__connector__.commit() + return res + + def add_log(self, caller, content, word=None, permanentId=None): + if word == None: + word = self.cur_w + if permanentId == None: + permanentId = self.cur_pid + res = self.__execute__(f"INSERT INTO `{Wikstraklog.table_name}` (`Wikstraktor_version`, `Word_language`, `Wiki_language`, `Word_form`, `Wiki_permanent_id`, 
`Caller_method`, `Content`) VALUES (?, ?, ?, ?, ?, ?, ?)", (self.wx_v, self.w_l, self.wk_l, word, permanentId, caller, str(content))) + return res + +if __name__ == "__main__": + from wikstraktor_version import version as the_version + log = Wikstraklog(the_version, "en", "fr") + log.set_context("blue", 123456789) + log.add_log("exampleMethod", "no relevant content") diff --git a/src/Wikstraktor/wikstraktor.py b/src/Wikstraktor/wikstraktor.py index b9780d8e58b3fb8ac2eea8b5a244c75a626375fd..fec1efb131518ec026be156fa3cdd4f42538cf62 100644 --- a/src/Wikstraktor/wikstraktor.py +++ b/src/Wikstraktor/wikstraktor.py @@ -3,6 +3,41 @@ import pywikibot import wikitextparser import importlib import json +from wikstraktor_version import version as the_version +from wikstraklog import Wikstraklog + +#ICITE : fr marche pas, en prend des trucs vides à virer (cf. yellow… def & example) + + +class SubInfo: + next_id = 1 + prfx = "err" + + @classmethod + def inc_n_id(cls): + cls.next_id += 1 + + @classmethod + def reset(cls): + cls.next_id = 0 + + def __init__(self, prefix = None): + self.id = None + self.set_id(prefix) + + def set_id(self, prefix): + if self.id == None and prefix != None: + self.id = f"{prefix}_{self.__class__.prfx}{self.__class__.next_id}" + self.__class__.inc_n_id() + return self.id + + def serializable(self, prefix = None): + res = {} + if self.set_id(prefix) != None: + res["id"] = self.id + return res + + ####### # Oral @@ -13,7 +48,7 @@ class Sound: self.accent = accent def __eq__(self, other): - return self.url == other.url and self.accent == other.accent + return isinstance(other, self.__class__) and self.url == other.url and self.accent == other.accent def serializable(self): if self.accent == None: @@ -22,8 +57,11 @@ class Sound: res = {"accent":self.accent, "url":self.url} return res -class Pronunciation: - def __init__(self): +class Pronunciation(SubInfo): + prfx = "prn" + + def __init__(self, prefix = None): + super().__init__(prefix) self.ipa = None 
self.sounds = [] self.accent = None @@ -37,21 +75,22 @@ class Pronunciation: def add_sound(self, url, accent=None): self.sounds.append(Sound(url,accent)) - def serializable(self): + def serializable(self, prefix = None): snds = [] for s in self.sounds: snds.append(s.serializable()) - if self.accent == None: - res = {"transcript":self.ipa, "sounds":snds} - else: - res = {"accent":self.accent, "transcript":self.ipa, "sounds":snds} + res = super().serializable(prefix) + res['transcript'] = self.ipa + if self.accent != None: + res['accent'] = self.accent + res['sounds'] = snds return res def __str__(self): - return f"{self.serializable()}" + return json.dumps(self.serializable('')) def __eq__(self, other): - res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) + res = isinstance(other, self.__class__) and self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) i = 0 while res and i<len(self.sounds): res = self.sounds[i] == other.sounds[i] @@ -69,66 +108,122 @@ class Pronunciation: # TODO: créer une classe Translations ####### -class Definition: - def __init__(self, lang, text): - self.lang = lang - self.text = text +class Definition(SubInfo): + prfx = "def" + key = "definition" + + def __init__(self, lang, text, prefix=None): + super().__init__(prefix) + if text != "": + self.lang = lang + self.text = text + else: + raise ValueError(f"Definition.__init__: “{text}†empty definition.") def __eq__(self, other): - return self.lang == other.lang and self.text == other.text + return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text - def serializable(self): - return {"lang":self.lang, "definition":self.text} + def serializable(self, prefix = None): + res = super().serializable(prefix) + res["lang"] = self.lang + res[self.__class__.key] = self.text + return res class Translation(Definition): - def serializable(self): - return {"lang":self.lang, 
"translation":self.text} + prfx = "trad" + key = "translation" + +class Example(SubInfo): + prfx = "ex" + + def __init__(self, transcript, source=None, url=None, prefix=None): + super().__init__(prefix) + if transcript != "": + self.text = transcript + self.source = source + self.url = url + else: + raise ValueError(f"Example.__init__: “{transcript}†empty example.") -class Example: - def __init__(self, transcript, source=None, url=None): - self.text = transcript - self.source = source - self.url = url def __eq__(self, other): - return self.text==other.text and self.source==other.source and self.url==other.url + return isinstance(other, self.__class__) and self.text==other.text and self.source==other.source and self.url==other.url - def serializable(self): - res = {"example":self.text} + def serializable(self, prefix = None): + res = super().serializable(prefix) + res["example"]=self.text if self.source != None: res["source"] = self.source if self.url != None: res["url"] = self.url return res -class Sense: - def __init__(self, label): - self.label = label #l'identifiant du sens +class Sense(SubInfo): + prfx = "" + + def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None): + self.lang = lang + self.label = None + self.set_id(prefix) + #On réinitialise les identifiants des sous-éléments + if not isinstance(self, SubSense): + Definition.reset() + Example.reset() + Translation.reset() + SubSense.reset() + self.definitions = [] #liste des définitions (elles auront une langue et un texte) + self.subsenses = [] #liste des sous-définitions (récursif…) self.examples = [] #liste des exemples (un texte obligatoire, source et url sont optionnels) self.translations = [] #liste des traductions dans d'autres langues self.domain = None #domaine d'usage du mot dans ce sens + if definition != None: + try: + self.add_def(wiki_lang, definition) + except ValueError as err: + raise ValueError(f"Sense.__init__() with empty definition\n{err}") + + def set_id(self, 
prefix=None): + if prefix != None and self.label == None: + self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens + self.__class__.inc_n_id() + return self.label + + def get_id(self): + return f"{self.lang}.{self.label}" def set_domain(self, d): self.domain = d def add_def(self, lang, definition): theDef = Definition(lang, definition) - if theDef not in self.definitions: + if theDef != None and theDef not in self.definitions: + theDef.set_id(self.set_id()) self.definitions.append(theDef) - def add_example(self, transcript, src=None, url=None): - theEx = Example(transcript, src, url) - if theEx not in self.examples: - self.examples.append(theEx) - - def add_translation(self, lang, translation): + def add_example(self, transcript, src=None, url=None, prefix=None): + try: + theEx = Example(transcript, src, url, prefix) + if theEx != None and theEx not in self.examples: + theEx.set_id(self.set_id()) + self.examples.append(theEx) + except ValueError as e: + print(f"Skipped empty example") + + def add_translation(self, lang=None, translation=None): theTranslation = Translation(lang, translation) - if theTranslation not in self.translations: + if theTranslation != None and theTranslation not in self.translations: + theTranslation.set_id(self.set_id()) self.translations.append(theTranslation) + def add_subsense(self, subsense): + if self.label!=None: + subsense.set_id(self.set_id()) + if subsense not in self.subsenses: + self.subsenses.append(subsense) + def __eq__(self, other): - res = self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain + res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain i = 0 
while res and i < len(self.examples): res = self.examples[i] in other.examples @@ -141,55 +236,104 @@ class Sense: while res and i < len(self.definitions): res = self.definitions[i] in other.definitions i+=1 + i = 0 + while res and i < len(self.subsenses): + res = self.subsenses[i] in other.subsenses + i+=1 return res - def serializable(self): + def serializable(self, prefix = None): res = {} - res[self.label]={} if self.domain != None: - res[self.label]["domain"] = self.domain - res[self.label]["defs"] = [] - for d in self.definitions: - res[self.label]["defs"].append(d.serializable()) - res[self.label]["exs"] = [] - for e in self.examples: - res[self.label]["exs"].append(e.serializable()) - res[self.label]["trad"] = [] - for t in self.translations: - res[self.label]["trad"].append(t.serializable()) + res["Domain"] = self.domain + if len(self.definitions) > 0: + res["Definitions"] = [] + for d in self.definitions: + res["Definitions"].append(d.serializable(prefix)) + if len(self.subsenses) > 0: + res["Subsenses"] = {} + for t in self.subsenses: + res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix) + if len(self.examples) > 0 : + res["Examples"] = [] + for e in self.examples: + res["Examples"].append(e.serializable(prefix)) + if len(self.translations) > 0: + res["Translations"] = [] + for t in self.translations: + res["Translations"].append(t.serializable(prefix)) return res + def __str__(self): + return json.dumps(self.serializable()) + +class SubSense(Sense): + def set_id(self, prefix=None): + if prefix != None and self.label == None: + self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens + self.__class__.inc_n_id() + return self.label class Entry: - def __init__(self, lemma): + #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id) + def __init__(self, lemma, lang, wiki_lang, version_id, wkskt_version): self.lemma = lemma + self.lang = lang + #Si un jour on mixe +ieurs données 
de plusieurs wiktionnaires, ce sera utile + self.sources = [] + self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version}) + self.current_source = 0 self.pronunciations = [] self.pos = None self.senses = [] + #l'identifiant unique de la version de la page du wiktionnaire + Sense.reset() + + def set_pos(self, pos): + self.pos = pos + + def get_id(self, source_id=0): + #TODO: remplacer un jour le source id par la bonne source + if self.pos != None: + pos = self.pos + else: + pos = "" + return f"{self.lang}-{source_id}.{self.lemma}{pos}" def set_pronunciations(self, pron): if isinstance(pron, Pronunciation): - self.pronunciations.append(pron) + self.add_pronunciation(pron) elif type(pron) == list: for p in pron: if isinstance(p, Pronunciation): - self.pronunciations.append(p) + self.add_pronunciation(p) else: - raise ValueError(f"Entry.set_pronunciation: {p} is not a Pronunciation object ({p.__class__.__name__}).") + raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).") else: - raise ValueError(f"Entry.set_pronunciation: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") + raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") - def set_pos(self, pos): - self.pos = pos + def add_pronunciation(self, p): + if p not in self.pronunciations: + p.set_id(self.get_id()) + self.pronunciations.append(p) def set_senses(self, senses): - self.senses = senses + for s in senses: + if isinstance(s, Sense): + self.add_sense(s) + else: + raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({p.__class__.__name__}).") + + def add_sense(self, s): + if s not in self.senses: + s.set_id(self.get_id()) + self.senses.append(s) def is_valid(self): return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 def __eq__(self, other): - res = self.lemma == 
other.lemma and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) + res = isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) i = 0 while res and i < len(self.senses): res = self.senses[i] == other.senses[i] @@ -200,19 +344,25 @@ class Entry: i += 1 return res - def serializable(self): + def serializable(self, id=True): res = {} + res['sources'] = self.sources + if id: + id = self.get_id() + res['id'] = id + else: + id == None res[self.lemma] = {"pos":self.pos} res[self.lemma]["pronunciations"] = [] for p in self.pronunciations: - res[self.lemma]["pronunciations"].append(p.serializable()) - res[self.lemma]["senses"] = [] + res[self.lemma]["pronunciations"].append(p.serializable(id)) + res[self.lemma]["senses"] = {} for s in self.senses: - res[self.lemma]["senses"].append(s.serializable()) + res[self.lemma]["senses"][s.get_id()]=s.serializable(id) return res def __str__(self): - res = f"{self.lemma} ({self.pos})\n" + res = f"{self.lemma}_{self.lang} ({self.pos})\n" for p in self.pronunciations: res += f"{str(p)}\n" for s in self.senses: @@ -220,8 +370,12 @@ class Entry: return res class ParserContext: - def __init__(self, entry): + def __init__(self, entry, lang, wiki_lang, wversion_id, version_id): self.lemma = entry + self.lang = lang + self.wiki_lang = wiki_lang + self.page_version_id = wversion_id + self.wikstraktor_version = version_id self.context = [] self.entries = [] @@ -237,9 +391,13 @@ class ParserContext: def pop(self, testNewEntry = True): if testNewEntry: - self.create_entry() + self.create_entries() return self.context.pop() + def flush(self): + while len(self.context) > 0: + self.pop(True) + def set_top_wiki(self, wiki_context): if len(self.context) == 0: self.push(wiki_context) @@ -252,26 +410,32 @@ class 
ParserContext: else: self.context[-1][key] = entry_context if testNewEntry: - self.create_entry() + self.create_entries() - def create_entry(self): - #Dans le dictionnaire de keys, il n'y a jamais de senses ou de POS - res = Entry(self.lemma) + def create_entries(self): + #In the key dict there are traits that describe every thing (ety, pro) and different entities (POS:senses) + tmp = {} + res = 0 + pro = None for l in self.context: - #print(l.keys()) - if "pro" in l.keys(): - res.set_pronunciations(l['pro']) - if "ety" in l.keys(): - pass #On ignore l'étymologie pour le moment - if "POS" in l.keys(): - res.set_pos(l['POS']) - if "senses" in l.keys(): - res.set_senses(l['senses']) - # TODO: Ajouter les autres types - if res.is_valid() and res not in self.entries: - self.entries.append(res) - else: - res = None + for k,v in l.items(): + if k == "pro": + pro = v + elif k == "ety" or k == "wiki": + #wiki context is not necessary + pass #On ignore l'étymologie pour le moment + else: + tmp[k]=v + if(pro!=None and len(tmp)>0): + for pos,senses in tmp.items(): + e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version) + e.set_pronunciations(pro) + e.set_pos(pos) + e.set_senses(senses) + #an improvement would be to remove that sense from context, but we test not to add doubles + if e.is_valid() and e not in self.entries: + res += 1 + self.entries.append(e) return res def debug_top(self): @@ -288,6 +452,20 @@ class ParserContext: res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}" return res + def __str__(self): + res = "" + i=0 + for c in self.context: + res += f"====={i}======\n" + for k,v in c.items(): + if k!= "wiki": + res+=f" {k}→{v}\n" + else: + res+=f" {k}→{len(v)}\n" + i+=1 + return res+f"nb of entries: {len(self.entries)}" + + class Wikstraktor: @classmethod @@ -295,6 +473,8 @@ class Wikstraktor: try: m_name = f"{wiki_language}_{entry_language}".capitalize() 
instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")() + instance.version = the_version + instance.log = Wikstraklog(the_version, entry_language, wiki_language) except ModuleNotFoundError: print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module") instance = None @@ -331,11 +511,12 @@ class Wikstraktor: if not found: i += 1 if found: - nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections) + nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)#self.wtp.parse(s.contents).sections) return nb_entries_added - def parse(self, entry, sections): - self.parserContext = ParserContext(entry) + def parse(self, entry, v_id, sections): + self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id, self.version) + self.log.set_context(entry, v_id) for s in sections: if s.title != None : #handle wiki context @@ -343,8 +524,9 @@ class Wikstraktor: self.parserContext.push(s) else: while self.parserContext.get_level() > s.level: - self.parserContext.pop() + self.parserContext.pop(True) self.parserContext.set_top_wiki(s) + #get section title stitle = self.wtp.parse(s.title).templates if stitle == []: stitle = s.title @@ -354,12 +536,12 @@ class Wikstraktor: self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents))) elif self.isEty(stitle): self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents))) -# elif stitle in self.constants['POS'].keys(): else: + #Edit to process other types of sections pos = self.process_POS(stitle) if pos != None : - self.parserContext.set_top_entry_info('POS', pos, False) - self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) + self.parserContext.set_top_entry_info(pos, 
self.process_senses(self.wtp.parse(s.contents))) + self.parserContext.flush() res = len(self.parserContext.entries) if res > 0: for e in self.parserContext.entries: @@ -371,7 +553,6 @@ class Wikstraktor: res = title == self.constants['pro'] else: res = title in self.constants['pro'] - #print(title, res) return res def isEty(self, title): @@ -381,6 +562,7 @@ class Wikstraktor: res = title in self.constants['ety'] return res + #recognizes POS and returns None if it can't def process_POS(self, parsedwikitext): pass#in subclass @@ -390,16 +572,66 @@ class Wikstraktor: def process_etymology(self, parsedwikitext): pass#in subclass - def process_senses(self, entry, pos, parsedwikitext): - pass#in subclass + #can be overloaded + def process_example(self, example_wiki_text): + k = 0 + isEx = 0 + res = None + #process templates + while k < len(self.wtp.parse(example_wiki_text).templates) and isEx == 0 : + if (self.wtp.parse(example_wiki_text).templates[k].normal_name() in self.constants['t_ex']): + res = self.wtp.parse(example_wiki_text).templates[0].arguments[-1].value + isEx = 1 + k += 1 + if isEx == 0: + res = self.wtp.parse(example_wiki_text).plain_text().strip() + return res + + #can be overloaded + def process_definition(self, definition, sub_items, def_level = True): + if def_level: + newSense = Sense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language) + pattern_ex = self.constants['sense_pattern'][0]["ex"] + pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"] + else: + newSense = SubSense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language) + pattern_subdef = None + pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"] + #Process examples + a = 0 + #print(newSense, sub_items)# DEBUG: + for item_list in sub_items: + if item_list.pattern == pattern_ex: + for item in item_list.items: + 
newSense.add_example(self.process_example(item)) + #Si on veut traiter les sous items (ex traductions), on peut utiliser + #item_list.sublists(a) + if def_level and item_list.pattern == pattern_subdef: + for item in item_list.items: + newSense.add_subsense(self.process_definition(item, item_list.sublists(a), False)) + a += 1 + return newSense + + def process_senses(self, sensesContent): + l = sensesContent.get_lists((self.constants['sense_pattern'][0]["def"])) + senses = [] + if len(l) > 1: + self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================") + l = l[0] #l now contains a list of list items + if l.pattern == self.constants['sense_pattern'][0]["def"]: + i = 0 + for item in l.items: + senses.append(self.process_definition(item, l.sublists(i))) + i += 1 + return senses def __str__(self): return self.export() - def export(self, ascii=False, compact=False): + def export(self, id=True, ascii=False, compact=False): res = [] for e in self.entries: - res.append(e.serializable()) + res.append(e.serializable(id)) if compact: return json.dumps(res, ensure_ascii=ascii) else: @@ -412,19 +644,20 @@ if __name__ == "__main__": \033[1m\033[32mex :\033[0m ‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m ‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m - ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""") + ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -n -A -C\033[0m""") parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en") parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en") parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None) parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None) parser.add_argument("-A", 
"--force_ascii", help="json avec que des caractères ascii", action="store_true") parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true") + parser.add_argument("-n", "--no_id", help="json sans id", action="store_true") args = parser.parse_args() if args.mot != None: w = Wikstraktor.get_instance(args.wiki_language, args.language) resp = None if w.fetch(args.mot) > 0: - resp = w.export(args.force_ascii, args.compact) + resp = w.export(not args.no_id, args.force_ascii, args.compact) if args.destination_file != None: f = open(args.destination_file, "w") f.write(resp) diff --git a/src/Wikstraktor/wikstraktor.sqlite b/src/Wikstraktor/wikstraktor.sqlite new file mode 100644 index 0000000000000000000000000000000000000000..b57b31ca0b8ab2c4d6a5cea2b52bfabfea67fc90 Binary files /dev/null and b/src/Wikstraktor/wikstraktor.sqlite differ diff --git a/src/Wikstraktor/wikstraktor_new.py b/src/Wikstraktor/wikstraktor_new.py deleted file mode 100644 index fec1efb131518ec026be156fa3cdd4f42538cf62..0000000000000000000000000000000000000000 --- a/src/Wikstraktor/wikstraktor_new.py +++ /dev/null @@ -1,668 +0,0 @@ -#!/usr/bin/env python3 -import pywikibot -import wikitextparser -import importlib -import json -from wikstraktor_version import version as the_version -from wikstraklog import Wikstraklog - -#ICITE : fr marche pas, en prend des trucs vides à virer (cf. 
yellow… def & example) - - -class SubInfo: - next_id = 1 - prfx = "err" - - @classmethod - def inc_n_id(cls): - cls.next_id += 1 - - @classmethod - def reset(cls): - cls.next_id = 0 - - def __init__(self, prefix = None): - self.id = None - self.set_id(prefix) - - def set_id(self, prefix): - if self.id == None and prefix != None: - self.id = f"{prefix}_{self.__class__.prfx}{self.__class__.next_id}" - self.__class__.inc_n_id() - return self.id - - def serializable(self, prefix = None): - res = {} - if self.set_id(prefix) != None: - res["id"] = self.id - return res - - - -####### -# Oral -####### -class Sound: - def __init__(self, url, accent): - self.url = url - self.accent = accent - - def __eq__(self, other): - return isinstance(other, self.__class__) and self.url == other.url and self.accent == other.accent - - def serializable(self): - if self.accent == None: - res = {"url":self.url} - else: - res = {"accent":self.accent, "url":self.url} - return res - -class Pronunciation(SubInfo): - prfx = "prn" - - def __init__(self, prefix = None): - super().__init__(prefix) - self.ipa = None - self.sounds = [] - self.accent = None - - def set_transcription(self, tscpt): - self.ipa = tscpt - - def set_accent(self, accent): - self.accent = accent - - def add_sound(self, url, accent=None): - self.sounds.append(Sound(url,accent)) - - def serializable(self, prefix = None): - snds = [] - for s in self.sounds: - snds.append(s.serializable()) - res = super().serializable(prefix) - res['transcript'] = self.ipa - if self.accent != None: - res['accent'] = self.accent - res['sounds'] = snds - return res - - def __str__(self): - return json.dumps(self.serializable('')) - - def __eq__(self, other): - res = isinstance(other, self.__class__) and self.ipa == other.ipa and self.accent == other.accent and len(self.sounds)==len(other.sounds) - i = 0 - while res and i<len(self.sounds): - res = self.sounds[i] == other.sounds[i] - i += 1 - return res - -####### -# Metadata -## TODO: -# * POS : 
créer une classe POS avec les traits dépendants (ex: masc en fr) -####### - -####### -# Senses -# TODO: créer une classe Translations -####### - -class Definition(SubInfo): - prfx = "def" - key = "definition" - - def __init__(self, lang, text, prefix=None): - super().__init__(prefix) - if text != "": - self.lang = lang - self.text = text - else: - raise ValueError(f"Definition.__init__: “{text}†empty definition.") - - def __eq__(self, other): - return isinstance(other, self.__class__) and self.lang == other.lang and self.text == other.text - - def serializable(self, prefix = None): - res = super().serializable(prefix) - res["lang"] = self.lang - res[self.__class__.key] = self.text - return res - -class Translation(Definition): - prfx = "trad" - key = "translation" - -class Example(SubInfo): - prfx = "ex" - - def __init__(self, transcript, source=None, url=None, prefix=None): - super().__init__(prefix) - if transcript != "": - self.text = transcript - self.source = source - self.url = url - else: - raise ValueError(f"Example.__init__: “{transcript}†empty example.") - - - def __eq__(self, other): - return isinstance(other, self.__class__) and self.text==other.text and self.source==other.source and self.url==other.url - - def serializable(self, prefix = None): - res = super().serializable(prefix) - res["example"]=self.text - if self.source != None: - res["source"] = self.source - if self.url != None: - res["url"] = self.url - return res - -class Sense(SubInfo): - prfx = "" - - def __init__(self, lang=None, definition=None, wiki_lang=None, prefix=None): - self.lang = lang - self.label = None - self.set_id(prefix) - #On réinitialise les identifiants des sous-éléments - if not isinstance(self, SubSense): - Definition.reset() - Example.reset() - Translation.reset() - SubSense.reset() - - self.definitions = [] #liste des définitions (elles auront une langue et un texte) - self.subsenses = [] #liste des sous-définitions (récursif…) - self.examples = [] #liste des exemples 
(un texte obligatoire, source et url sont optionnels) - self.translations = [] #liste des traductions dans d'autres langues - self.domain = None #domaine d'usage du mot dans ce sens - if definition != None: - try: - self.add_def(wiki_lang, definition) - except ValueError as err: - raise ValueError(f"Sense.__init__() with empty definition\n{err}") - - def set_id(self, prefix=None): - if prefix != None and self.label == None: - self.label = f"{prefix}_{self.__class__.next_id}" #l'identifiant du sens - self.__class__.inc_n_id() - return self.label - - def get_id(self): - return f"{self.lang}.{self.label}" - - def set_domain(self, d): - self.domain = d - - def add_def(self, lang, definition): - theDef = Definition(lang, definition) - if theDef != None and theDef not in self.definitions: - theDef.set_id(self.set_id()) - self.definitions.append(theDef) - - def add_example(self, transcript, src=None, url=None, prefix=None): - try: - theEx = Example(transcript, src, url, prefix) - if theEx != None and theEx not in self.examples: - theEx.set_id(self.set_id()) - self.examples.append(theEx) - except ValueError as e: - print(f"Skipped empty example") - - def add_translation(self, lang=None, translation=None): - theTranslation = Translation(lang, translation) - if theTranslation != None and theTranslation not in self.translations: - theTranslation.set_id(self.set_id()) - self.translations.append(theTranslation) - - def add_subsense(self, subsense): - if self.label!=None: - subsense.set_id(self.set_id()) - if subsense not in self.subsenses: - self.subsenses.append(subsense) - - def __eq__(self, other): - res = isinstance(other, self.__class__) and self.label == other.label and len(self.definitions) == len(other.definitions) and len(self.examples) == len(other.examples) and len(self.translations) == len(other.translations) and self.domain == other.domain - i = 0 - while res and i < len(self.examples): - res = self.examples[i] in other.examples - i+=1 - i = 0 - while res and i < 
len(self.translations): - res = self.translations[i] in other.translations - i+=1 - i = 0 - while res and i < len(self.definitions): - res = self.definitions[i] in other.definitions - i+=1 - i = 0 - while res and i < len(self.subsenses): - res = self.subsenses[i] in other.subsenses - i+=1 - return res - - def serializable(self, prefix = None): - res = {} - if self.domain != None: - res["Domain"] = self.domain - if len(self.definitions) > 0: - res["Definitions"] = [] - for d in self.definitions: - res["Definitions"].append(d.serializable(prefix)) - if len(self.subsenses) > 0: - res["Subsenses"] = {} - for t in self.subsenses: - res["Subsenses"][t.set_id(self.label)]= t.serializable(prefix) - if len(self.examples) > 0 : - res["Examples"] = [] - for e in self.examples: - res["Examples"].append(e.serializable(prefix)) - if len(self.translations) > 0: - res["Translations"] = [] - for t in self.translations: - res["Translations"].append(t.serializable(prefix)) - return res - - def __str__(self): - return json.dumps(self.serializable()) - -class SubSense(Sense): - def set_id(self, prefix=None): - if prefix != None and self.label == None: - self.label = f"{prefix}.{self.__class__.next_id}" #l'identifiant du sens - self.__class__.inc_n_id() - return self.label - -class Entry: - #version_id : l'identifiant unique de la vesion de la page du wiktionnaire (pywikibot.Page.latest_revision_id) - def __init__(self, lemma, lang, wiki_lang, version_id, wkskt_version): - self.lemma = lemma - self.lang = lang - #Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile - self.sources = [] - self.sources.append({"wiktionary_language":wiki_lang,"permanentId":version_id,"wikstraktor_version":wkskt_version}) - self.current_source = 0 - self.pronunciations = [] - self.pos = None - self.senses = [] - #l'identifiant unique de la version de la page du wiktionnaire - Sense.reset() - - def set_pos(self, pos): - self.pos = pos - - def get_id(self, source_id=0): - #TODO: 
remplacer un jour le source id par la bonne source - if self.pos != None: - pos = self.pos - else: - pos = "" - return f"{self.lang}-{source_id}.{self.lemma}{pos}" - - def set_pronunciations(self, pron): - if isinstance(pron, Pronunciation): - self.add_pronunciation(pron) - elif type(pron) == list: - for p in pron: - if isinstance(p, Pronunciation): - self.add_pronunciation(p) - else: - raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).") - else: - raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).") - - def add_pronunciation(self, p): - if p not in self.pronunciations: - p.set_id(self.get_id()) - self.pronunciations.append(p) - - def set_senses(self, senses): - for s in senses: - if isinstance(s, Sense): - self.add_sense(s) - else: - raise ValueError(f"Entry.set_senses: {s} is not a Sense object ({p.__class__.__name__}).") - - def add_sense(self, s): - if s not in self.senses: - s.set_id(self.get_id()) - self.senses.append(s) - - def is_valid(self): - return self.lemma != None and len(self.pronunciations) > 0 and self.pos != None and len(self.senses) > 0 - - def __eq__(self, other): - res = isinstance(other, self.__class__) and self.lemma == other.lemma and self.lang == other.lang and self.pos ==other.pos and len(self.pronunciations) == len(other.pronunciations) and len(self.senses) == len(other.senses) - i = 0 - while res and i < len(self.senses): - res = self.senses[i] == other.senses[i] - i += 1 - i = 0 - while res and i < len(self.pronunciations): - res = self.pronunciations[i] == other.pronunciations[i] - i += 1 - return res - - def serializable(self, id=True): - res = {} - res['sources'] = self.sources - if id: - id = self.get_id() - res['id'] = id - else: - id == None - res[self.lemma] = {"pos":self.pos} - res[self.lemma]["pronunciations"] = [] - for p in self.pronunciations: - res[self.lemma]["pronunciations"].append(p.serializable(id)) - 
res[self.lemma]["senses"] = {} - for s in self.senses: - res[self.lemma]["senses"][s.get_id()]=s.serializable(id) - return res - - def __str__(self): - res = f"{self.lemma}_{self.lang} ({self.pos})\n" - for p in self.pronunciations: - res += f"{str(p)}\n" - for s in self.senses: - res += f"{str(s)}\n" - return res - -class ParserContext: - def __init__(self, entry, lang, wiki_lang, wversion_id, version_id): - self.lemma = entry - self.lang = lang - self.wiki_lang = wiki_lang - self.page_version_id = wversion_id - self.wikstraktor_version = version_id - self.context = [] - self.entries = [] - - def get_level(self): - if len(self.context) == 0: - res = -1 - else: - res = self.context[-1]["wiki"].level - return res - - def push(self, wiki_context): - self.context.append({"wiki":wiki_context}) - - def pop(self, testNewEntry = True): - if testNewEntry: - self.create_entries() - return self.context.pop() - - def flush(self): - while len(self.context) > 0: - self.pop(True) - - def set_top_wiki(self, wiki_context): - if len(self.context) == 0: - self.push(wiki_context) - else: - self.context[-1]['wiki'] = wiki_context - - def set_top_entry_info(self, key, entry_context, testNewEntry=True): - if len(self.context) == 0: - raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.") - else: - self.context[-1][key] = entry_context - if testNewEntry: - self.create_entries() - - def create_entries(self): - #In the key dict there are traits that describe every thing (ety, pro) and different entities (POS:senses) - tmp = {} - res = 0 - pro = None - for l in self.context: - for k,v in l.items(): - if k == "pro": - pro = v - elif k == "ety" or k == "wiki": - #wiki context is not necessary - pass #On ignore l'étymologie pour le moment - else: - tmp[k]=v - if(pro!=None and len(tmp)>0): - for pos,senses in tmp.items(): - e = Entry(self.lemma, self.lang, self.wiki_lang, self.page_version_id, self.wikstraktor_version) - e.set_pronunciations(pro) - 
e.set_pos(pos) - e.set_senses(senses) - #an improvement would be to remove that sense from context, but we test not to add doubles - if e.is_valid() and e not in self.entries: - res += 1 - self.entries.append(e) - return res - - def debug_top(self): - res = "Context: " - if len(self.context) == 0 : - res += "0" - else: - info = "" - for k,v in self.context[-1].items(): - if k != 'wiki': - if info != "": - info += "\n\t\t\t" - info += f"{k} → {str(v)}" - res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}" - return res - - def __str__(self): - res = "" - i=0 - for c in self.context: - res += f"====={i}======\n" - for k,v in c.items(): - if k!= "wiki": - res+=f" {k}→{v}\n" - else: - res+=f" {k}→{len(v)}\n" - i+=1 - return res+f"nb of entries: {len(self.entries)}" - - - -class Wikstraktor: - @classmethod - def get_instance(cls, wiki_language, entry_language): - try: - m_name = f"{wiki_language}_{entry_language}".capitalize() - instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")() - instance.version = the_version - instance.log = Wikstraklog(the_version, entry_language, wiki_language) - except ModuleNotFoundError: - print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module") - instance = None - return instance - - def __init__(self): - self.entries = [] - self.pwb = pywikibot - self.wtp = wikitextparser - self.parserContext = None - - def get_file_url(self, file_page_name): - res = None - try: - f = self.pwb.FilePage(self.site, file_page_name) - res = f.get_file_url() - except pywikibot.exceptions.NoPageError: - print(f"{file_page_name} does not exist in {self.site}.") - return res - - #retrieves the content of a page and processes it (adding the entries to the list of entries) - #returns the number of entries added - def fetch(self, graphy): - nb_entries_added = 0 - page = self.pwb.Page(self.site, graphy) - to_parse = [] - if 
page.text != "": - sections = self.wtp.parse(page.text).sections - found = False - i = 0 - ### find language - while i < len(sections) and not found: - found = sections[i].title != None and sections[i].title.capitalize() == self.constants[self.entry_language] - if not found: - i += 1 - if found: - nb_entries_added = self.parse(page.title(), page.latest_revision_id, sections[i].sections)#self.wtp.parse(s.contents).sections) - return nb_entries_added - - def parse(self, entry, v_id, sections): - self.parserContext = ParserContext(entry, self.entry_language, self.wiki_language, v_id, self.version) - self.log.set_context(entry, v_id) - for s in sections: - if s.title != None : - #handle wiki context - if self.parserContext.get_level() < s.level: - self.parserContext.push(s) - else: - while self.parserContext.get_level() > s.level: - self.parserContext.pop(True) - self.parserContext.set_top_wiki(s) - #get section title - stitle = self.wtp.parse(s.title).templates - if stitle == []: - stitle = s.title - else: - stitle = stitle[0].arguments[0].value - if self.isPro(stitle): - self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents))) - elif self.isEty(stitle): - self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents))) - else: - #Edit to process other types of sections - pos = self.process_POS(stitle) - if pos != None : - self.parserContext.set_top_entry_info(pos, self.process_senses(self.wtp.parse(s.contents))) - self.parserContext.flush() - res = len(self.parserContext.entries) - if res > 0: - for e in self.parserContext.entries: - self.entries.append(e) - return res - - def isPro(self, title): - if type(self.constants['pro']) == str: - res = title == self.constants['pro'] - else: - res = title in self.constants['pro'] - return res - - def isEty(self, title): - if type(self.constants['ety']) == str: - res = title == self.constants['ety'] - else: - res = title in self.constants['ety'] - 
return res - - #recognizes POS and returns None if it can't - def process_POS(self, parsedwikitext): - pass#in subclass - - def process_pronunciation(self, parsedwikitext): - pass#in subclass - - def process_etymology(self, parsedwikitext): - pass#in subclass - - #can be overloaded - def process_example(self, example_wiki_text): - k = 0 - isEx = 0 - res = None - #process templates - while k < len(self.wtp.parse(example_wiki_text).templates) and isEx == 0 : - if (self.wtp.parse(example_wiki_text).templates[k].normal_name() in self.constants['t_ex']): - res = self.wtp.parse(example_wiki_text).templates[0].arguments[-1].value - isEx = 1 - k += 1 - if isEx == 0: - res = self.wtp.parse(example_wiki_text).plain_text().strip() - return res - - #can be overloaded - def process_definition(self, definition, sub_items, def_level = True): - if def_level: - newSense = Sense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language) - pattern_ex = self.constants['sense_pattern'][0]["ex"] - pattern_subdef = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["def"] - else: - newSense = SubSense(self.entry_language, self.wtp.parse(definition).plain_text().strip(), self.wiki_language) - pattern_subdef = None - pattern_ex = self.constants['sense_pattern'][0]["add_subdef"] + self.constants['sense_pattern'][0]["ex"] - #Process examples - a = 0 - #print(newSense, sub_items)# DEBUG: - for item_list in sub_items: - if item_list.pattern == pattern_ex: - for item in item_list.items: - newSense.add_example(self.process_example(item)) - #Si on veut traiter les sous items (ex traductions), on peut utiliser - #item_list.sublists(a) - if def_level and item_list.pattern == pattern_subdef: - for item in item_list.items: - newSense.add_subsense(self.process_definition(item, item_list.sublists(a), False)) - a += 1 - return newSense - - def process_senses(self, sensesContent): - l = 
sensesContent.get_lists((self.constants['sense_pattern'][0]["def"])) - senses = [] - if len(l) > 1: - self.log.add_log("Wikstraktor.process_senses", f"===== WARNING ======\nmore than one sense list, make sure we don't forget anything\nignored lists : \n{l[1:]}\n===================") - l = l[0] #l now contains a list of list items - if l.pattern == self.constants['sense_pattern'][0]["def"]: - i = 0 - for item in l.items: - senses.append(self.process_definition(item, l.sublists(i))) - i += 1 - return senses - - def __str__(self): - return self.export() - - def export(self, id=True, ascii=False, compact=False): - res = [] - for e in self.entries: - res.append(e.serializable(id)) - if compact: - return json.dumps(res, ensure_ascii=ascii) - else: - return json.dumps(res, ensure_ascii=ascii, indent=4) - -if __name__ == "__main__": - import argparse - from argparse import RawTextHelpFormatter #pour le formattage de l'aide - parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire - \033[1m\033[32mex :\033[0m - ‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m - ‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m - ‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -n -A -C\033[0m""") - parser.add_argument("-l", "--language", help="la langue du mot", type=str, default = "en") - parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default = "en") - parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None) - parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None) - parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true") - parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true") - parser.add_argument("-n", "--no_id", help="json sans id", action="store_true") - args = parser.parse_args() - if 
class Sound:
    """A single audio recording of a pronunciation.

    Fix over original: comparisons with None now use identity (`is None`)
    instead of equality, per Python convention.
    """

    def __init__(self, url, accent):
        self.url = url        # URL of the audio file
        self.accent = accent  # accent label (e.g. "UK"), or None when unknown

    def __eq__(self, other):
        return self.url == other.url and self.accent == other.accent

    def serializable(self):
        """Return a plain dict suitable for json.dumps; omit a None accent."""
        if self.accent is None:
            return {"url": self.url}
        return {"accent": self.accent, "url": self.url}


class Pronunciation:
    """An IPA transcription with its associated sound files and accent."""

    def __init__(self):
        self.ipa = None     # IPA transcription string, or None until set
        self.sounds = []    # list of Sound objects
        self.accent = None  # accent label for the whole pronunciation, or None

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self):
        """Return a plain dict (accent key omitted when not set)."""
        snds = [s.serializable() for s in self.sounds]
        if self.accent is None:
            return {"transcript": self.ipa, "sounds": snds}
        return {"accent": self.accent, "transcript": self.ipa, "sounds": snds}

    def __str__(self):
        return f"{self.serializable()}"

    def __eq__(self, other):
        # Same transcription/accent and pairwise-equal sounds in the same order.
        if self.ipa != other.ipa or self.accent != other.accent:
            return False
        if len(self.sounds) != len(other.sounds):
            return False
        return all(a == b for a, b in zip(self.sounds, other.sounds))
class Definition:
    """A definition text in a given language."""

    def __init__(self, lang, text):
        self.lang = lang
        self.text = text

    def __eq__(self, other):
        return self.lang == other.lang and self.text == other.text

    def serializable(self):
        return {"lang": self.lang, "definition": self.text}


class Translation(Definition):
    """Like a Definition, but serialized under the 'translation' key."""

    def serializable(self):
        return {"lang": self.lang, "translation": self.text}


class Example:
    """A usage example with an optional source reference and URL."""

    def __init__(self, transcript, source=None, url=None):
        self.text = transcript
        self.source = source
        self.url = url

    def __eq__(self, other):
        return self.text == other.text and self.source == other.source and self.url == other.url

    def serializable(self):
        """Return a plain dict; optional fields are omitted when None."""
        res = {"example": self.text}
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res


class Sense:
    """One sense of an entry: its definitions, examples and translations.

    Fixes over original: `is None` identity checks, comprehension-based
    serialization, and English comments; behavior and interface unchanged.
    """

    def __init__(self, label):
        self.label = label      # sense identifier (e.g. "Noun_1")
        self.definitions = []   # Definition objects (lang + text)
        self.examples = []      # Example objects (text required, source/url optional)
        self.translations = []  # Translation objects into other languages
        self.domain = None      # usage domain of this sense, or None

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        # Deduplicate on (lang, text) equality.
        new_def = Definition(lang, definition)
        if new_def not in self.definitions:
            self.definitions.append(new_def)

    def add_example(self, transcript, src=None, url=None):
        new_ex = Example(transcript, src, url)
        if new_ex not in self.examples:
            self.examples.append(new_ex)

    def add_translation(self, lang, translation):
        new_tr = Translation(lang, translation)
        if new_tr not in self.translations:
            self.translations.append(new_tr)

    def __eq__(self, other):
        # Equal when label/domain match and each collection has the same
        # size and contents (membership test, order-insensitive — as in
        # the original pairwise-`in` loops).
        if (self.label != other.label or self.domain != other.domain
                or len(self.definitions) != len(other.definitions)
                or len(self.examples) != len(other.examples)
                or len(self.translations) != len(other.translations)):
            return False
        return (all(e in other.examples for e in self.examples)
                and all(t in other.translations for t in self.translations)
                and all(d in other.definitions for d in self.definitions))

    def serializable(self):
        """Return {label: {domain?, defs, exs, trad}} as plain dicts."""
        body = {}
        if self.domain is not None:
            body["domain"] = self.domain
        body["defs"] = [d.serializable() for d in self.definitions]
        body["exs"] = [e.serializable() for e in self.examples]
        body["trad"] = [t.serializable() for t in self.translations]
        return {self.label: body}
class ParserContext:
    """Stack of wiki section frames accumulated while parsing one lemma.

    Each frame is a dict holding the wikitext section under "wiki" plus any
    entry information ("pro", "ety", "POS", "senses") found at that level.
    Completed, valid entries are collected in self.entries.
    """

    def __init__(self, entry):
        self.lemma = entry   # the graphy being parsed
        self.context = []    # stack of {"wiki": section, ...} frames
        self.entries = []    # Entry objects assembled so far

    def get_level(self):
        """Wiki heading level of the innermost frame, or -1 when empty."""
        if not self.context:
            return -1
        return self.context[-1]["wiki"].level

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        """Pop the top frame, first trying to finalize an entry by default."""
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        """Replace the wiki section of the top frame (push if stack is empty)."""
        if not self.context:
            self.push(wiki_context)
        else:
            self.context[-1]["wiki"] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        """Attach entry information to the top frame; raises if stack is empty."""
        if not self.context:
            raise ValueError(f"Trying to set up entry info ({entry_context}), in an empty parserContext.")
        self.context[-1][key] = entry_context
        if testNewEntry:
            self.create_entry()

    def create_entry(self):
        """Assemble an Entry from all stacked frames; keep it only if valid and new.

        Returns the new Entry, or None when it was invalid or a duplicate.
        """
        # A frame never carries both "senses" and "POS" info at once.
        candidate = Entry(self.lemma)
        for frame in self.context:
            if "pro" in frame:
                candidate.set_pronunciations(frame["pro"])
            if "ety" in frame:
                pass  # etymology is ignored for now
            if "POS" in frame:
                candidate.set_pos(frame["POS"])
            if "senses" in frame:
                candidate.set_senses(frame["senses"])
            # TODO: handle the other info types
        if candidate.is_valid() and candidate not in self.entries:
            self.entries.append(candidate)
            return candidate
        return None

    def debug_top(self):
        """One-line summary of the innermost frame (debugging aid)."""
        if not self.context:
            return "Context: 0"
        top = self.context[-1]
        info = "\n\t\t\t".join(f"{k} → {str(v)}" for k, v in top.items() if k != 'wiki')
        return f"Context: {len(self.context)*'='} {top['wiki'].level*'#'} {top['wiki'].title} / {info}"
def isPro(self, title):
    """Return True when *title* is the wiki's pronunciation-section heading.

    self.constants['pro'] may be a single string or a collection of
    accepted headings (subclass-dependent).
    Fix over original: `isinstance` instead of the `type(x) == str`
    anti-pattern; same truth value in both branches.
    """
    pro = self.constants['pro']
    if isinstance(pro, str):
        return title == pro
    return title in pro

def isEty(self, title):
    """Return True when *title* is the wiki's etymology-section heading."""
    ety = self.constants['ety']
    if isinstance(ety, str):
        return title == ety
    return title in ety

def process_POS(self, parsedwikitext):
    """Hook: extract the part of speech. Implemented in wiki-specific subclasses."""
    pass

def process_pronunciation(self, parsedwikitext):
    """Hook: extract pronunciations. Implemented in wiki-specific subclasses."""
    pass

def process_etymology(self, parsedwikitext):
    """Hook: extract etymology. Implemented in wiki-specific subclasses."""
    pass

def process_senses(self, entry, pos, parsedwikitext):
    """Hook: extract senses for (entry, pos). Implemented in wiki-specific subclasses."""
    pass

def __str__(self):
    """Human-readable form: the JSON export with default options."""
    return self.export()
if __name__ == "__main__":
    import argparse
    from argparse import RawTextHelpFormatter  # for help-text formatting
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="""Interroger un wiktionnaire
	\033[1m\033[32mex :\033[0m
	‣\033[0m\033[32m./wikstraktor.py -m blue\033[0m
	‣\033[0m\033[32m./wikstraktor.py -m blue -f blue.json -A -C\033[0m
	‣\033[0m\033[32m./wikstraktor.py -l en -w fr -m blue -f blue.json -A -C\033[0m""")
    parser.add_argument("-l", "--language", help="la langue du mot", type=str, default="en")
    parser.add_argument("-w", "--wiki_language", help="la langue du wiki", type=str, default="en")
    parser.add_argument("-m", "--mot", help="le mot à chercher", type=str, default=None)
    parser.add_argument("-f", "--destination_file", help="le fichier dans lequel stocker le résultat", type=str, default=None)
    parser.add_argument("-A", "--force_ascii", help="json avec que des caractères ascii", action="store_true")
    parser.add_argument("-C", "--compact", help="json sans indentation", action="store_true")
    args = parser.parse_args()
    if args.mot is not None:
        w = Wikstraktor.get_instance(args.wiki_language, args.language)
        resp = None
        if w.fetch(args.mot) > 0:
            resp = w.export(args.force_ascii, args.compact)
        # NOTE(review): if fetch() found nothing, resp stays None and writing
        # it would fail — original behavior, kept as-is.
        if args.destination_file is not None:
            # BUG FIX: the original called `f.close` without parentheses, so
            # the handle was never explicitly closed; a context manager
            # guarantees the write is flushed and the file closed.
            with open(args.destination_file, "w") as f:
                f.write(resp)
        else:
            print(resp)
    else:
        raise NameError("Pas de mot demandé")
app = Flask(__name__)
CORS(app)  # allow cross-origin requests from any front-end


@app.route('/', methods=['GET'])
def index():
    """Health check: confirm the server is running and echo the caller's IP."""
    c = request.remote_addr
    response = f"<p>Server is running, your ip is {c}</p>"
    return Response(response, 200)


@app.route('/search/<word>', methods=['GET'])
def default_search(word):
    """Search *word* with the default wiki/word languages from the config module."""
    return search(config.wiktlang, config.wordlang, word)


@app.route('/search/<wiktlang>/<wordlang>/<word>', methods=['GET'])
def search(wiktlang, wordlang, word):
    """Look *word* up as a <wordlang> word on the <wiktlang> wiktionary.

    Returns the extracted entries as JSON (200), or an HTML error page (404)
    when no entry was found.
    """
    w = Wikstraktor.get_instance(wiktlang, wordlang)
    if w.fetch(word) > 0:
        # str(w) delegates to Wikstraktor.__str__, which returns the JSON
        # export — idiomatic replacement for the original `w.__str__()` call.
        resp = str(w)
        status = 200
        mimetype = 'application/json'
    else:
        # FIX: restored the curly quotes around {wordlang}; the original text
        # contained mojibake from a UTF-8/Latin-1 mix-up.
        resp = f"""<!doctype html>
<html>
  <head>
    <title>Error</title>
  </head>
  <body>
    <h1>{word}</h1>
    <p>{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org.</p>
  </body>
</html>"""
        status = 404
        mimetype = 'text/html'
    return Response(resp, status=status, mimetype=mimetype)


if __name__ == "__main__":
    app.run(host=config.host, port=config.port, debug=config.debugging)