From f391b7f3b6c2a322f2eca90384ff6038851ba541 Mon Sep 17 00:00:00 2001 From: Enzo Simonnet <enzosim@laposte.net> Date: Wed, 31 May 2023 11:12:11 +0200 Subject: [PATCH] =?UTF-8?q?Homog=C3=A9n=C3=A9isation=20des=20pos=20(rapide?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- parsers/fr_constants.py | 69 +++++++++++++++++++++++++++++++--------- test_wikstraktor.py | 2 +- wikstraktor.sqlite | Bin 16384 -> 20480 bytes 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py index 73d6812..b769bba 100644 --- a/parsers/fr_constants.py +++ b/parsers/fr_constants.py @@ -9,11 +9,50 @@ string_values = { "t_ipa":"pron", #template for transcription "t_snd":"écouter", #template for audio "t_acc":["US", "UK"], #template for accents +"regions":{ + "UK":"United Kingdom", + "United Kingdom":"United Kingdom", + "British":"Great Britain", + "GB":"Great Britain", + "Great Britain":"Great Britain", + "Scot":"Scotland", + "Scottish":"Scotland", + "Scotland":"Scotland", + "Irl":"Ireland", + "Irish":"Ireland", + "Ireland":"Ireland", + "Ulst":"Northern Ireland", + "Ulster":"Northern Ireland", + "Northern Ireland":"Northern Ireland", + "Wls":"Wales", + "Welsh":"Wales", + "Wales":"Wales", + "English":"England", + "Eng":"England", + "En":"England", + "England":"England", + "Canada":"Canada", + "Canadian":"Canada", + 'North American':'North America', + 'North America':"North America", + "US":"United States of America", + "USA":"United States of America", + "United States":"United States of America", + "United States of America":"United States of America", + "NZ":"New Zealand", + "New Zealand":"New Zealand", + "Au":"Australia", + "AU":"Australia", + "Australia":"Australia", + "India":"India", + "Indian":"India", + "Nigeria":"Nigeria", + "Nigerian":"Nigeria"}, "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns {"def":"\\#", "ex":"\\#\\*", "add_subdef":"\\#"} ], "POS":{ - "adjectif":["adjectif","adjectif qualificatif","adj"], + "Adj":["adjectif","adjectif qualificatif","adj"], "adjectif démonstratif":["adjectif démonstratif","adj-dém","adjectif dém"], "adjectif exclamatif":["adjectif exclamatif","adj-excl","adjectif exc"], "adjectif indéfini":["adjectif indéfini","adjectif ind","adj-indéf"], @@ -21,47 +60,47 @@ string_values = { "adjectif numéral":["adjectif numéral","adjectif num","adj-num"], "adjectif possessif":["adjectif possessif","adjectif pos","adj-pos"], "adjectif relatif":["adjectif relatif","adjectif rel","adj-rel"], - "adverbe":["Adverbe","adv"], + "Adv":["Adverbe","adv"], "adverbe indéfini":["adverbe indéfini","adv-ind","adverbe ind"], "adverbe interrogatif":["adverbe interrogatif","dverbe int","adv-int"], "adverbe pronominal":["adverbe pronominal","adv-pron","adverbe pro"], "adverbe relatif":["adverbe relatif","adv-rel","adverbe rel"], - "affixe":["affixe","aff"], - "article":["article","art"], + "Aff":["affixe","aff"], + "Art":["article","art"], "article défini":["article défini","article déf","art-déf"], "article indéfini":["article indéfini","art-indéf","article ind"], "article partitif":["article partitif","art-part","article par"], "circonfixe":["circonfixe","circon","circonf"], "classificateur":["classificateur","class","classif"], - "conjonction":["conjonction","conj"], + "Conj":["conjonction","conj"], "conjonction de coordination":["conjonction de coordination","conj-coord","conjonction coo"], "copule":["copule"], - "déterminant":["déterminant","dét"], + "Det":["déterminant","dét"], "enclitique":["cnclitique","encl"], "gismu":["gismu"], "infixe":["infixe","inf"], "interfixe":["interfixe","interf"], - "interjection":["interjection","interj"], + "Interj":["interjection","interj"], "lettre":["lettre"], "locution":["locution","loc"], "locution-phrase":["locution-phrase","loc-phr","phrase locution","phrase","locution-phrase"], - "nom commun":["nom","nom commun","substantif"], + "N":["nom","nom commun","substantif"], "nom de famille":["nom de famille","nom-fam"], - "nom propre":["nom propre","nom-pr"], + "NP":["nom propre","nom-pr"], "nom scientifique":["nom scientifique","nom-sciences","nom scient","nom science"], - "numéral":["numéral","num","numér"], + "Num":["numéral","num","numér"], "onomatopée":["onomatopée","onoma","onom"], - "particule":["particule","part"], + "Particle":["particule","part"], "particule numérale":["particule numérale","part-num","particule num"], "patronyme":["patronyme"], - "postposition":["postposition","postpos","post"], + "Postp":["postposition","postpos","post"], "pré-nom":["pré-nom"], "pré-verbe":["pré-verbe"], "préfixe":["préfixe","préf"], "prénom":["prénom"], - "préposition":["préposition","prép"], + "Prep":["préposition","prép"], "proclitique":["proclitique","procl"], - "pronom":["pronom"], + "Pro":["pronom"], "pronom démonstratif":["pronom démonstratif","pronom dém","pronom-dém"], "pronom indéfini":["pronom indéfini","pronom ind","pronom-indéf"], "pronom interrogatif":["pronom interrogatif","pronom int","pronom-int"], @@ -78,6 +117,6 @@ string_values = { "symbole":["symbole","symb"], "variante par contrainte typographique":["variante typographique","variante typo","variante par contrainte typographique","var-typo"], "verbe pronominal":["verbe pronominal","verb-pr","verbe pr"], - "verbe":["verbe","verb"] + "V":["verbe","verb"] } } diff --git a/test_wikstraktor.py b/test_wikstraktor.py index 00d6228..a13c3fe 100644 --- a/test_wikstraktor.py +++ b/test_wikstraktor.py @@ -5,7 +5,7 @@ if __name__ == "__main__": # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav")) # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav")) #e.fetch("water") - f.fetch("water") + f.fetch("blue") # print(e.fetch("test"), "entries added") #print(e) file_path = 'test.json' diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite index cf340034be915fbe4d55478064deb092757f3e55..f83e39e04000619fc360339dce8bec4a227f9bff 100644 GIT binary patch delta 582 zcma)(&2G~`6ou{JP^e1W2vq{As<(iYHjN{v?hlQGDik(ItdLlsDvL>c?T$2_X=Wxw zQC*R?FF^4N@Bl23!UKdiU<sS<d4X=aVcee`6724rGjs2E?u<tE=(GKCzUWz&RXl|q z*R$$7zwVinU7A>>$+$fGrZk?EUfMGS_uR-<(Y<ibjr`j!IIH7}#k;e&=B%@9%C9(u zY3uB?_XUnuR;Pn@xjg5*I<eRMwZQW?yugRXMsxjHv$38v9@p+L@gAM_{h!(Q&8Hu@ z;9ksxQV?Q9In`9~^sT}WhyuNbu)EteHsx`<U4srrLgKZqXv~T3OUxY+0Wo(OYK<}% zQ0-%v62^w+l^!5+h=m9tk`#HUKuCy4vOtZWVA4UUOcM#6;br%s=<~Rq^(qe+2i2QM z#|!@vsTy_#W1`nLv(uJH62S*ZnBF@)goA^>#(e>XImshj!5JhHh7y%Qk{A;iGE-mr zh|H`))w8ALNA^m#v^Ago2;OC1m$%2?g4Z*jeCM|9wk)@0{%PBVqh{>}{^-d+;d@(^ iDwqW)De{zw2vhxB3)q*0Dy)G*#!?#QQeK+nJop2vgSVIf delta 100 zcmZozz}V2hI6<0Ki-CcGWuk&TzZQdDxEC*w$H;epf$zX(L4hE?%?J2g7`Yhve*=XX zSonVf1z+$_4B*)OPo7PHS(uq+GP{BL#=>h%%#GTblNHUSCtom*oV-E9a`P5rA07a; CFdD@G -- GitLab