From 842647de89255b65eb05b69f21b1dda5f7d80c0d Mon Sep 17 00:00:00 2001 From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr> Date: Mon, 19 Jun 2023 13:30:24 +0200 Subject: [PATCH] =?UTF-8?q?Pb=20sons=20ignor=C3=A9s=20EN=20corrig=C3=A9=20?= =?UTF-8?q?=3F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- parsers/en_en.py | 14 ++++++-------- wikstraktor.py | 22 +++++++++++++--------- wikstraktor.sqlite | Bin 20480 -> 28672 bytes 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/parsers/en_en.py b/parsers/en_en.py index a721c9b..c52dcfc 100644 --- a/parsers/en_en.py +++ b/parsers/en_en.py @@ -24,18 +24,16 @@ class En_en_straktor(Wikstraktor): acc = None for j, t in enumerate(templates): if (t.normal_name() == self.constants['t_acc'] and templates[j+1].normal_name()!= self.constants['t_acc']): - acc = t.arguments + for a in t.arguments: + p.set_accent(a.value) elif t.normal_name() == self.constants['t_ipa']: p.set_transcription(t.arguments[1].value) - if acc != None: - for a in acc: - p.set_accent(a.value) elif t.normal_name() == self.constants['t_snd']: p.add_sound(self.get_file_url(t.arguments[1].value), t.arguments[2].value) - if j==len(templates)-1 or templates[j+1].normal_name()== self.constants['t_acc'] : - if p.ipa != None or p.has_accents(): - pronunciations.append(p) - p = Pronunciation() + if p.ipa != None or p.has_accents() or p.has_sounds(): + pronunciations.append(p) + else: + self.log.add_log("En_en_straktor.process_pronunciation", f"“{l.fullitems[i]}†processed as empty → {p}") i += 1 return pronunciations diff --git a/wikstraktor.py b/wikstraktor.py index ff1c57e..ad85137 100755 --- a/wikstraktor.py +++ b/wikstraktor.py @@ -26,19 +26,20 @@ class SubInfo: cls.next_id = 0 def __init__(self, prefix = None): - self.id = None + self.id = self.__class__.next_id + self.__class__.inc_n_id() + self.label = None self.set_id(prefix) - def set_id(self, prefix): - if self.id == None and prefix != None: - self.id = f"{prefix}_{self.__class__.prfx}{self.__class__.next_id}" - self.__class__.inc_n_id() - return self.id + def set_id(self, prefix, force = False): + if (self.label == None or force) and prefix != None: + self.label = f"{prefix}_{self.__class__.prfx}{self.id}" + return self.label def serializable(self, prefix = None): res = {} - if self.set_id(prefix) != None: - res["id"] = self.id + if prefix != None: + res["id"] = self.set_id(prefix) return res @@ -82,6 +83,9 @@ class Pronunciation(SubInfo): def add_sound(self, url, accent=None): self.sounds.append(Sound(url,accent)) + def has_sounds(self): + return len(self.sounds) > 0 + def serializable(self, prefix = None): snds = [] for s in self.sounds: @@ -365,7 +369,7 @@ class Entry: def add_pronunciation(self, p): if p not in self.pronunciations: - p.set_id(self.get_id()) + p.set_id(self.get_id(), True) #pro often parsed without context self.pronunciations.append(p) def set_senses(self, senses): diff --git a/wikstraktor.sqlite b/wikstraktor.sqlite index b60d5703a8684738f7a25ed782234ade2c97f3c3..242a322cf854b7548f263467d6032ed6a7ffdb17 100644 GIT binary patch literal 28672 zcmeHPTWllM8Fn_#&AGAA<<b^7%@&eul4Zu@>nvNyW(^JOF0e`1rop6VJjeDZ;~6tE z-Vkua+MueHP{l=RUwV<+he{QqQW5X;rBX}#-iyS;N^M{Ig4BzuN>%?e6Ca#OX0c<& zYs!p>*K;%9|DFF_{_p>1R-U?`SOn#Dql7Iagl-A-^@ScrC=?1k4Sz%M*Vy{tOSrKE z|I+=B+nx@czVg^OT-7)6UI_jRwt#_vfq;R4fq;R4fq;R4fq;R4fq;R4fq;P{F;G9- zH+09zlYOr&T3Axa6=ln`47{c4g{H5^mM&)&S2JjJ@!W+Bnrb>Tg=QinQ|GZorbf=g zC9BI9Gb86Sk1VcTSVc=~moH~7t!~nTE31napPYwl8#i0Jv3ZpkrlM<8BdeLOt)fd$ zz~9=13-(pIA#bW!D^zd+e)`~fWlOo?d3rw$%DwI_6A0KfqFI}YJT<a>X*Kg`=JGxW zOITHju~{NkQHM(?1d41)*DScrIS`p00M^5kf1}(FP5x){pOYUtJ4b4a;HH3qfPsL4 zfPsL4fPsL4fPsL4fPsL4fPsL4fPr3MVDJ*47<dpq!gKJ^e>=_1-yV7=G<jy?>iDn5 z?;Kkh{Ym5>k<7^VhW|R89{Tp+p9bd!cEi65-`lSr`{l9YeL1-J=5Dox+ap5u`0?Aq z%eB!c7Znz`_=3P8o)fvW$R{u-=aR`pD$Pk8FAHfRkwhXUMC0(m(fzm-OC*yq0`iGL z<Z$-sYfYlg>9V2ch-q%hB(G?S1^UkQYu`sNzNku7qE)exD`F+Ly9-B%wot~Vh4g$C zeuwPtUcdGOw0Ap^5jGJim93rH+zEzv$b~m0#-l#r#i~UN4O_}pg3K~eRYfaMs9*%3 zl|9iee2$@Yv+E<;4%oeo;XTN1wCC8pm7zVzu5V~NVD~t~+srO>w0K(NxORQ_nOEO! zrLP^?cW+^6o7wdlZ3pa5GQ4hfQz9qy8oLt=t(#pR(RRS@IK%5^Hz^7|$L<(I>t@$S zv>mWJ%J90`O^96Ev#{SILxiDqv+E<;4%i)Gc-`#A#8|r5MuuUA*3GVuXggqch~agy z8|B1!&-v~kL+fJKPqas8_uK%(8*f_g!Yu4R-erS8+EuQuJzkXv<YyuMYE~7^u5VWD zRXb0Yzept|UBfq8w1=4y`&=X?#*$(*)@vQGpP}97BL1T7z(tNRy!%|F8+bo47dhAG zX+%tn#(i=ThxAX%2pa#N2z?Nmygc#a@xPCsAN%g;Uq(}rZ;kwM<kWC==(j`n3~B@K z4croz`hU?MJ{EyXKSf)8cFu016XE5T`<jL#J}$;mBHvaoR`shSoW1S$t;8>yt}PT) z-89ihDnfP5bVLEGNYKVc4o*Wvp3Tl9iNNqgkaOHr3J|nc3_{Q7d4%Cwl~@qV(*pz| z`J4ik9XoKBwM~eI3c4<%yrB?HHj!?iJT574x<pD6l&8RN7)aW&e|$%;Xob_Bf&ccY z23PgFyU3N}&pBOH_2(*JT6IY;!PvY_AP6klBle+eE+U|$L>Y2y2a~ZZ8=&0GBZCyM zA*=A+K2HeMK1^@R=Vn<)ZXWA-0U}&jY^6DZbFo|^DaX<hmSU->Or$slj^uO<gddU= z=;2(Og3^(L+!=<pd93#t?a{~jxoL*?5O$-z#_s(L?IG;?g|-8B?_+omvMUI^#_pFH z+Jo%+hPDHCPcpm**^TnO#_kkDdyrk<(00J?ml)oC>_)|ST;zI=UBuAtW7l7_9k6>Z z!|P@jCbnd+`R+Xot(#pR(RRS@-3;$Ob~#x2i_t`{vHL}ab|1U`qV0g)FEG4rcIl$5 z-O#GV6yK3oWS?hf-R$~^wgYzWVt8HbLJqa}5|Vc^v@Ul2MB4$ocQCvzcKu9GAJKPd z{C_<3T4-`<;?40tkI#&4jlLcEaO7XZ?+;H5zBO<re69aS$EtlV!6jYUYU+bFdl0eL z_t;A;@M#3p2{FhyN;wjbCpbQiQ@l(#o`+n1TIS-3crwS!NiLQbganp28jHyU8(~~) zeq_TlyASbtb<ueZf{L~=1k5@zEo@k5M$f|o69~hnfu^TRJJZv%jZl-O)J?raBwgM? zx<-(yY!N#OcS2ATZ7G_p?s$iuFPvfKo^*oln{93xWD7;2;q=E^3~!8}qJ=~Xil#Z` z1)x_rYQ;QNAaq2moTu@1Br@am1I7!9cG`Noh^;W57;8nHak}<JcsfT_%wlx~kkc=Y z)&PExDV2-ZR9=9Vq~=*h#VXoTj0Rc^HcevCs*r3Jl|VqIjsj{9JXK?0S;<*&7;A`J zRb+U|23X8v)g(|0VpgCQ1a3n`qRI_L2L89;Tg33BsP=5SK~Zh1ixkoBu&jYSP(K1b zj!~gvt0@M;WvFm@UV*oTfHAhZgi#q+z(yd#GVG3HkQ_)<Xis*h7kV3*N-+cP?XrQ3 z&`!^nsboak1(YPSVw4R9OaaYIR!j+WBa-)oUe}Th=&$TeOig{)Kli0=V{3_fx*_my z5a_jbtrAvIxoBXgIiUQAIx^jX?qQEqR*|-+0-lYxU{q5yXf<$N6WrCXo0OI<VU)95 z6lgFAHg)X`s+bj9b%chN;V!tRBq_AI2$<Vepnpu$&uKJc5~^;}?r0y+W@qQ2L&&I9 z$rWK}07(vR1$_^?hJBw7UCd~<z+0!QZzdXqzg+6Fuk4KpradOOx!l;$bc<~3hE?2w z{tvWmY#5}3jV=02A{+t2o&)|A3uXy$EmANDnWskykOt;M|Dq}`oRhFjr=K%0wor)| zc=zBt2cr;%UvT)72~|TC7)n6?Y!Zp5@$|gIx-*MS)7fg;)1_--yuono;B>^dH6q)3 z`8TYew=yEb;>DhOX>ppW=SkH%3w~KPF!=5~GNE{_s>t*Z{puxP*Joo5@C~uQsXzJ$ z{!n?^?x4_>=mgS}V!zPfsv-7wg)3+3y{Tfq7_}`w3Du(~8iNRzMtnl#p{Na3ktCiV zI7Si@)Qbv<R7yzXIG&3q2`@#HDV&$&Xfm0i3rzAn^g0dtwMoLu{Ie~3x>r>u`?VLm zOG!ok2H^R4kAvqt!KXx_*O`CdaPR^oh@j|L&P5>sgh?u%N+eSGWKtrrn8b5wkTK;L ziOUII5Tuk{bj%vMYi=`l+6GT6rdd!$eO?fS1jJSCfTtDHn?GIc(ARHVfxN)bvUQdJ zXpbK7i}N;K2w5AGrc;mZbCUTJnKWtqe{1N!p~=~ajqx|f-WiLGu0?(p=^I%eerxFU z!FLD#5dL@gu70ua1Gwn_ZmVT4?KRz6E!&t3Ky3hFDHerZeUu0^qiqsx(|!vDyd#1M z4iGhI6%BItbjb`)6r6$w2#n%R3CfdT(es;Et*UvJ-`$NI{tN-S{>?Ygi&OBf2N6we zmJN;PrsmNU#GIO$GnBFg$F+*8!Z8!F{IUuA8`<6Z`o)HQFRvZ6FYdE_MZKh#i=d0x zHeSJ&Ro%8fbjuF5?KN>;Ax4#ki*`%^E824gL^5V^9xYZVi>iVl&b&mn(bu6Q07408 zXI(e8&=Ory;~Zq(Ez=1nob}c=tc0&$dvhCt9RfOpQJ~cmt&^K<h1=RQ7aH8wHoDL4 zN51pxH`?@_L`&vnA7|#dx`qG~p3s)-K(H8I4_A)VH7f7FR%)+r2(8)on5TVF5$5hT zgvRae+Q+wK-}*AM&(-yz3Nse%9s459<63KNxBgWx+h7gPwIve!+ji~SR=@MnhHdqg z{lf~M5~C@f+im&DMLHinuuZ$&12AOLc?wr#MW=`1iMNWmKq)(|7eFwl$I;B%%Iw~S z9}P-Pz1Cw&YoD6hsYh&T=rVSoi5i+4h(k4BJJf)%8OGK1mX@)SWPqOQXkE?&bZ|E@ z%8IID!%K~*-Ip2JdrEQlW@F9U>h|F)i`xC?8#rIO**Hl-H+DP;ompb7zgNr2B#W-M zJ_d0QYo1V}=NMF`G3+I=x(`*|v6LJYd7&I0+}0EH{fj;02)JBPVTtJBvDt?iyY8Hb zkL)_}p-ErrcB86PSh5mgAq|%xjgYtXKiQ}hKm??3VuOJL)}*i^&St$u*vv!BzBAqH zl6%rz5mFxNPF2Gd@EYtqGGP`$+NJ_6oddvx1$}w}@_l3h6lm4-Y^TS+V%tIE;*84< zKU2AmAu<fn@Q^diUankLBgr$5$`C!+87#>4*lDe+SoLm|)w9f+D_yk4PE13h2Es?^ z#cgG#ArA>DIASOehI%y;jsJ&+-wsXuaC~9R7=1mWjeG}ogDqenU?5;1U?5;1U?5<i zM;NGO*Y*}+F1?-?qiHe5wW9#|EBqV~>{}DIva6o<(G=;Y-M%_nX)L1a%@u#0<Xc*i zaGa3fQ&RDFxk1ssia)<8az#mgR{TB13~a9W>xO|HSNuKMfWEKduN%-mqZNNoGzi~U N@z)K)KN}T){{xO$04V?f delta 77 zcmZp8z}T>Wae_3fF#`hw>qG^6M&peMOZa)1_`d-K7+Cp#^MBi{C~%H{VuSSNpYq%a hLhM{08Tg&~Ci6b!)!@nIUdQ!uW8rPC%^$t^Spiwr7ry`i -- GitLab