diff --git a/parsers/fr_constants.py b/parsers/fr_constants.py index 66915f4ac35116074f6de724eadf7130311a0369..ffcde47bdc2acfe4cb90d3d510ff2566fefd06f0 100644 --- a/parsers/fr_constants.py +++ b/parsers/fr_constants.py @@ -10,44 +10,58 @@ string_values = { #Inexistants "t_ipa":"pron", #template for transcription "t_snd":"écouter", #template for audio -"t_acc":["US", "UK"], #template for accents +"t_acc":["US", "UK"], #template for accents (inutile utilise régions) "regions":{ - "UK":"United Kingdom", - "United Kingdom":"United Kingdom", - "British":"Great Britain", - "GB":"Great Britain", - "Great Britain":"Great Britain", - "Scot":"Scotland", - "Scottish":"Scotland", - "Scotland":"Scotland", - "Irl":"Ireland", - "Irish":"Ireland", - "Ireland":"Ireland", - "Ulst":"Northern Ireland", - "Ulster":"Northern Ireland", - "Northern Ireland":"Northern Ireland", - "Wls":"Wales", - "Welsh":"Wales", - "Wales":"Wales", - "English":"England", - "Eng":"England", - "En":"England", - "England":"England", + "UK":"Royaume-Uni", + "United Kingdom":"Royaume-Uni", + "Royaume-Uni":"Royaume-Uni", + "British":"Grande Bretagne", + "GB":"Grande Bretagne", + "Great Britain":"Grande Bretagne", + "Grande Bretagne":"Grande Bretagne", + "Scot":"Écosse", + "Scottish":"Écosse", + "Scotland":"Écosse", + "Écosse":"Écosse", + "Irl":"Irlande", + "Irish":"Irlande", + "Ireland":"Irlande", + "Irlande":"Irlande", + "Ulst":"Irlande du nord", + "Ulster":"Irlande du nord", + "Northern Ireland":"Irlande du nord", + "Irlande du nord":"Irlande du nord", + "Wls":"Pays de Galles", + "Welsh":"Pays de Galles", + "Wales":"Pays de Galles", + "Pays de Galles":"Pays de Galles", + "Angleterre":"Angleterre", + "English":"Angleterre", + "Eng":"Angleterre", + "En":"Angleterre", + "England":"Angleterre", "Canada":"Canada", "Canadian":"Canada", - 'North American':'North America', - 'North America':"North America", - "US":"United States of America", - "USA":"United States of America", - "United States":"United States of 
America", - "United States of America":"United States of America", - "NZ":"New Zealand", - "New Zealand":"New Zealand", - "Au":"Australia", - "AU":"Australia", - "Australia":"Australia", - "India":"India", - "Indian":"India", + "Canadien":"Canada", + 'North American':"Amérique du nord", + 'North America':"Amérique du nord", + 'Amérique du nord':"Amérique du nord", + "US":"États-Unis", + "USA":"États-Unis", + "United States":"États-Unis", + "United States of America":"États-Unis", + "États-Unis":"États-Unis", + "États-Unis d'Amérique":"États-Unis", + "NZ":"Nouvelle Zélande", + "New Zealand":"Nouvelle Zélande", + "Nouvelle Zélande":"Nouvelle Zélande", + "Au":"Australie", + "AU":"Australie", + "Australia":"Australie", + "Australie":"Australie", + "India":"Inde", + "Indian":"Inde", + "Inde":"Inde", "Nigeria":"Nigeria", "Nigerian":"Nigeria"}, "sense_pattern":[ ## structure(s) for sense patterns add_subdef is to be added to def patterns diff --git a/parsers/fr_en.py b/parsers/fr_en.py index 91c43743ef70931cdf9db22e27afa6ce7c9bfdd3..a5caab06612e5c37b60c690d5662bfd1ab49e021 100644 --- a/parsers/fr_en.py +++ b/parsers/fr_en.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from wikstraktor import Wikstraktor, Pronunciation, Sense +from wikstraktor import Wikstraktor, Pronunciation, Sense, get_list_string_level from parsers.fr_constants import string_values @@ -18,24 +18,30 @@ class Fr_en_straktor(Wikstraktor): l = proContent.get_lists()[0] i = 0 pronunciations = [] - while i < len(l.fullitems): - p = Pronunciation() - templates = self.wtp.parse(l.fullitems[i]).templates - a = None + previous_level = None + for item in l.fullitems: + current_level = get_list_string_level(item) + if previous_level == None or current_level <= previous_level: + p = Pronunciation() + pronunciations.append(p) #objects are pointers + templates = self.wtp.parse(item).templates for j, t in enumerate(templates): - #if t.normal_name() == self.constants['t_acc']: - # 
# Characters that open a wikitext list item (bulleted, numbered, indented).
_LIST_MARKERS = frozenset("*#:")


def get_list_string_level(wikitext):
    """Return the nesting level of a wikitext list item.

    The level is the number of consecutive list-marker characters
    (``*``, ``#`` or ``:``) found at the very start of *wikitext*;
    counting stops at the first character that is not a marker.

    Args:
        wikitext: The raw text of a list item, e.g. ``"** foo"``.

    Returns:
        The number of leading marker characters: 0 for an empty string or
        for text that does not start with a marker, and ``len(wikitext)``
        when every character is a marker.
    """
    for level, char in enumerate(wikitext):
        if char not in _LIST_MARKERS:
            # The index of the first non-marker equals the marker count.
            return level
    # Ran off the end: every character (possibly none) was a marker.
    return len(wikitext)