def convert(self, data_format1):
    """Convert wiktextract ``parse_page`` output into the wrapper's format.

    Args:
        data_format1: Iterable of wiktextract part-of-speech dicts. The keys
            read here are ``lang_code``, ``pos``, ``word``, ``sounds``,
            ``senses`` and ``translations``; every one of them is optional.
            ``None`` or an empty iterable yields an empty list.

    Returns:
        A list with one dict per input entry, holding ``sources``, ``id``,
        a block keyed by the part-of-speech name (``pos``,
        ``pronunciations``, ``senses``) and a copy of ``translations``.
    """
    if not data_format1:
        return []

    # These values are the same for every entry; resolve them once rather
    # than re-opening the git repository on each loop iteration.
    wiktextract_version = metadata.version("wiktextract")
    wrapper_version = git.Repo(search_parent_directories=True).head.object.hexsha

    transformed_data = []
    for index, pos in enumerate(data_format1):
        lang_code = pos.get("lang_code", "")
        pos_name = pos.get("pos", "")
        word = pos.get("word", "")
        # Prefix shared by the entry, pronunciation and sense ids.
        entry_id = f"{lang_code}-{index}.{pos_name}.{word}"
        # Prefix used by definition and example ids.
        # NOTE(review): it carries neither the entry nor the sense index, so
        # these ids repeat across senses; kept as-is for compatibility.
        flat_id = f"{lang_code}-{word}{pos_name}"

        pronunciations = []
        for pron_counter, pron in enumerate(pos.get("sounds", []), start=1):
            pronunciation = {
                "id": f"{entry_id}_prn{pron_counter}",
                "transcript": pron.get("ipa"),
                "sounds": [],
            }
            if "audio" in pron:
                # The first tag is assumed to be the accent; an audio entry
                # without tags gets None instead of raising IndexError.
                tags = pron.get("tags") or []
                pronunciation["sounds"].append(
                    {
                        "accent": tags[0] if tags else None,
                        "url": pron.get("mp3_url", pron.get("ogg_url", "")),
                    }
                )
            pronunciations.append(pronunciation)

        senses = {}
        for sense_index, sense in enumerate(pos.get("senses", []), start=1):
            glosses = sense.get("glosses", []) + sense.get("raw_glosses", [])
            senses[f"{entry_id}_{pos_name}{sense_index}"] = {
                "Definitions": [
                    {
                        "id": f"{flat_id}_def{def_index}",
                        "lang": lang_code,
                        "definition": gloss,
                    }
                    for def_index, gloss in enumerate(glosses, start=1)
                ],
                "Examples": [
                    {
                        "id": f"{flat_id}_ex{ex_index}",
                        "example": example.get("text", ""),
                    }
                    for ex_index, example in enumerate(
                        sense.get("examples", []), start=1
                    )
                ],
            }

        transformed_data.append(
            {
                "sources": [
                    {
                        "wiktionary_language": lang_code,
                        "wiktextract_version": wiktextract_version,
                        "wrapper_version": wrapper_version,
                    }
                ],
                "id": entry_id,
                f"{pos_name}": {
                    "pos": pos_name,
                    "pronunciations": pronunciations,
                    "senses": senses,
                },
                # Translations are passed through unchanged.
                "translations": pos.get("translations", []),
            }
        )

    return transformed_data