def convert(data_format1):
    """Convert wiktextract word entries into a wikstraktor-style JSON string.

    Args:
        data_format1: Either a mapping of ``lang_code -> entry`` (the shape
            the original implementation iterated over) or an iterable of
            entry dicts that carry their own ``"lang_code"`` key.
            NOTE(review): wiktextract's ``parse_page`` appears to return a
            list of entry dicts -- confirm against the pinned revision.
            Each entry must provide ``"word"`` and ``"pos"``; ``"sounds"``
            and ``"senses"`` are optional.

    Returns:
        A JSON document (``indent=2``) containing one transformed object per
        input entry. Empty or ``None`` input yields ``"[]"``.

    Raises:
        KeyError: If an entry lacks ``"word"`` or ``"pos"``.
    """
    # Function-scope imports keep this block self-contained.
    import hashlib
    import json
    from collections.abc import Mapping

    if not data_format1:
        return json.dumps([], indent=2)

    if isinstance(data_format1, Mapping):
        pairs = list(data_format1.items())
    else:
        pairs = [(entry.get("lang_code", ""), entry) for entry in data_format1]

    transformed_data = []
    for lang_code, entry in pairs:
        word = entry["word"]
        pos = entry["pos"]
        entry_id = f"{lang_code}-{word}{pos}"
        sounds = entry.get("sounds") or []

        # The built-in hash() of a str is salted per process, so it cannot
        # serve as a permanent identifier. Derive a stable integer from a
        # SHA-256 digest of the canonical (key-sorted) JSON form instead.
        digest = hashlib.sha256(
            json.dumps(entry, sort_keys=True).encode("utf-8")
        ).digest()
        permanent_id = int.from_bytes(digest[:8], "big")

        # The transcript is the first IPA string found among the sounds;
        # entries without one get an empty transcript.
        transcript = next(
            (sound["ipa"] for sound in sounds if sound.get("ipa")), ""
        )

        pronunciations = []
        if sounds:
            pronunciations.append(
                {
                    "id": f"{lang_code}-{word}_prn1",
                    "transcript": transcript,
                    "sounds": [
                        {
                            "id": f"{lang_code}-{word}_sound{index}",
                            "audio": sound.get("audio", ""),
                            "text": sound.get("enpr", ""),
                            "tags": sound.get("tags", []),
                            "ogg_url": sound.get("ogg_url", ""),
                            "mp3_url": sound.get("mp3_url", ""),
                        }
                        for index, sound in enumerate(sounds, start=1)
                    ],
                }
            )

        senses = {}
        for sense_index, sense in enumerate(entry.get("senses") or []):
            sense_key = f"{lang_code}.{entry_id}_{sense_index}"
            senses[sense_key] = {
                "Definitions": [
                    {
                        "id": f"{entry_id}_def{index}",
                        "lang": lang_code,
                        "definition": definition,
                    }
                    for index, definition in enumerate(
                        sense.get("glosses") or []
                    )
                ],
                "Examples": [
                    {
                        "id": f"{entry_id}_ex{index}",
                        # Some examples carry only a reference, no text.
                        "example": example.get("text", ""),
                    }
                    for index, example in enumerate(
                        sense.get("examples") or []
                    )
                ],
            }

        transformed_data.append(
            {
                "sources": [
                    {
                        "wiktionary_language": lang_code,
                        "permanentId": permanent_id,
                        "wikstraktor_version": (
                            "f391b7f3b6c2a322f2eca90384ff6038851ba541"
                        ),
                    }
                ],
                "id": entry_id,
                # The headword itself is used as the key of the entry body.
                word: {
                    "pos": pos,
                    "pronunciations": pronunciations,
                    "senses": senses,
                },
            }
        )

    return json.dumps(transformed_data, indent=2)