diff --git a/requirements.txt b/requirements.txt
index d049bcce64ea7801ad20e3712c38d01341e531d7..6eee2f43438661e8b8216a2c189401ee94f951ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 Flask==2.3.3
 flask_cors==4.0.0
-wiktextract @ git+https://github.com/tatuylonen/wiktextract.git
+lupa==1.8
+wiktextract @ git+https://github.com/tatuylonen/wiktextract.git@a3665b8779ad78b045406cc7b1e9ce31876bc6b6
diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
index 2bc30d3fc9fc3443d990c8bdd1a486ad69bd7b0c..de33b74ea0bd0b0416d1a70ef44c2167163fa04d 100644
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -1,6 +1,7 @@
 from wiktextract.page import parse_page
-
 from wiktextract_context import get_wiktextract_context
+import hashlib
+import json
 
 class Wiktextract:
     def __init__(self, wiktlang: str, wordlang: str):
@@ -14,10 +15,97 @@ class Wiktextract:
         if not page:
             return None
         result = parse_page(self.wxr, title, page.body)
-        # convert(result)
-        return result
+        return self.convert(result)
+
+    @staticmethod
+    def convert(data_format1):
+        """Convert wiktextract entries to the wikstraktor-style JSON layout.
+
+        Args:
+            data_format1: Either a list of wiktextract entry dicts (the
+                language is then read from each entry's ``lang_code``) or
+                a mapping of language code to one entry dict.
+
+        Returns:
+            A JSON string (2-space indent) with one object per entry.
+        """
+        if isinstance(data_format1, dict):
+            pairs = list(data_format1.items())
+        else:
+            # NOTE(review): parse_page appears to return a list of entries;
+            # confirm against the pinned wiktextract revision.
+            pairs = [(e.get("lang_code", ""), e) for e in data_format1 or []]
+
+        transformed_data = []
+        for lang_code, entry in pairs:
+            word = entry["word"]
+            pos = entry["pos"]
+            entry_id = f"{lang_code}-{word}{pos}"
+            # "sounds" and "senses" may be absent, so default to empty lists.
+            sounds = entry.get("sounds", [])
+            # hash() of a str is salted per interpreter run, so use a digest
+            # to keep the permanent id stable across processes.
+            digest = hashlib.sha1(
+                json.dumps(entry, sort_keys=True).encode("utf-8")
+            ).hexdigest()
+            # Transcript: first IPA string found among the sounds, if any.
+            transcript = next((s["ipa"] for s in sounds if "ipa" in s), "")
+            transformed_entry = {
+                "sources": [
+                    {
+                        "wiktionary_language": lang_code,
+                        "permanentId": digest,
+                        "wikstraktor_version": "f391b7f3b6c2a322f2eca90384ff6038851ba541",
+                    }
+                ],
+                "id": entry_id,
+                word: {
+                    "pos": pos,
+                    "pronunciations": [
+                        {
+                            "id": f"{lang_code}-{word}_prn1",
+                            "transcript": transcript,
+                            "sounds": [
+                                {
+                                    "id": f"{lang_code}-{word}_sound{index}",
+                                    "audio": sound.get("audio", ""),
+                                    "text": sound.get("enpr", ""),
+                                    "tags": sound.get("tags", []),
+                                    "ogg_url": sound.get("ogg_url", ""),
+                                    "mp3_url": sound.get("mp3_url", ""),
+                                }
+                                for index, sound in enumerate(sounds, start=1)
+                            ],
+                        }
+                    ],
+                    "senses": {},
+                },
+            }
+
+            senses = transformed_entry[word]["senses"]
+            for sense_index, sense in enumerate(entry.get("senses", [])):
+                sense_key = f"{lang_code}.{entry_id}_{sense_index}"
+                senses[sense_key] = {
+                    "Definitions": [
+                        {
+                            "id": f"{entry_id}_def{index}",
+                            "lang": lang_code,
+                            "definition": definition,
+                        }
+                        for index, definition in enumerate(sense.get("glosses", []))
+                    ],
+                    "Examples": [
+                        {
+                            "id": f"{entry_id}_ex{index}",
+                            "example": example.get("text", ""),
+                        }
+                        for index, example in enumerate(sense.get("examples", []))
+                    ],
+                }
+
+            transformed_data.append(transformed_entry)
+        return json.dumps(transformed_data, indent=2)
 
-    # def convert
 
 if __name__ == "__main__":
     import argparse