Skip to content
Snippets Groups Projects
Commit 8e440680 authored by Enzo Simonnet's avatar Enzo Simonnet
Browse files

1er essai convert

parent 9908c85c
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,7 @@
/wiktextract
sqlite*
dumps/**-multistream.xml.bz2
/requirements.txt
throttle.ctrl
apicache-py3
......
from wiktextract.page import parse_page
from wiktextract_context import get_wiktextract_context
import json
class Wiktextract:
......@@ -15,5 +15,81 @@ class Wiktextract:
if not page:
return None
result = parse_page(self.wxr, title, page.body)
result2 = convert(result)
return result2
def convert(data_format1):
    """Convert parsed wiktextract entries into the target JSON layout.

    Args:
        data_format1: Either a mapping of language code to entry dict, or an
            iterable of entry dicts that each carry their own ``"lang_code"``
            key. Every entry must provide ``"word"`` and ``"pos"``;
            ``"sounds"`` and ``"senses"`` are optional. ``None`` or an empty
            container yields an empty JSON list.

    Returns:
        A JSON string (``indent=2``) encoding a list with one transformed
        object per input entry.

    Raises:
        KeyError: If an entry lacks ``"word"`` or ``"pos"``.
    """
    # Local import so the block does not depend on a file-level import.
    import hashlib

    if not data_format1:
        return json.dumps([], indent=2)

    if hasattr(data_format1, "items"):
        pairs = data_format1.items()
    else:
        # NOTE(review): the caller passes the parse_page() result straight in;
        # upstream wiktextract appears to return a list of entry dicts with a
        # "lang_code" field rather than a mapping -- confirm against the
        # wiktextract version in use.
        pairs = ((item.get("lang_code", ""), item) for item in data_format1)

    transformed_data = []
    for lang_code, entry in pairs:
        word = entry["word"]
        pos = entry["pos"]
        sounds = entry.get("sounds") or []
        entry_id = f"{lang_code}-{word}{pos}"

        # The built-in hash() of a str is salted per process, so it cannot
        # serve as a permanent id. Derive a signed 64-bit int (same type and
        # range as hash()) from a SHA-256 digest of the canonical JSON form.
        digest = hashlib.sha256(
            json.dumps(entry, sort_keys=True).encode("utf-8")
        ).digest()
        permanent_id = int.from_bytes(digest[:8], "big", signed=True)

        # Use the first sound that carries an IPA transcription, if any.
        transcript = next(
            (sound["ipa"] for sound in sounds if sound.get("ipa")), ""
        )

        # NOTE(review): definition/example ids restart at 0 for every sense,
        # so they are not unique within an entry; kept as-is for output
        # compatibility.
        senses = {}
        for sense_index, sense in enumerate(entry.get("senses") or []):
            sense_key = f"{lang_code}.{entry_id}_{sense_index}"
            senses[sense_key] = {
                "Definitions": [
                    {
                        "id": f"{entry_id}_def{index}",
                        "lang": lang_code,
                        "definition": definition,
                    }
                    for index, definition in enumerate(
                        sense.get("glosses") or []
                    )
                ],
                "Examples": [
                    {
                        "id": f"{entry_id}_ex{index}",
                        "example": example.get("text", ""),
                    }
                    for index, example in enumerate(
                        sense.get("examples") or []
                    )
                ],
            }

        # NOTE(review): the word itself is used as a top-level key, so a word
        # spelled "id" or "sources" would overwrite those fields; kept as-is
        # for output compatibility.
        transformed_entry = {
            "sources": [
                {
                    "wiktionary_language": lang_code,
                    "permanentId": permanent_id,
                    "wikstraktor_version": "f391b7f3b6c2a322f2eca90384ff6038851ba541",
                }
            ],
            "id": entry_id,
            word: {
                "pos": pos,
                "pronunciations": [
                    {
                        "id": f"{lang_code}-{word}_prn1",
                        "transcript": transcript,
                        "sounds": [
                            {
                                "id": f"{lang_code}-{word}_sound{index}",
                                "audio": sound.get("audio", ""),
                                "text": sound.get("enpr", ""),
                                "tags": sound.get("tags", []),
                                "ogg_url": sound.get("ogg_url", ""),
                                "mp3_url": sound.get("mp3_url", ""),
                            }
                            for index, sound in enumerate(sounds, start=1)
                        ],
                    }
                ],
                "senses": senses,
            },
        }
        transformed_data.append(transformed_entry)

    return json.dumps(transformed_data, indent=2)
# # Input JSON data in format 1
# json_format1 = '''
# ... (your JSON data here)
# '''
# data_format1 = json.loads(json_format1)
# # Transform data from format 1 to format 2
# transformed_data = transform_format1_to_format2(data_format1)
# # Convert the transformed data to JSON format 2
# json_format2 = json.dumps(transformed_data, indent=2)
# # Print or save the transformed JSON format 2
# print(json_format2)
return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment