Skip to content
Snippets Groups Projects
Commit a8f4d1c7 authored by Mathieu Loiseau
Browse files

Merge no-live

parents 5dc452be 8e440680
No related branches found
No related tags found
No related merge requests found
from wiktextract.page import parse_page
from wiktextract_context import get_wiktextract_context
import json
class Wiktextract:
def __init__(self, wiktlang: str, wordlang: str):
......@@ -14,10 +14,64 @@ class Wiktextract:
if not page:
return None
result = parse_page(self.wxr, title, page.body)
# convert(result)
return result
return convert(result)
def convert(data_format1):
    """Convert wiktextract entries into the wikstraktor-style JSON layout.

    Args:
        data_format1: Mapping whose ``.items()`` yields
            ``(lang_code, entry)`` pairs. Each ``entry`` is a dict that must
            provide ``word`` and ``pos``; ``sounds`` and ``senses`` are
            optional lists of dicts.
            NOTE(review): assumes a mapping keyed by language code, as the
            ``.items()`` loop requires -- confirm this matches what
            ``parse_page`` actually returns.

    Returns:
        A JSON string (2-space indent) encoding a list that holds one
        transformed entry per input item.

    Raises:
        KeyError: If an entry lacks ``word`` or ``pos``.
    """
    # Function-scope import so this block does not depend on edits to the
    # file's top-level import section.
    import hashlib

    transformed_data = []

    for lang_code, entry in data_format1.items():
        word = entry['word']
        pos = entry['pos']
        entry_id = f"{lang_code}-{word}{pos}"
        sounds = entry.get('sounds', [])

        # Built-in hash() on str is salted per interpreter process, so it
        # cannot serve as a *permanent* id. Derive a stable 64-bit signed
        # integer (same type/range as hash()) from a canonical serialization.
        canonical = json.dumps(entry, sort_keys=True).encode("utf-8")
        permanent_id = int.from_bytes(
            hashlib.sha256(canonical).digest()[:8], "big", signed=True
        )

        # The entry exposes a single pronunciation ("_prn1"); use the first
        # IPA transcription found among the sounds, or "" when none exists.
        # NOTE(review): presumably IPA lives under the "ipa" key of a sound
        # dict -- verify against the wiktextract output schema.
        transcript = next(
            (sound["ipa"] for sound in sounds if sound.get("ipa")), ""
        )

        # NOTE(review): the word itself is used as a top-level key, so a word
        # spelled "id" or "sources" would overwrite those fields.
        transformed_entry = {
            "sources": [
                {
                    "wiktionary_language": lang_code,
                    "permanentId": permanent_id,
                    "wikstraktor_version": "f391b7f3b6c2a322f2eca90384ff6038851ba541",
                }
            ],
            "id": entry_id,
            word: {
                "pos": pos,
                "pronunciations": [
                    {
                        "id": f"{lang_code}-{word}_prn1",
                        "transcript": transcript,
                        "sounds": [
                            {
                                "id": f"{lang_code}-{word}_sound{index}",
                                "audio": sound.get("audio", ""),
                                "text": sound.get("enpr", ""),
                                "tags": sound.get("tags", []),
                                "ogg_url": sound.get("ogg_url", ""),
                                "mp3_url": sound.get("mp3_url", ""),
                            }
                            for index, sound in enumerate(sounds, start=1)
                        ],
                    }
                ],
                "senses": {},
            },
        }

        # Senses, definitions and examples are numbered from 0 (sounds from 1).
        senses_out = transformed_entry[word]['senses']
        for sense_index, sense in enumerate(entry.get('senses', [])):
            sense_key = f"{lang_code}.{entry_id}_{sense_index}"
            senses_out[sense_key] = {
                "Definitions": [
                    {
                        "id": f"{entry_id}_def{index}",
                        "lang": lang_code,
                        "definition": definition,
                    }
                    for index, definition in enumerate(sense.get('glosses', []))
                ],
                "Examples": [
                    {
                        "id": f"{entry_id}_ex{index}",
                        "example": example.get('text', ""),
                    }
                    for index, example in enumerate(sense.get('examples', []))
                ],
            }

        transformed_data.append(transformed_entry)

    return json.dumps(transformed_data, indent=2)
if __name__ == "__main__":
import argparse
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment