Skip to content
Snippets Groups Projects
Commit a63d5e8e authored by Enzo Simonnet
Browse files

1er wrapper ok

parent 02d6ceef
No related branches found
No related tags found
No related merge requests found
...@@ -17,65 +17,70 @@ class Wiktextract: ...@@ -17,65 +17,70 @@ class Wiktextract:
return None return None
result = parse_page(self.wxr, title, page.body) result = parse_page(self.wxr, title, page.body)
return self.convert(result) return self.convert(result)
#return result
def convert(self, data_format1):
    """Convert parsed wiktextract entries into the wrapper's output format.

    Args:
        data_format1: iterable of dicts, one per part-of-speech entry.
            The keys read here are ``lang_code``, ``pos``, ``word``,
            ``sounds``, ``senses`` and ``translations``; every one of
            them is optional and falls back to an empty value.

    Returns:
        A list with one dict per input entry, holding ``sources``,
        ``id``, a sub-dict keyed by the entry's part of speech (with
        ``pos``, ``pronunciations`` and ``senses``) and ``translations``.
    """
    transformed_data = []
    # Version info is identical for every entry; it is resolved lazily on
    # the first entry so that an empty input never touches git or the
    # package metadata.
    versions = None

    for entry_index, entry in enumerate(data_format1):
        lang_code = entry.get("lang_code", "")
        pos_name = entry.get("pos", "")
        word = entry.get("word", "")
        entry_id = f"{lang_code}-{entry_index}.{pos_name}.{word}"

        if versions is None:
            versions = {
                "wiktextract_version": metadata.version("wiktextract"),
                "wrapper_version": git.Repo(
                    search_parent_directories=True
                ).head.object.hexsha,
            }

        pos_data = {
            "pos": pos_name,
            "pronunciations": [],
            "senses": {},
        }
        transformed_pos = {
            "sources": [{"wiktionary_language": lang_code, **versions}],
            "id": entry_id,
            pos_name: pos_data,
        }

        for pron_index, pron in enumerate(entry.get("sounds", []), start=1):
            pronunciation = {
                "id": f"{entry_id}_prn{pron_index}",
                "transcript": pron.get("ipa", None),
                "sounds": [],
            }
            if "audio" in pron:
                # The first tag is taken to be the accent; an audio item
                # without tags yields None instead of raising IndexError.
                tags = pron.get("tags") or []
                pronunciation["sounds"].append({
                    "accent": tags[0] if tags else None,
                    "url": pron.get("mp3_url", pron.get("ogg_url", "")),
                })
            pos_data["pronunciations"].append(pronunciation)

        for sense_index, sense in enumerate(entry.get("senses", []), start=1):
            sense_id = f"{entry_id}_{pos_name}{sense_index}"
            glosses = sense.get("glosses", []) + sense.get("raw_glosses", [])
            # NOTE(review): definition/example ids carry no sense index, so
            # they repeat across the senses of one entry. The format is kept
            # unchanged here; confirm whether ids must be unique.
            pos_data["senses"][sense_id] = {
                "Definitions": [
                    {
                        "id": f"{lang_code}-{word}{pos_name}_def{gloss_index}",
                        "lang": lang_code,
                        "definition": gloss,
                    }
                    for gloss_index, gloss in enumerate(glosses, start=1)
                ],
                "Examples": [
                    {
                        "id": f"{lang_code}-{word}{pos_name}_ex{example_index}",
                        "example": example.get("text", ""),
                    }
                    for example_index, example in enumerate(
                        sense.get("examples", []), start=1
                    )
                ],
            }

        # Translations are copied through untouched.
        transformed_pos["translations"] = entry.get("translations", [])
        transformed_data.append(transformed_pos)

    return transformed_data
if __name__ == "__main__": if __name__ == "__main__":
import argparse import argparse
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment