From 8d2547be1d96799f0660328a2f908608cfaf6ffd Mon Sep 17 00:00:00 2001
From: Mathieu Loiseau <mathieu.loiseau@liris.cnrs.fr>
Date: Thu, 11 Jan 2024 10:41:45 +0100
Subject: [PATCH] =?UTF-8?q?Quick=20fixes=20pour=20ex=C3=A9cution?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/wiktextract_wrapper.py | 111 +++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 55 deletions(-)

diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
index de33b74..31cbf40 100644
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -14,64 +14,65 @@ class Wiktextract:
         if not page:
             return None
         result = parse_page(self.wxr, title, page.body)
-        return convert(result)
+        return self.convert(result)
 
-    def convert(data_format1)
-		transformed_data = []
-		# Iterate through entries in format 1
-		for lang_code, entry in data_format1.items():
-			transformed_entry = {
-				"sources": [
-					{
-						"wiktionary_language": lang_code,
-						"permanentId": hash(json.dumps(entry)),
-						"wikstraktor_version": "f391b7f3b6c2a322f2eca90384ff6038851ba541"
-					}
-				],
-				"id": f"{lang_code}-{entry['word']}{entry['pos']}",
-				entry['word']: {
-					"pos": entry['pos'],
-					"pronunciations": [
-						{
-							"id": f"{lang_code}-{entry['word']}_prn1",
-							"transcript": pronunciation['ipa'],
-							"sounds": [
-								{
-									"id": f"{lang_code}-{entry['word']}_sound{index}",
-									"audio": sound.get("audio", ""),
-									"text": sound.get("enpr", ""),
-									"tags": sound.get("tags", []),
-									"ogg_url": sound.get("ogg_url", ""),
-									"mp3_url": sound.get("mp3_url", "")
-								} for index, sound in enumerate(entry['sounds'], start=1)
-							]
-						}
-					],
-					"senses": {}
-				}
-			}
+    def convert(self, data_format1):
+        transformed_data = []
+        # Iterate through entries in format 1
+        for entry in data_format1:
+            transformed_entry = {
+                "sources": [
+                    {
+                        "wiktionary_language": entry['lang_code'],
+                        "permanentId": hash(json.dumps(entry)),
+                        "wikstraktor_version": "f391b7f3b6c2a322f2eca90384ff6038851ba541"
+                    }
+                ],
+                "id": f"{entry['lang_code']}-{entry['word']}{entry['pos']}",
+                entry['word']: {
+                    "pos": entry['pos'],
+                    "pronunciations": [
+                        {
+                            "id": f"{entry['lang_code']}-{entry['word']}_prn1",
+                            "transcript": entry['sounds'][0]['ipa'], #quick fix, il faut regarder comment c'est structuré, je pense qu'il faut une fonction qui parcourt tous les items pour savoir lequel contient ipa, en plus là tu as toujours une seule pronunciation, nous on peut en avoir plusieurs dans le wikstraktor…
+                            "sounds": [
+                                {
+                                    "id": f"{entry['lang_code']}-{entry['word']}_sound{index}",
+                                    "audio": sound.get("audio", ""),
+                                    "text": sound.get("enpr", ""),
+                                    "tags": sound.get("tags", []),
+                                    "ogg_url": sound.get("ogg_url", ""),
+                                    "mp3_url": sound.get("mp3_url", "")
+                                } for index, sound in enumerate(entry['sounds'], start=1)
+                            ]
+                        }
+                    ],
+                    "senses": {}
+                }
+            }
 
-			# Iterate through senses in the entry
-			for sense_index, sense in enumerate(entry['senses'], start=0):
-				sense_key = f"{lang_code}.{lang_code}-{entry['word']}{entry['pos']}_{sense_index}"
-				transformed_entry[entry['word']]['senses'][sense_key] = {
-					"Definitions": [
-						{
-							"id": f"{lang_code}-{entry['word']}{entry['pos']}_def{index}",
-							"lang": lang_code,
-							"definition": definition
-						} for index, definition in enumerate(sense.get('glosses', []), start=0)
-					],
-					"Examples": [
-						{
-							"id": f"{lang_code}-{entry['word']}{entry['pos']}_ex{index}",
-							"example": example['text']
-						} for index, example in enumerate(sense.get('examples', []), start=0)
-					]
-				}
+            # Iterate through senses in the entry
+            for sense_index, sense in enumerate(entry['senses'], start=0):
+                # NOTE: of the two lang_code values below, one should be the language code of the Wiktionary edition being used
+                sense_key = f"{entry['lang_code']}.{entry['lang_code']}-{entry['word']}{entry['pos']}_{sense_index}"
+                transformed_entry[entry['word']]['senses'][sense_key] = {
+                    "Definitions": [
+                        {
+                            "id": f"{entry['lang_code']}-{entry['word']}{entry['pos']}_def{index}",
+                            "lang": entry['lang_code'],
+                            "definition": definition
+                        } for index, definition in enumerate(sense.get('glosses', []), start=0)
+                    ],
+                    "Examples": [
+                        {
+                            "id": f"{entry['lang_code']}-{entry['word']}{entry['pos']}_ex{index}",
+                            "example": example['text']
+                        } for index, example in enumerate(sense.get('examples', []), start=0)
+                    ]
+                }
 
-			transformed_data.append(transformed_entry)
-		return json.dumps(transformed_data, indent=2)
+            transformed_data.append(transformed_entry)
+        return json.dumps(transformed_data, indent=2)
 
 if __name__ == "__main__":
     import argparse
-- 
GitLab