diff --git a/src/utils/stats.js b/src/utils/stats.js
index 09a93070e514d6210d5d7136d12559d9be47bfa7..206afc71d503f54cc91bc5a5a0acb044abcc3373 100644
--- a/src/utils/stats.js
+++ b/src/utils/stats.js
@@ -240,7 +240,7 @@
         }
     }
     /**
-     * Fonction pour nettoyer le texte
+     * Fonction pour prétraiter le texte
      */
     function cleanText(text) {
         text = text.replace(/[\u2022\u00b7•·■◆▪▸▹▶►▻⇨]/g, " ");  // Supprime puces et flèches
diff --git a/src/workers/pyodide_worker.js b/src/workers/pyodide_worker.js
index 99c90ab5a824c66a8430554795078b37291b812e..2aa8e8cd19776941ac8db9e328092864d10c5d25 100644
--- a/src/workers/pyodide_worker.js
+++ b/src/workers/pyodide_worker.js
@@ -104,28 +104,45 @@ self.onmessage = async (event) => {
         import re
         import simplemma
         from simplemma import langdetect
-        
+
+        abrev_pat = re.compile(r"""\\b(
+            p\\.ex|M\\.|MM\\.|cf\\.|e\\.g|etc\\.
+        )\\b""", re.X)  # NOTE(review): abrev_pat is never used below — tokgrm already covers these abbreviations; consider removing
+
+        tokgrm = re.compile(r"""
+            (?:etc\\.|p\\.ex\\.|cf\\.|M\\.)|
+            (?:pomme de terre|pomme de pin|c'est-à-dire|peut-être|aujourd'hui|avant-hier|après-demain|tout-à-l'heure)|
+            \\w+(?=(?:-(?:je|tu|ils?|elles?|nous|vous|leur|lui|les?|ce|t-|même|ci|là)))|
+            [\\w\\-]+'?|
+            [^\\d\\W]+
+        """, re.X)
+
         def detect_language(text):
             lang_scores = simplemma.langdetect(text, lang=("fr", "en", "es", "de", "it", "pt"))
             return lang_scores[0][0] if lang_scores else "unk"
-        
-        def tokenize(text):
+
+        def tokenize(text, lang):
+            if lang == "fr":
+                tokens = tokgrm.findall(text.lower())
+                # Exclure nombres & ponctuation
+                tokens = [t for t in tokens if not re.match(r"^[\\d.,:!?;]+$", t)]
+                return tokens
             return re.findall(r"\\b[a-zA-ZÀ-ÿ'-]+\\b", text.lower())
-        
+
         text = """${data.text.replace(/\"/g, '\\"')}"""
         detected_lang = detect_language(text)
         if detected_lang == "unk":
             detected_lang = "other"
-        
-        tokens = tokenize(text)
+
+        tokens = tokenize(text, detected_lang)
         lemmatized_tokens = [simplemma.lemmatize(token, lang=detected_lang) for token in tokens]
-        
+
         freq = {}
         for token in lemmatized_tokens:
             freq[token] = freq.get(token, 0) + 1
-        
+
         json.dumps({"lang": detected_lang, "frequencies": freq}, ensure_ascii=False)
-        `);
+`);
           const parsedResult = JSON.parse(result);
           const detectedLang = parsedResult.lang;
           if (!storedFrequencies[detectedLang]) {