diff --git a/src/utils/stats.js b/src/utils/stats.js index 09a93070e514d6210d5d7136d12559d9be47bfa7..206afc71d503f54cc91bc5a5a0acb044abcc3373 100644 --- a/src/utils/stats.js +++ b/src/utils/stats.js @@ -240,7 +240,7 @@ } } /** - * Fonction pour nettoyer le texte + * Fonction pour prétraiter le texte */ function cleanText(text) { text = text.replace(/[\u2022\u00b7•·■◆▪▸▹▶►▻⇨]/g, " "); // Supprime puces et flèches diff --git a/src/workers/pyodide_worker.js b/src/workers/pyodide_worker.js index 99c90ab5a824c66a8430554795078b37291b812e..2aa8e8cd19776941ac8db9e328092864d10c5d25 100644 --- a/src/workers/pyodide_worker.js +++ b/src/workers/pyodide_worker.js @@ -104,28 +104,45 @@ self.onmessage = async (event) => { import re import simplemma from simplemma import langdetect - + + abrev_pat = re.compile(r"""\\b( + p\\.ex|M\\.|MM\\.|cf\\.|e\\.g|etc\\. + )\\b""", re.X) + + tokgrm = re.compile(r""" + (?:etc\\.|p\\.ex\\.|cf\\.|M\\.)| + (?:pomme de terre|pomme de pin|c'est-à -dire|peut-être|aujourd'hui|avant-hier|après-demain|tout-à -l’heure)| + \\w+(?=(?:-(?:je|tu|ils?|elles?|nous|vous|leur|lui|les?|ce|t-|même|ci|là )))| + [\\w\\-]+'?| + [^\\d\\W]+ + """, re.X) + def detect_language(text): lang_scores = simplemma.langdetect(text, lang=("fr", "en", "es", "de", "it", "pt")) return lang_scores[0][0] if lang_scores else "unk" - - def tokenize(text): + + def tokenize(text, lang): + if lang == "fr": + tokens = tokgrm.findall(text.lower()) + # Exclure nombres & ponctuation + tokens = [t for t in tokens if not re.match(r"^[\\d.,:!?;]+$", t)] + return tokens return re.findall(r"\\b[a-zA-ZÀ-ÿ'-]+\\b", text.lower()) - + text = """${data.text.replace(/\"/g, '\\"')}""" detected_lang = detect_language(text) if detected_lang == "unk": detected_lang = "other" - - tokens = tokenize(text) + + tokens = tokenize(text, detected_lang) lemmatized_tokens = [simplemma.lemmatize(token, lang=detected_lang) for token in tokens] - + freq = {} for token in lemmatized_tokens: freq[token] = freq.get(token, 0) + 1 - + json.dumps({"lang": detected_lang, "frequencies": freq}, ensure_ascii=False) - `); +`); const parsedResult = JSON.parse(result); const detectedLang = parsedResult.lang; if (!storedFrequencies[detectedLang]) {