Skip to content
Snippets Groups Projects
Commit 4dab4c7e authored by Prénom Nom's avatar Prénom Nom
Browse files

petite amélioration tokénisation fr

parent 0502e06f
No related branches found
No related tags found
No related merge requests found
...@@ -240,7 +240,7 @@ ...@@ -240,7 +240,7 @@
} }
} }
/** /**
* Fonction pour nettoyer le texte * Fonction pour prétraiter le texte
*/ */
function cleanText(text) { function cleanText(text) {
text = text.replace(/[\u2022\u00b7•·■◆▪▸▹▶►▻⇨]/g, " "); // Supprime puces et flèches text = text.replace(/[\u2022\u00b7•·■◆▪▸▹▶►▻⇨]/g, " "); // Supprime puces et flèches
......
...@@ -104,28 +104,45 @@ self.onmessage = async (event) => { ...@@ -104,28 +104,45 @@ self.onmessage = async (event) => {
import re import re
import simplemma import simplemma
from simplemma import langdetect from simplemma import langdetect
# Common French abbreviations (verbose regex; whitespace in the pattern is
# ignored under re.X). FIX: the old pattern ended in \\b right after an
# escaped dot — a word boundary after "." only matches when a word character
# immediately follows the dot, so "etc. " at the end of a clause never
# matched. The boundary before the period is already implicit; drop it.
abrev_pat = re.compile(r"""\\b(
    p\\.ex|M\\.|MM\\.|cf\\.|e\\.g|etc\\.
)""", re.X)
# French tokenizer grammar, tried top to bottom (verbose regex).
# FIX 1: under re.X unescaped whitespace is stripped from the pattern, so
# "pomme de terre" compiled to "pommedeterre" and the multiword expressions
# could never match — the spaces must be escaped.
# FIX 2: the fixed expressions mixed straight (') and curly (’) apostrophes;
# accept both so the match does not depend on the source text's quote style.
tokgrm = re.compile(r"""
    (?:etc\\.|p\\.ex\\.|cf\\.|M\\.)|                                           # abbreviations
    (?:pomme\\ de\\ terre|pomme\\ de\\ pin|c['’]est-à-dire|peut-être|aujourd['’]hui|avant-hier|après-demain|tout-à-l['’]heure)|  # fixed multiword expressions
    \\w+(?=(?:-(?:je|tu|ils?|elles?|nous|vous|leur|lui|les?|ce|t-|même|ci|là)))|  # word before a clitic hyphen (keep word, drop clitic)
    [\\w\\-]+['’]?|                                                            # hyphenated words, optional trailing apostrophe
    [^\\d\\W]+                                                                 # any remaining letter run
""", re.X)
def detect_language(text):
    # Rank candidate languages with simplemma's detector; "unk" when the
    # detector yields no scores at all.
    scores = simplemma.langdetect(text, lang=("fr", "en", "es", "de", "it", "pt"))
    if not scores:
        return "unk"
    return scores[0][0]
def tokenize(text, lang):
    # French gets the grammar-aware tokenizer (tokgrm); every other language
    # falls back to a simple Latin-letter word regex. Matching is done on the
    # lowercased text in both paths.
    lowered = text.lower()
    if lang == "fr":
        raw = tokgrm.findall(lowered)
        # Drop tokens made only of digits and/or basic punctuation.
        return [tok for tok in raw if not re.match(r"^[\\d.,:!?;]+$", tok)]
    return re.findall(r"\\b[a-zA-ZÀ-ÿ'-]+\\b", lowered)
# NOTE(review): data.text is interpolated by the JS worker; double quotes are
# escaped just above, so the triple-quoted literal cannot be broken out of.
text = """${data.text.replace(/\"/g, '\\"')}"""
detected_lang = detect_language(text)
# "other" is presumably the catch-all label downstream — verify simplemma
# accepts it as a lang argument.
if detected_lang == "unk":
    detected_lang = "other"
tokens = tokenize(text, detected_lang)
lemmas = [simplemma.lemmatize(tok, lang=detected_lang) for tok in tokens]
# Count lemma frequencies.
freq = {}
for lemma in lemmas:
    freq[lemma] = freq.get(lemma, 0) + 1
# Last expression: its JSON string is the value handed back to the JS caller.
json.dumps({"lang": detected_lang, "frequencies": freq}, ensure_ascii=False)
`); `);
const parsedResult = JSON.parse(result); const parsedResult = JSON.parse(result);
const detectedLang = parsedResult.lang; const detectedLang = parsedResult.lang;
if (!storedFrequencies[detectedLang]) { if (!storedFrequencies[detectedLang]) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment