From 631df416a23fb088d61dd5586a0f3f5bb6de5832 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Wed, 9 Jun 2021 16:55:04 +0200
Subject: [PATCH 1/3] [FIX] update feature extractor to use a train/test split

---
 classifiers.py                   |  7 +++--
 experimentsClassicClassifiers.py | 27 +++++++++++-------
 features_extractor.py            | 47 ++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/classifiers.py b/classifiers.py
index f68b2c6..c061dac 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -30,10 +30,11 @@ param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'di
 
 grid_params = [
                 ('bayes', None),
+                ('lr', param_grid_lr),
+                ('sgd', param_grid_sgd ),
                 ('svm', param_grid_svm),
                 ('decisionTree', param_grid_decisionTree),
                 ('rfc', param_grid_rfc ),
-                ('lr', param_grid_lr),
-                ('sgd', param_grid_sgd ),
-                ('knn', param_grid_knn),
+                ('knn', param_grid_knn),               
+
                 ]
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 1cc2f91..35da41c 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -72,35 +72,42 @@ for columnInput in [columnText, 'firstParagraph']:
 
     print('Process: ' + columnInput)
 
-    extractor = feature_extractor(df, columnInput, columnClass)
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y  = df[columnClass]
+
+    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
+    encoder = preprocessing.LabelEncoder()
+    train_y = encoder.fit_transform(train_y)
+    valid_y = encoder.fit_transform(test_y)
+
+
+    extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
 
     features_techniques = [
     ('counter',  extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf',  extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
     ('doc2vec',  extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
 
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y  = df[columnClass]
+
 
     #case of full text
     for feature_technique_name, features in features_techniques:
-        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-        encoder = preprocessing.LabelEncoder()
-        train_y = encoder.fit_transform(train_y)
-        valid_y = encoder.fit_transform(test_y)
+
+        # features holds the train_x and test_x matrices produced by the vectorizer
+        train_x, test_x = features
 
         for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
             clf_name, clf = tmp_clf
             grid_param_name, grid_param = tmp_grid_params
             print(clf_name, clf, grid_param_name, grid_param)
             model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
-            
+
             if clf_name != 'bayes' :
                 clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
             elif feature_technique_name == 'doc2vec':
                     continue
-            
+
             t_begin = time.time()
 
             if os.path.isfile(os.path.join('./models', model_file_name)):
diff --git a/features_extractor.py b/features_extractor.py
index a0c99fe..56d1944 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -12,16 +12,17 @@ from nltk.tokenize import word_tokenize
 
 class feature_extractor:
 
-    def __init__(self, data, column, target):
+    def __init__(self, train_x, test_x, column, target):
 
         self.column = column
-        self.data = data
-        self.X = data[column]
-        self.y = data[target]
+        #self.data = data
+        #self.X = data[column]
+        #self.y = data[target]
 
-        self.docs = []
-        for index, row in data.iterrows():
-            self.docs.append(row[column])
+        self.docs_train = train_x[column].tolist()
+        self.docs_test = test_x[column].tolist()
+        #for index, row in data.iterrows():
+        #    self.docs.append(row[column])
 
 
     def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
@@ -36,9 +37,9 @@ class feature_extractor:
 
         stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
 
-        stem_vectorizer_fr.fit(self.docs)
+        stem_vectorizer_fr.fit(self.docs_train)
 
-        return stem_vectorizer_fr.transform(self.docs)
+        return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
 
 
     def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
@@ -53,21 +54,26 @@ class feature_extractor:
             return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
 
         tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
-        tfidf_vectorizer.fit(self.docs)
-        return tfidf_vectorizer.transform(self.docs)
+        tfidf_vectorizer.fit(self.docs_train)
+        return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
 
 
 
 
     def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
-        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+
         model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
 
-        model.build_vocab(tagged_data)
+        model.build_vocab(tagged_tr)
 
         for epoch in range(max_epochs):
             print('iteration {0}'.format(epoch))
-            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
+            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
             # decrease the learning rate
             model.alpha -= 0.0002
             # fix the learning rate, no decay
@@ -78,12 +84,13 @@ class feature_extractor:
         nb_docs_small = len(set_tags)
         doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
 
-        i = 0
-        for t in set_tags:
-            doc_vec_doc2vec[i] = model.docvecs[t]
-            i += 1
-
-        return doc_vec_doc2vec
+        #i = 0
+        #for t in set_tags:
+        #    doc_vec_doc2vec[i] = model.docvecs[t]
+        #    i += 1
+        X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
+        X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+        return X_train, X_test
 
 
     def text_based_features(self):
-- 
GitLab
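
Note on the pattern introduced here: the raw documents are split into train and test folds first, and each vectorizer is then fitted on the training fold only and used to transform both folds, so no test-set vocabulary or document frequencies leak into the features; count_vect() and tf_idf() accordingly now return a (train, test) pair of matrices. A minimal sketch of that pattern, with a toy DataFrame and illustrative column names that are not taken from the project:

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Toy data standing in for the real corpus; 'text' and 'label' are illustrative names.
    df = pd.DataFrame({
        'text':  ['premier article', 'deuxieme article', 'troisieme article', 'quatrieme article'],
        'label': ['a', 'b', 'a', 'b'],
    })
    y = df['label']

    # Split the raw documents before any vectorization.
    train_x, test_x, train_y, test_y = train_test_split(
        df, y, test_size=0.33, random_state=42, stratify=y)

    # Fit on the training fold only, then transform both folds.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_x['text'])
    X_test = vectorizer.transform(test_x['text'])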


From b98176ec8e6147467fe282ddbe2011bbcd93878a Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sat, 12 Jun 2021 16:20:08 +0200
Subject: [PATCH 2/3] [FIX] update doc2vec in feature extractor

---
 features_extractor.py | 45 +++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/features_extractor.py b/features_extractor.py
index 56d1944..e807b08 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.tokenize import word_tokenize
+import spacy
 
 
 class feature_extractor:
@@ -60,36 +61,38 @@ class feature_extractor:
 
 
 
-    def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
-        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
-        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
-        #Tag test set
-        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+    def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count, doc2vec_dm):
 
+        nlp = spacy.load("fr_core_news_sm")
+        stopWords = set(stopwords.words('french'))
 
-        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
 
-        model.build_vocab(tagged_tr)
 
-        for epoch in range(max_epochs):
-            print('iteration {0}'.format(epoch))
-            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
-            # decrease the learning rate
-            model.alpha -= 0.0002
-            # fix the learning rate, no decay
-            model.min_alpha = model.alpha
+        def tokenize_fr_text(sentence):
+
+            punct = string.punctuation
+
+
+            # Tokenize the sentence with spaCy
+            doc = nlp(sentence)
+            # Return the lowercased text of each token, dropping stop words, punctuation and single-character tokens
+            return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in punct and not len(X.text) < 2]
+
+
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+        model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm)
+        model.build_vocab(tagged_tr)
+        model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)
 
 
-        set_tags = list(model.docvecs.doctags)
-        nb_docs_small = len(set_tags)
-        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
 
-        #i = 0
-        #for t in set_tags:
-        #    doc_vec_doc2vec[i] = model.docvecs[t]
-        #    i += 1
         X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
         X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+
         return X_train, X_test
 
 
-- 
GitLab
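
Note on the doc2vec side of the split: unlike the bag-of-words vectorizers, the Doc2Vec model is trained on the tagged training documents only, training-fold vectors are read back by tag, and test-fold vectors are produced with infer_vector(); the manual per-epoch alpha decay loop is replaced by a single model.train(..., epochs=max_epochs) call. A minimal sketch of that pattern, using a plain lower()/split() tokenizer instead of the spaCy + French-stopword tokenizer added by this patch, and the pre-4.0 gensim docvecs access used in the patch (newer gensim exposes the same vectors as model.dv):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    train_docs = ['le premier document du corpus', 'un autre document pour entrainer le modele']
    test_docs  = ['encore un document jamais vu']

    # Tag only the training documents; the test fold is never seen during training.
    tagged_tr = [TaggedDocument(words=d.lower().split(), tags=[str(i)])
                 for i, d in enumerate(train_docs)]

    model = Doc2Vec(vector_size=50, min_count=1, dm=0)
    model.build_vocab(tagged_tr)
    model.train(tagged_tr, total_examples=model.corpus_count, epochs=10)

    # Training-fold vectors come from the trained model; test-fold vectors are inferred.
    X_train = [model.docvecs[str(i)] for i in range(len(tagged_tr))]
    X_test = [model.infer_vector(d.lower().split()) for d in test_docs]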


From e659184ad250574cf37c29465398e9cfd7a3d864 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sat, 12 Jun 2021 17:13:29 +0200
Subject: [PATCH 3/3] [FIX] update classifiers, train/test split and feature
 extraction order

---
 classifiers.py                   |  4 ++--
 experimentsClassicClassifiers.py | 12 ++++++++----
 settings.conf                    | 12 +++++++-----
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/classifiers.py b/classifiers.py
index c061dac..5eb6173 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -24,7 +24,7 @@ classifiers = [
 param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
 param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
-param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
+param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
 param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
 param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
 
@@ -35,6 +35,6 @@ grid_params = [
                 ('svm', param_grid_svm),
                 ('decisionTree', param_grid_decisionTree),
                 ('rfc', param_grid_rfc ),
-                ('knn', param_grid_knn),               
+                ('knn', param_grid_knn),
 
                 ]
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 35da41c..a3f2af2 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -3,6 +3,7 @@ import os
 import time
 import argparse
 import pandas as pd
+import numpy as np
 from data_preprocessing import Preprocessor
 from features_extractor import feature_extractor
 from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
@@ -64,9 +65,12 @@ config.read('settings.conf')
 vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else  float(config.get('vectorizers','vectorization_max_df'))
 vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else  float(config.get('vectorizers','vectorization_min_df'))
 vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
+
 doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
-doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
-doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
+max_epochs = int(config.get('vectorizers','max_epochs'))
+doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
+doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
+doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 
 for columnInput in [columnText, 'firstParagraph']:
 
@@ -76,7 +80,7 @@ for columnInput in [columnText, 'firstParagraph']:
     df = df[df[columnClass] != 'unclassified']
     y  = df[columnClass]
 
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
     encoder = preprocessing.LabelEncoder()
     train_y = encoder.fit_transform(train_y)
     valid_y = encoder.fit_transform(test_y)
@@ -87,7 +91,7 @@ for columnInput in [columnText, 'firstParagraph']:
     features_techniques = [
     ('counter',  extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf',  extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec',  extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    ('doc2vec',  extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count, doc2vec_dm))]
 
 
 
diff --git a/settings.conf b/settings.conf
index f1ef2be..eebf815 100644
--- a/settings.conf
+++ b/settings.conf
@@ -1,8 +1,10 @@
 [vectorizers]
 vectorization_max_df= 1.0
-vectorization_min_df= 1
+vectorization_min_df= 4
 vectorization_numberOfFeatures= None
-doc2vec_vec_size = 300
-doc2vec_epochs = 10
-doc2vec_lr = 0.025
-min_word_per_article = 4
+doc2vec_vec_size = 700
+max_epochs = 10
+doc2vec_min_count = 12
+doc2vec_dm = 0
+doc2vec_workers = 4
+min_word_per_article = 25
-- 
GitLab
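
Note on the configuration changes: the doc2vec settings are renamed and read back with configparser in experimentsClassicClassifiers.py; doc2vec_workers is read but not yet passed on to the Doc2Vec constructor in this patch. A minimal sketch of how the new keys map onto Doc2Vec (inline values mirror the new settings.conf, and the workers wiring is an assumption, not something the patch does):

    import configparser
    from gensim.models.doc2vec import Doc2Vec

    config = configparser.ConfigParser()
    # Inline values mirror settings.conf; the real script calls config.read('settings.conf').
    config.read_dict({'vectorizers': {
        'doc2vec_vec_size': '700',
        'max_epochs': '10',
        'doc2vec_min_count': '12',
        'doc2vec_dm': '0',
        'doc2vec_workers': '4',
    }})

    doc2vec_vec_size = int(config.get('vectorizers', 'doc2vec_vec_size'))
    max_epochs = int(config.get('vectorizers', 'max_epochs'))
    doc2vec_min_count = int(config.get('vectorizers', 'doc2vec_min_count'))
    doc2vec_dm = int(config.get('vectorizers', 'doc2vec_dm'))  # 1 = PV-DM, 0 = PV-DBOW
    doc2vec_workers = int(config.get('vectorizers', 'doc2vec_workers'))

    # Hypothetical wiring of doc2vec_workers; max_epochs is still passed to model.train(...) as in the patch.
    model = Doc2Vec(vector_size=doc2vec_vec_size, min_count=doc2vec_min_count,
                    dm=doc2vec_dm, workers=doc2vec_workers)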