From b9d052a7c46a6c35fb09070074897d2ac0fd2240 Mon Sep 17 00:00:00 2001
From: Khalleud <>
Date: Mon, 7 Jun 2021 17:43:45 +0200
Subject: [PATCH 1/2] [FIX] experimentationclassicclassifiers: fix saving and
 loading of trained models

---                   |  2 ++ | 24 ++++++++++++++----------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/ b/
index 16db401..96e90ab 100644
--- a/
+++ b/
@@ -22,6 +22,8 @@ classifiers = [
 param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
+#param_grid_svm = {'C':[1,10],'gamma':[1], 'kernel':['linear','rbf']}
+#param_grid_svm = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
 param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
diff --git a/ b/
index c6a9f72..091a4ef 100644
--- a/
+++ b/
@@ -51,9 +51,8 @@ if not os.path.exists('models'):
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-df_original = pd.read_csv(dataPath)
+df = pd.read_csv(dataPath)
-df = df_original[[columnClass,columnText]].copy()
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
@@ -73,7 +72,7 @@ for columnInput in [columnText, 'firstParagraph']:
     print('Process: ' + columnInput)
-    extractor = feature_extractor(df,columnText, columnClass)
+    extractor = feature_extractor(df, columnInput, columnClass)
     features_techniques = [
     ('counter',  extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
@@ -95,19 +94,22 @@ for columnInput in [columnText, 'firstParagraph']:
             clf_name, clf = tmp_clf
             grid_param_name, grid_param = tmp_grid_params
             print(clf_name, clf, grid_param_name, grid_param)
-            model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
+            model_file_name = columnInput + '_' + feature_technique_name + '_' + clf_name + '_' + str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
             if clf_name == 'bayes' :
                 if feature_technique_name == 'doc2vec':
                     t_begin = time.time()
                     # if model exist
-                    if os.path.isfile(os.path.join('./model', model_file_name)):
-                        with open(model_file_name, 'rb') as file:
+                    if os.path.isfile(os.path.join('./models', model_file_name)):
+                        print('trained model loaded')
+                        with open(os.path.join('./models', model_file_name), 'rb') as file:
                             clf = pickle.load(file)
+                        print('model training')
                         #if model not exists we save
-                        with open(Pkl_Filename, 'wb') as file:
+                        with open(os.path.join('./models', model_file_name), 'wb') as file:
                   , train_y)
                             pickle.dump(clf, file)
@@ -121,11 +123,13 @@ for columnInput in [columnText, 'firstParagraph']:
                 clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
                 t_begin = time.time()
-                if os.path.isfile(os.path.join('./model', model_file_name)):
-                    with open(model_file_name, 'rb') as file:
+                if os.path.isfile(os.path.join('./models', model_file_name)):
+                    print('trained model loaded')
+                    with open(os.path.join('./models', model_file_name), 'rb') as file:
                         clf = pickle.load(file)
-                    with open(Pkl_Filename, 'wb') as file:
+                    print('model training')
+                    with open(os.path.join('./models', model_file_name), 'wb') as file:
               , train_y)
                         pickle.dump(clf, file)

From 1a99bf702be41a3b707d0e67226c20e79421b713 Mon Sep 17 00:00:00 2001
From: Khalleud <>
Date: Mon, 7 Jun 2021 18:04:44 +0200
Subject: [PATCH 2/2] [FIX] write report to csv instead of the txt file

---                   | 2 -- | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/ b/
index 96e90ab..16db401 100644
--- a/
+++ b/
@@ -22,8 +22,6 @@ classifiers = [
 param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
-#param_grid_svm = {'C':[1,10],'gamma':[1], 'kernel':['linear','rbf']}
-#param_grid_svm = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
 param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
diff --git a/ b/
index 091a4ef..958752e 100644
--- a/
+++ b/
@@ -143,10 +143,10 @@ for columnInput in [columnText, 'firstParagraph']:
             file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name
             report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)],  encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            report.to_csv(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.csv'))
             with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
                 sys.stdout = f # Change the standard output to the file we created.
-                print(report)
                 print('accuracy : {}'.format(accuracy))
                 print('weighted_Precision : {}'.format(weighted_avg['precision']))
                 print('weighted_Recall    : {}'.format(weighted_avg['recall']))