Merge branch 'master' into 'branch_dev'

# Conflicts:
#	experimentsClassicClassifiers.py

Merge branch 'master' into 'branch_dev'
# Conflicts:
#	experimentsClassicClassifiers.py
60bfb622 · Ludovic Moncla · 1a99bf70 · 7c34555b · 60bfb622 · 60bfb622
Commit 60bfb622 authored 4 years ago by Ludovic Moncla
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ data/dataframe_with_ensemble_domaine_enccre.csv
 data/dataframe_with_normClass_artfl.csv
 dataframe_with_domaine_enccre.csv
 dataframe_with_normClass_artfl.csv
+*.pkl
+.DS_Store
--- a/classifiers.py
+++ b/classifiers.py
@@ -12,11 +12,11 @@ import numpy as np

 classifiers = [
                ('bayes', MultinomialNB()),
+                ('lr', LogisticRegression()),
+                ('sgd', SGDClassifier()),
                ('svm', SVC() ),
                ('decisionTree',DecisionTreeClassifier()),
                ('rfc', RandomForestClassifier()),
-                ('lr', LogisticRegression()),
-                ('sgd', SGDClassifier()),
                ('knn', KNeighborsClassifier())
                ]

@@ -26,7 +26,7 @@ param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
 param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
-param_grid_knn = {'n_neighbors' : list(range(1,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
+param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }

 grid_params = [
                ('bayes', None),

--- a/evaluate_model.py
+++ b/evaluate_model.py
@@ -57,14 +57,3 @@ def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, p
    plt.savefig(pathSave)
    return df, accuracy, weighted_avg

-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.metrics import confusion_matrix
-
-
-#y_true = [2, 0, 2, 2, 0, 1]
-#y_pred = [0, 0, 2, 2, 0, 2]
-#cf_matrix = confusion_matrix(y_true, y_pred)
-#sns.heatmap(cf_matrix, annot=True)
-#import matplotlib.pyplot as plt
-#plt.show()
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -94,50 +94,28 @@ for columnInput in [columnText, 'firstParagraph']:
            clf_name, clf = tmp_clf
            grid_param_name, grid_param = tmp_grid_params
            print(clf_name, clf, grid_param_name, grid_param)
-            model_file_name = columnInput + '_' + feature_technique_name + '_' + clf_name + '_' + str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
-            if clf_name == 'bayes' :
-                if feature_technique_name == 'doc2vec':
-                    continue
-                else:
-                    t_begin = time.time()
-                    # if model exist
-                    if os.path.isfile(os.path.join('./models', model_file_name)):
-                        print('trained model loaded')
-                        with open(os.path.join('./models', model_file_name), 'rb') as file:
-                            clf = pickle.load(file)
-                    else:
-                        print('model training')
-                        #if model not exists we save
-                        with open(os.path.join('./models', model_file_name), 'wb') as file:
-
-                            clf.fit(train_x, train_y)
-                            pickle.dump(clf, file)
-
-                    t_end =time.time()
-                    training_time = t_end - t_begin
-
-                    y_pred = clf.predict(test_x)
-
-            else :
-
+            model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
+            
+            if clf_name != 'bayes' :
                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-                t_begin = time.time()
+            elif feature_technique_name == 'doc2vec':
+                    continue
+            
+            t_begin = time.time()

-                if os.path.isfile(os.path.join('./models', model_file_name)):
-                    print('trained model loaded')
-                    with open(os.path.join('./models', model_file_name), 'rb') as file:
-                        clf = pickle.load(file)
-                else:
-                    print('model training')
-                    with open(os.path.join('./models', model_file_name), 'wb') as file:
-                        clf.fit(train_x, train_y)
-                        pickle.dump(clf, file)
+            if os.path.isfile(os.path.join('./models', model_file_name)):
+                with open(os.path.join('./models', model_file_name), 'rb') as file:
+                    clf = pickle.load(file)
+            else:
+                with open(os.path.join('./models', model_file_name), 'wb') as file:
+                    clf.fit(train_x, train_y)
+                    pickle.dump(clf, file)

-                t_end =time.time()
+            t_end =time.time()

-                training_time = t_end - t_begin
+            training_time = t_end - t_begin

-                y_pred = clf.predict(test_x)
+            y_pred = clf.predict(test_x)

    #evaluate model
            file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name