diff --git a/.gitignore b/.gitignore
index 770acf4e909e9d5f4a5ad58d7db70917a4776d3c..71a9a39badd27be6c7da868fc886597fefba921c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ data/dataframe_with_ensemble_domaine_enccre.csv
 data/dataframe_with_normClass_artfl.csv
 dataframe_with_domaine_enccre.csv
 dataframe_with_normClass_artfl.csv
+*.pkl
+.DS_Store
diff --git a/classifiers.py b/classifiers.py
index 16db401d1b54bfd1d667e2e9c7a32a1e5b298e5f..f68b2c6519a28447f62aa24426ef707455155f4f 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -12,11 +12,11 @@ import numpy as np
 
 classifiers = [
     ('bayes', MultinomialNB()),
+    ('lr', LogisticRegression()),
+    ('sgd', SGDClassifier()),
     ('svm', SVC() ),
     ('decisionTree',DecisionTreeClassifier()),
     ('rfc', RandomForestClassifier()),
-    ('lr', LogisticRegression()),
-    ('sgd', SGDClassifier()),
     ('knn', KNeighborsClassifier())
 ]
 
@@ -26,7 +26,7 @@ param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
 param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
-param_grid_knn = {'n_neighbors' : list(range(1,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
+param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
 
 grid_params = [
     ('bayes', None),
diff --git a/evaluate_model.py b/evaluate_model.py
index 8abd2e9fc70578d30e11f7c54162a45521d1fe6e..f258ccd2d45248c3e67e6846f9f18cbc9c66aebc 100644
--- a/evaluate_model.py
+++ b/evaluate_model.py
@@ -57,14 +57,3 @@ def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, p
     plt.savefig(pathSave)
     return df, accuracy, weighted_avg
 
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.metrics import confusion_matrix
-
-
-#y_true = [2, 0, 2, 2, 0, 1]
-#y_pred = [0, 0, 2, 2, 0, 2]
-#cf_matrix = confusion_matrix(y_true, y_pred)
-#sns.heatmap(cf_matrix, annot=True)
-#import matplotlib.pyplot as plt
-#plt.show()
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 958752ec98fd3840f1fcd175b04ab69cd7b5db90..1cc2f91dac3edb0d237da6605ce743a112c0e01c 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -94,50 +94,28 @@ for columnInput in [columnText, 'firstParagraph']:
             clf_name, clf = tmp_clf
             grid_param_name, grid_param = tmp_grid_params
             print(clf_name, clf, grid_param_name, grid_param)
-            model_file_name = columnInput + '_' + feature_technique_name + '_' + clf_name + '_' + str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
-            if clf_name == 'bayes' :
-                if feature_technique_name == 'doc2vec':
-                    continue
-                else:
-                    t_begin = time.time()
-                    # if model exist
-                    if os.path.isfile(os.path.join('./models', model_file_name)):
-                        print('trained model loaded')
-                        with open(os.path.join('./models', model_file_name), 'rb') as file:
-                            clf = pickle.load(file)
-                    else:
-                        print('model training')
-                        #if model not exists we save
-                        with open(os.path.join('./models', model_file_name), 'wb') as file:
-
-                            clf.fit(train_x, train_y)
-                            pickle.dump(clf, file)
-
-                    t_end =time.time()
-                    training_time = t_end - t_begin
-
-                    y_pred = clf.predict(test_x)
-
-            else :
-
+            model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
+
+            if clf_name != 'bayes' :
                 clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-                t_begin = time.time()
+            elif feature_technique_name == 'doc2vec':
+                continue
+
+            t_begin = time.time()
 
-                if os.path.isfile(os.path.join('./models', model_file_name)):
-                    print('trained model loaded')
-                    with open(os.path.join('./models', model_file_name), 'rb') as file:
-                        clf = pickle.load(file)
-                else:
-                    print('model training')
-                    with open(os.path.join('./models', model_file_name), 'wb') as file:
-                        clf.fit(train_x, train_y)
-                        pickle.dump(clf, file)
+            if os.path.isfile(os.path.join('./models', model_file_name)):
+                with open(os.path.join('./models', model_file_name), 'rb') as file:
+                    clf = pickle.load(file)
+            else:
+                with open(os.path.join('./models', model_file_name), 'wb') as file:
+                    clf.fit(train_x, train_y)
+                    pickle.dump(clf, file)
 
-                t_end =time.time()
+            t_end =time.time()
 
-                training_time = t_end - t_begin
+            training_time = t_end - t_begin
 
-                y_pred = clf.predict(test_x)
+            y_pred = clf.predict(test_x)
 
             #evaluate model
             file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name