From bc79aeb71dc23142d71f077302c90815f17306c5 Mon Sep 17 00:00:00 2001 From: Khalleud <ledk14@gmail.com> Date: Thu, 3 Jun 2021 14:07:48 +0200 Subject: [PATCH] Remove unused files --- projet/main_1.py | 71 ---------------------------------------- projet/main_2.py | 73 ----------------------------------------- projet/main_3.py | 85 ------------------------------------------------ 3 files changed, 229 deletions(-) delete mode 100644 projet/main_1.py delete mode 100644 projet/main_2.py delete mode 100644 projet/main_3.py diff --git a/projet/main_1.py b/projet/main_1.py deleted file mode 100644 index 4fcb689..0000000 --- a/projet/main_1.py +++ /dev/null @@ -1,71 +0,0 @@ -import pandas as pd -from data_preprocessing import Preprocessor -from features_extractor import feature_extractor -from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -from classifiers import classifiers, grid_params -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -from evaluate_model import evaluate_model -from sklearn.naive_bayes import MultinomialNB - -# Reading data -df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") -df_normClass_artfl = df[['normClass_artfl','content']].copy() - -#remove null values of class column and text column -preprocessor = Preprocessor() -preprocessor.remove_null_rows(df_normClass_artfl, 'content') -preprocessor.remove_null_rows(df_normClass_artfl, 'normClass_artfl') -df_normClass_artfl = split_class(df_normClass_artfl, 'normClass_artfl') - -minOfInstancePerClass = 200 -maxOfInstancePerClass = 1500 - -#remove weak classes and resample classes - -df_normClass_artfl = remove_weak_classes(df_normClass_artfl, 'normClass_artfl', minOfInstancePerClass ) -df_normClass_artfl = resample_classes(df_normClass_artfl, 'normClass_artfl', maxOfInstancePerClass) - -preprocessor.saveDataFrametoCSV(df_normClass_artfl,'df_normClass_artfl.csv') -#features extraction step -#df_normClass_artfl = pd.read_csv('df_normClass_artfl.csv') -extractor = feature_extractor(df_normClass_artfl,'content', 'normClass_artfl') - -X_count_vect = extractor.count_vect() -X_tf = extractor.tf_idf() -#X_doc2vec = extractor.doc2vec(10, 20, 0.025) -#X_text_feature = extractor.text_based_features() - - -# preparing the train and test data -df_normClass_artfl = df_normClass_artfl[df_normClass_artfl['normClass_artfl'] != 'unclassified'] -y = df_normClass_artfl['normClass_artfl'] - -train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y ) -encoder = preprocessing.LabelEncoder() -train_y = encoder.fit_transform(train_y) -valid_y = encoder.fit_transform(test_y) - - - -# fit the model -m = MultinomialNB() - -m.fit(train_x, train_y) - - -y_pred = m.predict(test_x) - - - - -#evaluate model - - -report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_) -print(report) -print('accuracy : {}'.format(accuracy)) -print('weighted_Precision : {}'.format(weighted_avg['precision'])) -print('weighted_Recall : {}'.format(weighted_avg['recall'])) -print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) -print('weighted_Support : {}'.format(weighted_avg['support'])) diff --git a/projet/main_2.py b/projet/main_2.py deleted file mode 100644 index 8de72fe..0000000 --- a/projet/main_2.py +++ /dev/null @@ -1,73 +0,0 @@ -import pandas as pd -from data_preprocessing import Preprocessor -from features_extractor import feature_extractor -from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -from classifiers import classifiers, grid_params -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -from evaluate_model import evaluate_model -from sklearn.naive_bayes import MultinomialNB - -# Reading data -df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") -df_domaine_enccre = df[['_domaine_enccre','content']].copy() - -#remove null values of class column and text column -preprocessor = Preprocessor() -preprocessor.remove_null_rows(df_domaine_enccre, 'content') -preprocessor.remove_null_rows(df_domaine_enccre, '_domaine_enccre') -df_domaine_enccre = split_class(df_domaine_enccre, '_domaine_enccre') - -minOfInstancePerClass = 200 -maxOfInstancePerClass = 1500 - -#remove weak classes and resample classes - -df_domaine_enccre = remove_weak_classes(df_domaine_enccre, '_domaine_enccre', minOfInstancePerClass ) -df_domaine_enccre = resample_classes(df_domaine_enccre, '_domaine_enccre', maxOfInstancePerClass) - -preprocessor.saveDataFrametoCSV(df_domaine_enccre,'df_domaine_enccre.csv') -#features extraction step -#df_domaine_enccre = pd.read_csv('df_domaine_enccre.csv') - - -extractor = feature_extractor(df_domaine_enccre,'content', '_domaine_enccre') - -X_count_vect = extractor.count_vect() -X_tf = extractor.tf_idf() -#X_doc2vec = extractor.doc2vec(10, 20, 0.025) -#X_text_feature = extractor.text_based_features() - - -# preparing the train and test data -df_domaine_enccre = df_domaine_enccre[df_domaine_enccre['domaine_enccre'] != 'unclassified'] -y = df_domaine_enccre['domaine_enccre'] - -train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y ) -encoder = preprocessing.LabelEncoder() -train_y = encoder.fit_transform(train_y) -valid_y = encoder.fit_transform(test_y) - - - -# fit the model -m = MultinomialNB() - -m.fit(train_x, train_y) - - -y_pred = m.predict(test_x) - - - - -#evaluate model - - -report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_) -print(report) -print('accuracy : {}'.format(accuracy)) -print('weighted_Precision : {}'.format(weighted_avg['precision'])) -print('weighted_Recall : {}'.format(weighted_avg['recall'])) -print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) -print('weighted_Support : {}'.format(weighted_avg['support'])) diff --git a/projet/main_3.py b/projet/main_3.py deleted file mode 100644 index ad7102f..0000000 --- a/projet/main_3.py +++ /dev/null @@ -1,85 +0,0 @@ -import pandas as pd -import numpy as np -from data_preprocessing import Preprocessor -from features_extractor import feature_extractor -from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -from classifiers import classifiers, grid_params -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -from evaluate_model import evaluate_model -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV - - -# Reading data -df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") -df_ensemble_domaine_enccre = df[['ensemble_domaine_enccre','content']].copy() - -#remove null values of class column and text column -preprocessor = Preprocessor() -preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'content') -preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre') -#df_ensemble_domaine_enccre = split_class(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre') - -minOfInstancePerClass = 200 -maxOfInstancePerClass = 1500 - -#remove weak classes and resample classes -print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')) -df_ensemble_domaine_enccre = remove_weak_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', minOfInstancePerClass ) -df_ensemble_domaine_enccre = resample_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', maxOfInstancePerClass) - -print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')) - -#preprocessor.saveDataFrametoCSV(df_ensemble_domaine_enccre,'df_ensemble_domaine_enccre.csv') -#features extraction step -#df_ensemble_domaine_enccre = pd.read_csv('df_ensemble_domaine_enccre.csv') - - -extractor = feature_extractor(df_ensemble_domaine_enccre,'content', 'ensemble_domaine_enccre') - -X_count_vect = extractor.count_vect() -#X_tf = extractor.tf_idf() -#X_doc2vec = extractor.doc2vec(10, 20, 0.025) -#X_text_feature = extractor.text_based_features() - - -# preparing the train and test data -df_ensemble_domaine_enccre = df_ensemble_domaine_enccre[df_ensemble_domaine_enccre['ensemble_domaine_enccre'] != 'unclassified'] -y = df_ensemble_domaine_enccre['ensemble_domaine_enccre'] - -train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y ) -encoder = preprocessing.LabelEncoder() -train_y = encoder.fit_transform(train_y) -valid_y = encoder.fit_transform(test_y) - - - -# fit the model - -m = LogisticRegression() #MultinomialNB() -#m.fit(train_x, train_y) - -param_grid_lr = {"C":np.logspace(-3,3,7)} - -clf = GridSearchCV(m, param_grid = param_grid_lr, cv = 5, verbose=True, n_jobs=-1) - -# Fit on data - -best_clf = clf.fit(train_x, train_y) - -y_pred = clf.predict(test_x) - - - - -#evaluate model - - -report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_) -print(report) -print('accuracy : {}'.format(accuracy)) -print('weighted_Precision : {}'.format(weighted_avg['precision'])) -print('weighted_Recall : {}'.format(weighted_avg['recall'])) -print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) -print('weighted_Support : {}'.format(weighted_avg['support'])) -- GitLab