# Train and evaluate a Multinomial Naive Bayes text classifier that predicts
# the '_domaine_enccre' class of an article from its 'content', using
# bag-of-words counts as features.

import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB

# --- Reading data ---------------------------------------------------------
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
# Work on a copy restricted to the class column and the text column.
df_domaine_enccre = df[['_domaine_enccre', 'content']].copy()

# --- Cleaning -------------------------------------------------------------
# Remove rows with null values in the text column and in the class column.
# The return value is discarded, so remove_null_rows presumably mutates the
# frame in place -- TODO confirm against data_preprocessing.Preprocessor.
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_domaine_enccre, '_domaine_enccre')

df_domaine_enccre = split_class(df_domaine_enccre, '_domaine_enccre')

# --- Class balancing ------------------------------------------------------
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500

# Remove weak classes and resample classes, using the thresholds above.
df_domaine_enccre = remove_weak_classes(
    df_domaine_enccre, '_domaine_enccre', minOfInstancePerClass
)
df_domaine_enccre = resample_classes(
    df_domaine_enccre, '_domaine_enccre', maxOfInstancePerClass
)

# Persist the prepared frame so the steps above can be skipped on later runs.
preprocessor.saveDataFrametoCSV(df_domaine_enccre, 'df_domaine_enccre.csv')

# --- Feature extraction ---------------------------------------------------
# To restart from the saved frame instead of re-running the preparation:
#   df_domaine_enccre = pd.read_csv('df_domaine_enccre.csv')
extractor = feature_extractor(df_domaine_enccre, 'content', '_domaine_enccre')
X_count_vect = extractor.count_vect()
# Computed but not used below; kept because the call may affect the
# extractor's state.
X_tf = extractor.tf_idf()
# Other feature sets exposed by the extractor (disabled):
#   X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#   X_text_feature = extractor.text_based_features()

# --- Preparing the train and test data ------------------------------------
# NOTE(review): this step reads 'domaine_enccre' (no leading underscore)
# while every step above uses '_domaine_enccre'. Confirm that the column
# exists at this point; otherwise this raises KeyError.
# NOTE(review): rows are dropped here *after* the features were extracted,
# so X_count_vect may have more rows than y, which train_test_split rejects.
# Confirm that no 'unclassified' rows remain, or move this filter before the
# feature extraction step.
df_domaine_enccre = df_domaine_enccre[
    df_domaine_enccre['domaine_enccre'] != 'unclassified'
]
y = df_domaine_enccre['domaine_enccre']

train_x, test_x, train_y, test_y = train_test_split(
    X_count_vect, y, test_size=0.33, random_state=42, stratify=y
)

# Fit the label encoder on the training labels only, then reuse that same
# mapping for the test labels. Re-fitting on the test split could assign
# different integer codes to the same class and would hide unseen labels.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y)

# --- Fit the model --------------------------------------------------------
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)

# --- Evaluate the model ---------------------------------------------------
# The third argument is the list of encoded class ids rendered as strings;
# the fourth is the original class names, in the encoder's order.
encoded_class_ids = [str(e) for e in encoder.transform(encoder.classes_)]
report, accuracy, weighted_avg = evaluate_model(
    y_pred, valid_y, encoded_class_ids, encoder.classes_
)

print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))