import sys
import os
import time
import argparse
import configparser

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from evaluate_model import evaluate_model

parser = argparse.ArgumentParser()
parser.add_argument("dataPath", help="path of the dataframe")
parser.add_argument("columnText", help="name of the column holding the text to preprocess", default='content')
parser.add_argument("columnClass", help="name of the column holding the classes")
parser.add_argument("minOfInstancePerClass", help="minimum number of instances required for each class", type=int)
parser.add_argument("maxOfInstancePerClass", help="maximum number of instances kept per class when resampling", type=int)

args = parser.parse_args()
dataPath = args.dataPath
columnText = args.columnText
columnClass = args.columnClass
minOfInstancePerClass = args.minOfInstancePerClass
maxOfInstancePerClass = args.maxOfInstancePerClass

# Create a directory under reports/ to save the classification results.
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
    os.makedirs(os.path.join('reports', columnClass, dir_name_report))

# Read the data and apply the preprocessing steps.
preprocessor = Preprocessor()
df_original = pd.read_csv(dataPath)
df = df_original[[columnClass, columnText]].copy()
preprocessor.remove_null_rows(df, columnText)
preprocessor.remove_null_rows(df, columnClass)
#df = split_class(df, columnClass)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
preprocessor.getFirstParagraph(df, columnText, 'paragraphe')  # select the first paragraph of each text

# Read the configuration file to retrieve the feature-extractor parameters.
config = configparser.ConfigParser()
config.read('settings.conf')

max_df_raw = config.get('vectorizers', 'vectorization_max_df')
vectorization_max_df = int(max_df_raw) if max_df_raw.isdigit() else float(max_df_raw)
min_df_raw = config.get('vectorizers', 'vectorization_min_df')
vectorization_min_df = int(min_df_raw) if min_df_raw.isdigit() else float(min_df_raw)
n_features_raw = config.get('vectorizers', 'vectorization_numberOfFeatures')
vectorization_numberOfFeatures = int(n_features_raw) if n_features_raw.isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers', 'doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers', 'doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers', 'doc2vec_lr'))

extractor = feature_extractor(df, columnText, columnClass)
extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)

features_techniques = [
    ('counter', extractor.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
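# For reference, a minimal settings.conf matching the keys read above. The key
# names are taken from the config.get calls in this script; the values below are
# illustrative assumptions only, not the project's actual configuration:
#
#   [vectorizers]
#   vectorization_max_df = 1.0
#   vectorization_min_df = 1
#   vectorization_numberOfFeatures = None
#   doc2vec_vec_size = 300
#   doc2vec_epochs = 10
#   doc2vec_lr = 0.025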
features_techniques_paragraphe = [
    ('counter', extractor_paragraphe.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor_paragraphe.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]

# Prepare the data. The features above were extracted before this filter, so this
# step assumes no 'unclassified' rows survived the earlier cleaning; otherwise the
# features and y would have mismatched lengths.
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]

# Case of the full text.
for feature_technique_name, features in features_techniques:

    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(test_y)  # reuse the mapping fitted on the training labels

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params
        print(clf_name, clf, grid_param_name, grid_param)

        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                continue  # the bayes classifier is not evaluated on doc2vec features
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()

        training_time = t_end - t_begin
        y_pred = clf.predict(test_x)

        # Evaluate the model and write the report.
        file_name_report = feature_technique_name + '_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(
            clf, test_x, valid_y, y_pred, valid_y,
            [str(e) for e in encoder.transform(encoder.classes_)],
            encoder.classes_,
            os.path.join('reports', columnClass, dir_name_report, file_name_report) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output into the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
        sys.stdout = sys.__stdout__  # restore the original standard output

# Case of the first paragraph.
for feature_technique_name, features in features_techniques_paragraphe:

    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(test_y)  # reuse the mapping fitted on the training labels

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params

        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                continue  # the bayes classifier is not evaluated on doc2vec features
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()

        training_time = t_end - t_begin
        y_pred = clf.predict(test_x)

        # Evaluate the model and write the report.
        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(
            clf, test_x, valid_y, y_pred, valid_y,
            [str(e) for e in encoder.transform(encoder.classes_)],
            encoder.classes_,
            os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output into the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
        sys.stdout = sys.__stdout__  # restore the original standard output
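# Example invocation (a minimal sketch; the script name, CSV path, and column
# names are placeholders; only the positional arguments are defined by the
# parser above):
#   python experiments.py data/corpus.csv content domain 200 1500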