diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 530e60d672ef8cdeea668622359582a3eb76e428..17a5409b29ad658339229b13d56d81ce8c8de0d9 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -52,7 +52,7 @@ if not os.path.exists('models'):
 
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-df = pd.read_csv(dataPath)
+df = pd.read_csv(dataPath, sep="\t")
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
 
@@ -72,32 +72,40 @@ doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
 doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
 doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 
 
+print("size after resampling, ",len(df))
 
-for columnInput in [columnText]:
-    print('Process: ' + columnInput)
-
+#prepare data
+df = df[df[columnClass] != 'unclassified']
+y = df[columnClass]
+
+print(df.head())
+
+train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+encoder = preprocessing.LabelEncoder()
+train_y = encoder.fit_transform(train_y)
+valid_y = encoder.fit_transform(test_y)
 
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
+print("size training set, ",len(train_x))
+print("size validation set, ",len(test_x))
 
-    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
+for columnInput in [columnText, 'firstParagraph']:
+
+    print('Process: ' + columnInput)
 
 
     extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
 
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers))]
+    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]
 
 
     #case of full text
     for feature_technique_name, features in features_techniques:
 
+        print("**** Classifier :", feature_technique_name)
+
         # features has the train_x and the test_x after vectorization
         train_x, test_x = features
@@ -115,10 +123,11 @@ for columnInput in [columnText]:
         t_begin = time.time()
 
         if os.path.isfile(os.path.join('./models', model_file_name)):
-            with open(os.path.join('./models', model_file_name), 'rb') as file:
+            report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            with open(os.path.join('./models',columnClass, model_file_name), 'rb') as file:
                 clf = pickle.load(file)
         else:
-            with open(os.path.join('./models', model_file_name), 'wb') as file:
+            with open(os.path.join('./models',columnClass, model_file_name), 'wb') as file:
                 clf.fit(train_x, train_y)
                 pickle.dump(clf, file)
 
@@ -143,5 +152,10 @@ for columnInput in [columnText]:
         print('weighted_Support : {}'.format(weighted_avg['support']))
         print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
         print('training time : {}'.format(training_time))
+        try:
+            print('best parameters : {}'.format(clf.best_params_))
+        except AttributeError:
+            pass
+
         #sys.stdout = sys.stdout # Reset the standard output to its original value
         sys.stdout = sys.__stdout__
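Note on the hoisted split (second hunk): `valid_y = encoder.fit_transform(test_y)` re-fits the `LabelEncoder` on the test labels. Because the split is stratified, both sides should contain the same label set and the sorted mapping happens to coincide, but the conventional scikit-learn pattern fits once on the training labels and only transforms the rest. A minimal sketch, reusing the script's variable names:

```python
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)  # learn the label -> int mapping on the training labels only
valid_y = encoder.transform(test_y)       # reuse that mapping; an unseen test label raises ValueError
```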
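Note on the features list: the doc2vec entry now drops `doc2vec_workers` from the call. `feature_extractor`'s internals are not shown in this patch; assuming a typical gensim-backed implementation, such a method presumably looks roughly like the sketch below (`doc2vec_features` and the whitespace tokenization are illustrative, not the project's actual code):

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def doc2vec_features(train_docs, test_docs, max_epochs, vec_size, min_count, dm):
    # gensim expects each training document wrapped in a TaggedDocument with a unique tag
    corpus = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(train_docs)]
    model = Doc2Vec(vector_size=vec_size, min_count=min_count, dm=dm, epochs=max_epochs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    # infer a fixed-size vector per document for both splits
    train_vecs = [model.infer_vector(doc.split()) for doc in train_docs]
    test_vecs = [model.infer_vector(doc.split()) for doc in test_docs]
    return train_vecs, test_vecs
```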
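Note on the model cache (third hunk): the pickles move from `./models/` to `./models/<columnClass>/`, but the guard visible in the first hunk's context only creates `models` itself, so the `'wb'` branch will fail with `FileNotFoundError` unless the per-class subdirectory is created elsewhere. The added `evaluate_model(...)` call also sits before the cached `clf` is unpickled and before `y_pred` is computed in that branch, which looks unintended. A defensive sketch of the cache-or-fit pattern, reusing the script's names:

```python
import os
import pickle

model_dir = os.path.join('./models', columnClass)
os.makedirs(model_dir, exist_ok=True)      # no-op when the per-class directory already exists

model_path = os.path.join(model_dir, model_file_name)
if os.path.isfile(model_path):
    with open(model_path, 'rb') as file:   # reuse the cached, already-fitted classifier
        clf = pickle.load(file)
else:
    clf.fit(train_x, train_y)
    with open(model_path, 'wb') as file:   # cache the fitted classifier for later runs
        pickle.dump(clf, file)
```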
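Note on the last hunk: the `try`/`except AttributeError` around `clf.best_params_` works because only fitted search wrappers such as scikit-learn's `GridSearchCV` expose that attribute; plain estimators raise `AttributeError`, so the guard keeps non-tuned classifiers from crashing the report. A small illustration with an invented parameter grid:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

clf = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]}, cv=3)
clf.fit(train_x, train_y)                  # best_params_ only exists after fitting a search object

try:
    print('best parameters : {}'.format(clf.best_params_))
except AttributeError:                     # a plain estimator, e.g. SVC(), has no best_params_
    pass
```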