diff --git a/experiments/bert_experiments.py b/experiments/bert_experiments.py
deleted file mode 100644
index b4e4bcd52219cb92d830f7d03654e631e113906f..0000000000000000000000000000000000000000
--- a/experiments/bert_experiments.py
+++ /dev/null
@@ -1,349 +0,0 @@
-import pandas as pd
-import numpy as np
-import torch
-import transformers as ppb
-from sklearn.model_selection import train_test_split
-from sklearn import preprocessing
-import statistics
-import os
-import sys
-import argparse
-import configparser
-from transformers import CamembertModel, CamembertTokenizer
-from transformers import FlaubertModel, FlaubertTokenizer
-
-
-from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.linear_model import SGDClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.model_selection import GridSearchCV
-
-
-import matplotlib.pyplot as plt
-from sklearn.metrics import plot_confusion_matrix
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import classification_report
-import seaborn as sns
-
-
-
-
-
-def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
-
-    #classifier, label_list, test_x, valid_y, title = "Confusion matrix"):
-    precision = []
-    recall = []
-    f1 = []
-    support = []
-    weighted_avg = None
-    accuracy = None
-
-    df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
-    report = classification_report( y_pred, valid_y, output_dict = True)
-    for c in classes:
-        precision.append(report[c]['precision'])
-        recall.append(report[c]['recall'])
-        f1.append(report[c]['f1-score'])
-        support.append(report[c]['support'])
-
-    accuracy = report['accuracy']
-    weighted_avg = report['weighted avg']
-    cnf_matrix = confusion_matrix(valid_y, y_pred)
-    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
-    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
-    TP = np.diag(cnf_matrix)
-    TN = cnf_matrix.sum() - (FP + FN + TP)
-
-    df['className'] = classesName
-    df['precision'] = precision
-    df['recall'] = recall
-    df['f1-score'] = f1
-    df['support'] = support
-    df['FP'] = FP
-    df['FN'] = FN
-    df['TP'] = TP
-    df['TN'] = TN
-    #disp = plot_confusion_matrix(classifier, test_x, valid_y,
-    #                             display_labels= label_list,
-    #                             cmap=plt.cm.Blues,
-    #                             normalize=None)
-    #disp.ax_.set_title(title)
-
-    #print(title)
-    #print(disp.confusion_matrix)
-
-    #plt.show()
-    plt.rcParams["font.size"] = 3
-    plot_confusion_matrix(clf, X_test, y_test)
-    plt.savefig(pathSave)
-    return df, accuracy, weighted_avg
-
-
-
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-
-def remove_weak_classes(df, classColumnName, threshold):
-
-    dictOfClassInstances = create_dict(df,classColumnName)
-
-
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    #df = df[df[columnTarget] not in keys]
-    #df = df.merge(df_tmp, how = 'outer' ,indicator=True)
-    df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-
-
-def split_class(df, columnProcessed):
-    i = 0
-    new_df = pd.DataFrame(columns= df.columns)
-    for index, row in df.iterrows():
-        #cls = re.split(';', row[columnProcessed])
-        cls = filter(None, row[columnProcessed].split(';'))
-        cls = list(cls)
-        #cls = re.findall(r"[\w']+", row [columnProcessed])
-        r = row
-        for categ in cls:
-            r[columnProcessed] = categ
-            #new_df.append(r, ignore_index = True)
-            new_df.loc[i] = r
-            i = i + 1
-
-    return new_df
-
-
-def resample_classes(df, classColumnName, numberOfInstances):
-    # numberOfInstances first elements
-    #return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns])
-    #random numberOfInstances elements
-    replace = False # with replacement
-
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-
-
-def select_classifier(argument):
-
-    classifiers = {
-
-        'lr' : LogisticRegression(),
-        'sgd' : SGDClassifier(),
-        'svm' : SVC(),
-        'decisionTree' : DecisionTreeClassifier(),
-        'rfc' : RandomForestClassifier(),
-        'knn' : KNeighborsClassifier()
-    }
-
-    param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
-    param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
-    param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
-    param_grid_lr = { "penalty":['none',"l2"]}
-    param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
-    param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
-
-    grid_params = {
-
-        'lr': param_grid_lr,
-        'sgd': param_grid_sgd,
-        'svm': param_grid_svm,
-        'decisionTree': param_grid_decisionTree,
-        'rfc': param_grid_rfc,
-        'knn': param_grid_knn,
-
-    }
-
-    return classifiers.get(argument), grid_params.get(argument)
-
-
-if __name__ == "__main__":
-
-
-
-
-    print('ok')
-    parser = argparse.ArgumentParser()
-    parser.add_argument("modelName", help="bert or distilBert or camembert or flaubert")
-    parser.add_argument("classifier", help="lr or knn or rfc or decisionTree or sgd or svm")
-
-
-    args = parser.parse_args()
-    arg = args.modelName
-    classifier = args.classifier
-
-    config = configparser.ConfigParser()
-    config.read('parameters.conf')
-
-    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
-    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
-
-    dataPath = config.get('data','dataPath')
-    columnText = config.get('data','columnText')
-    columnClass = config.get('data','columnClass')
-
-
-
-    if not os.path.exists('reports'):
-        os.makedirs('reports')
-
-    if not os.path.exists(os.path.join('reports', columnClass)):
-        os.makedirs(os.path.join('reports', columnClass))
-
-
-    dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
-    if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
-        os.makedirs(os.path.join('reports', columnClass, dir_name_report))
-
-
-
-    # read data
-    print(dataPath)
-    df = pd.read_csv(dataPath)
-    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-    df = resample_classes(df, columnClass, maxOfInstancePerClass)
-
-    print(df.head())
-    print(df.shape)
-    #encode labels
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
-    encoder = preprocessing.LabelEncoder()
-    y = encoder.fit_transform(y)
-
-
-    sentences = df['firstParagraph']
-    labels = y.tolist()
-
-
-
-    # Features Extraction
-    #Bert
-    model_class_bert, tokenizer_class_bert, pretrained_weights_bert = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
-    tokenizer_bert = tokenizer_class_bert.from_pretrained(pretrained_weights_bert)
-    model_bert = model_class_bert.from_pretrained(pretrained_weights_bert)
-    #DistilBert
-    model_class_distilBert, tokenizer_class_distilBert, pretrained_weights_distilBert = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
-    tokenizer_distilBert = tokenizer_class_distilBert.from_pretrained(pretrained_weights_distilBert)
-    model_distilBert = model_class_distilBert.from_pretrained(pretrained_weights_distilBert)
-    #Camembert
-    camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base")
-    camembert = CamembertModel.from_pretrained("camembert/camembert-base")
-    #Flaubert
-
-    flaubert, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased', output_loading_info=True)
-    flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)
-
-
-
-    models = {
-        'bert': model_bert,
-        'distilbert': model_distilBert,
-        'camembert': camembert,
-        'flaubert': flaubert
-    }
-
-    tokenizers = {
-        'bert': tokenizer_bert,
-        'distilbert': tokenizer_distilBert,
-        'camembert': camembert_tokenizer,
-        'flaubert': flaubert_tokenizer
-
-    }
-
-
-
-
-
-    if arg == 'flaubert':
-        model = flaubert
-        tokenizer = flaubert_tokenizer
-    elif arg == 'camembert':
-        model = camembert
-        tokenizer = camembert_tokenizer
-
-    elif arg == 'distilbert':
-        model = model_distilBert
-        tokenizer = tokenizer_distilBert
-
-    elif arg == 'bert':
-        model = model_bert
-        tokenizer = tokenizer_bert
-
-
-
-
-
-    tokenized = sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length = 512, truncation = True)))
-
-    # padding the sequences
-    max_len = 0
-    for i in tokenized.values:
-        if len(i) > max_len:
-            max_len = len(i)
-
-    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
-
-
-
-    # attention mask
-
-    attention_mask = np.where(padded != 0, 1, 0)
-
-
-
-    # get features
-    input_ids = torch.tensor(padded)
-    attention_mask = torch.tensor(attention_mask)
-
-    with torch.no_grad():
-        last_hidden_states = model(input_ids, attention_mask=attention_mask)
-
-    features = last_hidden_states[0][:,0,:].numpy()
-    print(features.shape)
-
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
-
-
-    # classification
-
-
-    clf, grid_param = select_classifier(classifier)
-
-    print(features)
-
-
-
-    clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-
-    clf.fit(train_x, train_y)
-
-    #evaluation
-
-
-    y_pred = clf.predict(test_x)
-
-
-    report, accuracy, weighted_avg = evaluate_model(clf, test_x, test_y, y_pred, test_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.pdf'))
-
-    report.to_csv(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.csv'))
-    with open(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.txt'), 'w') as f:
-
-        sys.stdout = f # Change the standard output to the file we created.
-        print('accuracy : {}'.format(accuracy))
-        print('weighted_Precision : {}'.format(weighted_avg['precision']))
-        print('weighted_Recall : {}'.format(weighted_avg['recall']))
-        print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-        print('weighted_Support : {}'.format(weighted_avg['support']))
-        print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-        #sys.stdout = sys.stdout # Reset the standard output to its original value
-        sys.stdout = sys.__stdout__
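Note on the deleted feature-extraction step: the script pads every document to the global maximum length and runs the entire corpus through the transformer in a single forward pass, so peak memory grows with corpus size. A minimal batched sketch of the same [CLS]-embedding extraction is shown below. It assumes a transformers version with callable tokenizers; the function name extract_cls_features and the batch_size default are illustrative, not from the original script.

    import numpy as np
    import torch

    def extract_cls_features(sentences, model, tokenizer, batch_size=32, max_length=512):
        # Sketch only: batch_size and max_length are illustrative defaults.
        # Computes one [CLS] embedding per sentence, batch by batch, so padding
        # (and memory use) is bounded by the longest sentence in each batch.
        chunks = []
        for start in range(0, len(sentences), batch_size):
            batch = list(sentences[start:start + batch_size])
            enc = tokenizer(batch, padding=True, truncation=True,
                            max_length=max_length, return_tensors='pt')
            with torch.no_grad():
                out = model(enc['input_ids'], attention_mask=enc['attention_mask'])
            # out[0] is the last hidden state; position 0 on the sequence axis is [CLS].
            chunks.append(out[0][:, 0, :].numpy())
        return np.vstack(chunks)

    # Usage mirroring the script above:
    # features = extract_cls_features(df['firstParagraph'].tolist(), model, tokenizer)

The result has the same shape as the script's features array (one row per document, hidden-size columns), so the downstream train_test_split and GridSearchCV calls are unchanged.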