diff --git a/experiments/bert_experiments.py b/experiments/bert_experiments.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4e4bcd52219cb92d830f7d03654e631e113906f
--- /dev/null
+++ b/experiments/bert_experiments.py
@@ -0,0 +1,349 @@
+import pandas as pd
+import numpy as np
+import torch
+import transformers as ppb
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import os
+import sys
+import argparse
+import configparser
+from transformers import CamembertModel, CamembertTokenizer
+from transformers import FlaubertModel, FlaubertTokenizer
+
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.model_selection import GridSearchCV
+
+import matplotlib.pyplot as plt
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+
+
+def evaluate_model(clf, X_test, y_test, y_pred, classes, classesName, pathSave):
+    """Build a per-class report (precision, recall, F1, support and the
+    confusion-matrix counts FP/FN/TP/TN) and save a confusion-matrix plot."""
+    precision = []
+    recall = []
+    f1 = []
+    support = []
+
+    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+    # classification_report expects (y_true, y_pred), in that order
+    report = classification_report(y_test, y_pred, output_dict=True)
+    for c in classes:
+        precision.append(report[c]['precision'])
+        recall.append(report[c]['recall'])
+        f1.append(report[c]['f1-score'])
+        support.append(report[c]['support'])
+
+    accuracy = report['accuracy']
+    weighted_avg = report['weighted avg']
+    cnf_matrix = confusion_matrix(y_test, y_pred)
+    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+    TP = np.diag(cnf_matrix)
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+
+    df['className'] = classesName
+    df['precision'] = precision
+    df['recall'] = recall
+    df['f1-score'] = f1
+    df['support'] = support
+    df['FP'] = FP
+    df['FN'] = FN
+    df['TP'] = TP
+    df['TN'] = TN
+
+    plt.rcParams["font.size"] = 3
+    plot_confusion_matrix(clf, X_test, y_test)
+    plt.savefig(pathSave)
+    return df, accuracy, weighted_avg
+
+
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+    # keep only the classes that occur at least `threshold` times
+    dictOfClassInstances = create_dict(df, classColumnName)
+    keys = [k for k, v in dictOfClassInstances.items() if v >= threshold]
+    return df[df[classColumnName].isin(keys)]
+
+
+def split_class(df, columnProcessed):
+    # one row per class: a row labelled 'a;b' becomes two rows, 'a' and 'b'
+    i = 0
+    new_df = pd.DataFrame(columns=df.columns)
+    for index, row in df.iterrows():
+        cls = list(filter(None, row[columnProcessed].split(';')))
+        r = row.copy()  # copy so the source frame is never mutated
+        for categ in cls:
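+# Hypothetical illustration of split_class on toy data (not executed by the
+# pipeline; the domain labels below are made up for the example):
+#   toy = pd.DataFrame({'label': ['Anatomie;Chirurgie'], 'text': ['...']})
+#   split_class(toy, 'label')  # -> two rows, labelled 'Anatomie' and 'Chirurgie'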
+            r[columnProcessed] = categ
+            new_df.loc[i] = r
+            i += 1
+    return new_df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+    # draw at most `numberOfInstances` random rows per class, without replacement
+    replace = False
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace), :]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+def select_classifier(argument):
+    # return an unfitted classifier and its hyper-parameter grid
+    classifiers = {
+        'lr': LogisticRegression(),
+        'sgd': SGDClassifier(),
+        'svm': SVC(),
+        'decisionTree': DecisionTreeClassifier(),
+        'rfc': RandomForestClassifier(),
+        'knn': KNeighborsClassifier()
+    }
+
+    param_grid_svm = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
+    param_grid_decisionTree = {'criterion': ['gini', 'entropy'], 'max_depth': range(5, 10), 'min_samples_split': range(5, 10), 'min_samples_leaf': range(1, 5)}
+    param_grid_rfc = {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}
+    param_grid_lr = {'penalty': ['none', 'l2']}
+    param_grid_sgd = {'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'none'], 'max_iter': [500]}
+    param_grid_knn = {'n_neighbors': list(range(3, 20)), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}
+
+    grid_params = {
+        'lr': param_grid_lr,
+        'sgd': param_grid_sgd,
+        'svm': param_grid_svm,
+        'decisionTree': param_grid_decisionTree,
+        'rfc': param_grid_rfc,
+        'knn': param_grid_knn,
+    }
+
+    return classifiers.get(argument), grid_params.get(argument)
+
+
+if __name__ == "__main__":
+
+    # usage: python bert_experiments.py <modelName> <classifier>
+    parser = argparse.ArgumentParser()
+    parser.add_argument("modelName", help="bert or distilbert or camembert or flaubert")
+    parser.add_argument("classifier", help="lr or knn or rfc or decisionTree or sgd or svm")
+
+    args = parser.parse_args()
+    arg = args.modelName
+    classifier = args.classifier
+
+    config = configparser.ConfigParser()
+    config.read('parameters.conf')
+
+    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))
+
+    dataPath = config.get('data', 'dataPath')
+    columnText = config.get('data', 'columnText')
+    columnClass = config.get('data', 'columnClass')
+
+    dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
+    # makedirs creates the intermediate 'reports' and class directories as needed
+    os.makedirs(os.path.join('reports', columnClass, dir_name_report), exist_ok=True)
+
+    # read data
+    print(dataPath)
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+
+    print(df.head())
+    print(df.shape)
+
+    # encode labels
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+    # NB: the first paragraph of each article is used as input text;
+    # columnText from parameters.conf is read above but not applied here
+    sentences = df['firstParagraph']
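+
+    # Feature-extraction strategy (see the code below): each paragraph is
+    # tokenized, padded to the longest sequence in the corpus, masked, and run
+    # through the chosen transformer; the final hidden state of the [CLS]
+    # token (last_hidden_states[0][:, 0, :]) then serves as a fixed-size
+    # document embedding for the downstream scikit-learn classifiers.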
+
+    # Features extraction
+    # Bert
+    model_class_bert, tokenizer_class_bert, pretrained_weights_bert = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
+    tokenizer_bert = tokenizer_class_bert.from_pretrained(pretrained_weights_bert)
+    model_bert = model_class_bert.from_pretrained(pretrained_weights_bert)
+    # DistilBert
+    model_class_distilBert, tokenizer_class_distilBert, pretrained_weights_distilBert = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
+    tokenizer_distilBert = tokenizer_class_distilBert.from_pretrained(pretrained_weights_distilBert)
+    model_distilBert = model_class_distilBert.from_pretrained(pretrained_weights_distilBert)
+    # Camembert
+    camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base")
+    camembert = CamembertModel.from_pretrained("camembert/camembert-base")
+    # Flaubert
+    flaubert, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased', output_loading_info=True)
+    flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)
+
+    # NB: all four checkpoints are loaded up front even though only one is used
+    models = {
+        'bert': model_bert,
+        'distilbert': model_distilBert,
+        'camembert': camembert,
+        'flaubert': flaubert
+    }
+
+    tokenizers = {
+        'bert': tokenizer_bert,
+        'distilbert': tokenizer_distilBert,
+        'camembert': camembert_tokenizer,
+        'flaubert': flaubert_tokenizer
+    }
+
+    if arg not in models:
+        sys.exit('unknown model name: {}'.format(arg))
+    model = models[arg]
+    tokenizer = tokenizers[arg]
+
+    tokenized = sentences.apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))
+
+    # pad every sequence to the longest one in the corpus
+    max_len = max(len(i) for i in tokenized.values)
+    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
+
+    # attention mask: 1 on real tokens, 0 on padding
+    attention_mask = np.where(padded != 0, 1, 0)
+
+    # get features; note that the whole corpus goes through the model in a
+    # single batch, which can exhaust memory on large datasets
+    input_ids = torch.tensor(padded)
+    attention_mask = torch.tensor(attention_mask)
+
+    with torch.no_grad():
+        last_hidden_states = model(input_ids, attention_mask=attention_mask)
+
+    features = last_hidden_states[0][:, 0, :].numpy()
+    print(features.shape)
+
+    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
+
+    # classification
+    clf, grid_param = select_classifier(classifier)
+    clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
+    clf.fit(train_x, train_y)
+
+    # evaluation
+    y_pred = clf.predict(test_x)
+
+    report, accuracy, weighted_avg = evaluate_model(clf, test_x, test_y, y_pred, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.pdf'))
+
+    report.to_csv(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.csv'))
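+
+    # clf is the fitted GridSearchCV object, so the winning configuration
+    # could also be persisted here (hypothetical, not part of the run):
+    #   print(clf.best_params_, clf.best_score_)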
+    with open(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.txt'), 'w') as f:
+        # write the evaluation summary to the report file
+        print('accuracy : {}'.format(accuracy), file=f)
+        print('weighted_Precision : {}'.format(weighted_avg['precision']), file=f)
+        print('weighted_Recall : {}'.format(weighted_avg['recall']), file=f)
+        print('weighted_F-score : {}'.format(weighted_avg['f1-score']), file=f)
+        print('weighted_Support : {}'.format(weighted_avg['support']), file=f)
+        print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))), file=f)
diff --git a/experiments/parameters.conf b/experiments/parameters.conf
new file mode 100644
index 0000000000000000000000000000000000000000..df584e4ab43603f86828c17c4c7cacaaaf6437ee
--- /dev/null
+++ b/experiments/parameters.conf
@@ -0,0 +1,10 @@
+[general]
+
+minOfInstancePerClass = 7
+maxOfInstancePerClass = 1200
+
+[data]
+
+dataPath = ../Data/dataframe_with_ensemble_domaine_enccre.csv
+columnText = contentWithoutClass
+columnClass = ensemble_domaine_enccre
diff --git a/experiments/requirements.txt b/experiments/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..076fef87790fd62e4a4101a85d64ceff09083b9a
--- /dev/null
+++ b/experiments/requirements.txt
@@ -0,0 +1,7 @@
+transformers==4.3.2
+sentencepiece
+scikit-learn<1.2  # plot_confusion_matrix was removed in scikit-learn 1.2
+pandas
+numpy
+torch==1.8.1
+matplotlib