Commit ca1e73a7 authored by Ludovic Moncla

Delete bert_experiments.py

parent 9611e3ca
import os
import sys
import argparse
import configparser

import numpy as np
import pandas as pd
import torch
import transformers as ppb
import matplotlib.pyplot as plt

from transformers import CamembertModel, CamembertTokenizer
from transformers import FlaubertModel, FlaubertTokenizer

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its direct replacement.
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
    """Build a per-class report (precision/recall/F1/support plus the
    confusion-matrix counts FP, FN, TP, TN) and save a confusion-matrix
    plot to pathSave."""
    precision = []
    recall = []
    f1 = []
    support = []
    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred) in that order;
    # the original call had the two arguments swapped.
    report = classification_report(valid_y, y_pred, output_dict=True)
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])
    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    # derive per-class FP/FN/TP/TN counts from the confusion matrix
    cnf_matrix = confusion_matrix(valid_y, y_pred)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    df['className'] = classesName
    df['precision'] = precision
    df['recall'] = recall
    df['f1-score'] = f1
    df['support'] = support
    df['FP'] = FP
    df['FN'] = FN
    df['TP'] = TP
    df['TN'] = TN

    plt.rcParams["font.size"] = 3
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    plt.savefig(pathSave)
    return df, accuracy, weighted_avg
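
# Worked example of the FP/FN/TP/TN arithmetic above (hypothetical labels):
# with valid_y = [0, 0, 1, 1] and y_pred = [0, 1, 1, 1],
# confusion_matrix gives [[1, 1], [0, 2]]; for class 0 that means
#   TP = 1 (diagonal), FP = 0 (column sum minus diagonal),
#   FN = 1 (row sum minus diagonal), TN = 4 - (0 + 1 + 1) = 2.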
def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the classes that have at least `threshold` instances.
    # The direct isin() filter replaces the original concat/drop_duplicates
    # round-trip, which could also drop duplicated rows within kept classes.
    dictOfClassInstances = create_dict(df, classColumnName)
    keys = [k for k, v in dictOfClassInstances.items() if v >= threshold]
    return df[df[classColumnName].isin(keys)]
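
# Example (toy data): with labels ['a', 'a', 'a', 'b', 'b'] and threshold 3,
# only the three 'a' rows survive:
#   toy = pd.DataFrame({'label': ['a', 'a', 'a', 'b', 'b']})
#   remove_weak_classes(toy, 'label', 3)   # -> rows 0, 1, 2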
def split_class(df, columnProcessed):
    # Rows whose class column holds several ';'-separated labels are
    # duplicated, one output row per label.
    i = 0
    new_df = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        cls = list(filter(None, row[columnProcessed].split(';')))
        for categ in cls:
            r = row.copy()
            r[columnProcessed] = categ
            new_df.loc[i] = r
            i += 1
    return new_df
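
# Example (toy data, hypothetical label values): a row labelled 'geo;hist'
# becomes two rows, one labelled 'geo' and one labelled 'hist':
#   toy = pd.DataFrame({'text': ['...'], 'label': ['geo;hist']})
#   split_class(toy, 'label')   # -> 2 rows sharing the same text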
def resample_classes(df, classColumnName, numberOfInstances):
    # Down-sample: draw at most `numberOfInstances` random rows
    # (without replacement) from each class.
    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace=False), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
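
# Example (toy data): with numberOfInstances = 2, a class holding 5 rows is
# cut down to 2 random rows, while a class holding a single row keeps it:
#   resample_classes(toy, 'label', 2)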
def select_classifier(argument):
    classifiers = {
        'lr': LogisticRegression(),
        'sgd': SGDClassifier(),
        'svm': SVC(),
        'decisionTree': DecisionTreeClassifier(),
        'rfc': RandomForestClassifier(),
        'knn': KNeighborsClassifier()
    }
    # NB: some grid values target older scikit-learn releases: penalty 'none'
    # and SGD loss 'log' were later renamed (None, 'log_loss'), and the
    # RandomForest max_features value 'auto' was removed.
    param_grid_svm = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
    param_grid_decisionTree = {'criterion': ['gini', 'entropy'], 'max_depth': range(5, 10), 'min_samples_split': range(5, 10), 'min_samples_leaf': range(1, 5)}
    param_grid_rfc = {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}
    param_grid_lr = {'penalty': ['none', 'l2']}
    param_grid_sgd = {'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'none'], 'max_iter': [500]}
    param_grid_knn = {'n_neighbors': list(range(3, 20)), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}
    grid_params = {
        'lr': param_grid_lr,
        'sgd': param_grid_sgd,
        'svm': param_grid_svm,
        'decisionTree': param_grid_decisionTree,
        'rfc': param_grid_rfc,
        'knn': param_grid_knn,
    }
    return classifiers.get(argument), grid_params.get(argument)
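
# Usage: both pieces feed GridSearchCV in the main block below, e.g.
#   clf, grid = select_classifier('svm')
#   search = GridSearchCV(clf, grid, refit=True, verbose=3)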
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("modelName", help="bert or distilBert or camembert or flaubert")
    parser.add_argument("classifier", help="lr or knn or rfc or decisionTree or sgd or svm")
    args = parser.parse_args()
    # normalise the casing so 'distilBert' (as in the help text) matches
    # the lowercase dictionary keys used below
    arg = args.modelName.lower()
    classifier = args.classifier

    config = configparser.ConfigParser()
    config.read('parameters.conf')
    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))
    dataPath = config.get('data', 'dataPath')
    columnText = config.get('data', 'columnText')
    columnClass = config.get('data', 'columnClass')
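
    # parameters.conf is not part of this commit; a minimal sketch matching
    # the keys read above would look like this (all values are placeholders):
    #   [general]
    #   minOfInstancePerClass = 50
    #   maxOfInstancePerClass = 500
    #   [data]
    #   dataPath = data/corpus.csv
    #   columnText = firstParagraph
    #   columnClass = domain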
    # one report directory per (class column, min_max) configuration;
    # makedirs with exist_ok=True replaces the original chain of existence checks
    dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
    os.makedirs(os.path.join('reports', columnClass, dir_name_report), exist_ok=True)
    # read data
    print(dataPath)
    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    print(df.head())
    print(df.shape)

    # encode labels
    df = df[df[columnClass] != 'unclassified']
    y = df[columnClass]
    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)
    # use the text column configured in parameters.conf
    # (the original hard-coded df['firstParagraph'] and never used columnText)
    sentences = df[columnText]
    labels = y.tolist()
    # Feature extraction: the four model/tokenizer pairs are loaded up front,
    # although only the one selected on the command line is actually used.
    # Bert
    tokenizer_bert = ppb.BertTokenizer.from_pretrained('bert-base-uncased')
    model_bert = ppb.BertModel.from_pretrained('bert-base-uncased')
    # DistilBert
    tokenizer_distilBert = ppb.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model_distilBert = ppb.DistilBertModel.from_pretrained('distilbert-base-uncased')
    # Camembert
    camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base")
    camembert = CamembertModel.from_pretrained("camembert/camembert-base")
    # Flaubert
    flaubert, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased', output_loading_info=True)
    flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)

    models = {
        'bert': model_bert,
        'distilbert': model_distilBert,
        'camembert': camembert,
        'flaubert': flaubert
    }
    tokenizers = {
        'bert': tokenizer_bert,
        'distilbert': tokenizer_distilBert,
        'camembert': camembert_tokenizer,
        'flaubert': flaubert_tokenizer
    }

    # guard against a misspelled model name (the original if/elif chain,
    # which these dictionary lookups replace, would fail with a NameError)
    if arg not in models:
        sys.exit('unknown model name: ' + arg)
    model = models[arg]
    tokenizer = tokenizers[arg]
    tokenized = sentences.apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))

    # pad every sequence with zeros up to the longest one in the corpus
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])

    # attention mask: 1 on real tokens, 0 on padding
    attention_mask = np.where(padded != 0, 1, 0)
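
    # Toy illustration of the padding step (hypothetical token ids):
    # the sequences [5, 7] and [5, 7, 9] become
    #   padded         = [[5, 7, 0], [5, 7, 9]]
    #   attention_mask = [[1, 1, 0], [1, 1, 1]]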
    # get features: one forward pass over the whole corpus, keeping the
    # hidden state of the first ([CLS]) token of each sentence
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    features = last_hidden_states[0][:, 0, :].numpy()
    print(features.shape)
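
    # For the base-size models used here the hidden size is 768, so
    # `features` has shape (n_sentences, 768). Note that a single forward
    # pass over the full corpus can exhaust memory on large datasets;
    # batching the inputs would be the usual workaround.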
    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)

    # classification: grid search over the hyper-parameters of the chosen classifier
    clf, grid_param = select_classifier(classifier)
    clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
    clf.fit(train_x, train_y)

    # evaluation
    y_pred = clf.predict(test_x)
    report, accuracy, weighted_avg = evaluate_model(clf, test_x, test_y, y_pred, test_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.pdf'))
    report.to_csv(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.csv'))

    with open(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier + '.txt'), 'w') as f:
        # write directly to the file instead of redirecting sys.stdout
        print('accuracy : {}'.format(accuracy), file=f)
        print('weighted_Precision : {}'.format(weighted_avg['precision']), file=f)
        print('weighted_Recall : {}'.format(weighted_avg['recall']), file=f)
        print('weighted_F-score : {}'.format(weighted_avg['f1-score']), file=f)
        print('weighted_Support : {}'.format(weighted_avg['support']), file=f)
        print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))), file=f)
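
# Example invocation (assuming parameters.conf sits next to the script):
#   python bert_experiments.py camembert svm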