Commit a5158391 authored by Khalleud

[ADD] train bert finetuning & predict & evaluate

parent e7f6f159
Merge request !5: Branch dev bert exp

evaluate_bertFineTuning.py 0 → 100644
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
    # classification_report expects (y_true, y_pred): true labels come first.
    report = classification_report(true_labels_, pred_labels_, output_dict=True)

    classes = [str(e) for e in encoder.transform(encoder.classes_)]
    classesName = encoder.classes_

    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    # Collect the per-class metrics from the report.
    precision = []
    recall = []
    f1 = []
    support = []
    dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])

    # Derive per-class FP/FN/TP/TN counts from the confusion matrix.
    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    dff['className'] = classesName
    dff['precision'] = precision
    dff['recall'] = recall
    dff['f1-score'] = f1
    dff['support'] = support
    dff['FP'] = FP
    dff['FN'] = FN
    dff['TP'] = TP
    dff['TN'] = TN
    return dff, accuracy, weighted_avg
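
# A minimal usage sketch (an illustrative assumption, not part of the commit):
# perfect predictions over three classes, just to show the returned structures.
if __name__ == '__main__':
    from sklearn import preprocessing
    encoder = preprocessing.LabelEncoder()
    true_labels_ = list(encoder.fit_transform(['sport', 'politics', 'sport', 'culture']))
    pred_labels_ = list(true_labels_)  # hypothetical model output
    dff, accuracy, weighted_avg = evaluate_bertFineTuning(pred_labels_, true_labels_, encoder)
    print(dff)
    print(accuracy, weighted_avg)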
main.py 0 → 100644
import os
import torch
import pandas as pd
import numpy as np
import configparser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning
def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the classes that have at least `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Downsample each class to at most numberOfInstances random rows.
    replace = False  # sample without replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
def main():
    config = configparser.ConfigParser()
    config.read('bert_settings.conf')

    dataPath = config.get('general', 'dataPath')
    columnText = config.get('general', 'columnText')
    columnClass = config.get('general', 'columnClass')
    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model', 'tokeniser')
    chosen_model = config.get('model', 'model')
    max_len = int(config.get('model', 'max_len_sequences'))
    batch_size = int(config.get('model', 'batch_size'))
    epochs = int(config.get('model', 'epochs'))

    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()
    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)
    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # Train the model.
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # Save the model.
    model_save_name = config.get('model', 'modelName')
    path = config.get('model', 'path')
    torch.save(model, os.path.join(path, model_save_name))

    # Print the model parameters.
    params = list(model.named_parameters())
    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Predict on the held-out test split.
    sentences_to_predict = test_x[columnText].values
    test_labels = test_y.tolist()
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_labels, max_len, batch_size=batch_size)
    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)

    # Evaluate.
    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
    print(result_df)
    print(accuracy)
    print(weighted_avg)


if __name__ == "__main__":
    main()
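
# For reference, a minimal bert_settings.conf covering the keys read above
# (all values are illustrative assumptions, not taken from this commit):
#
#   [general]
#   dataPath = data.csv
#   columnText = text
#   columnClass = label
#   minOfInstancePerClass = 50
#   maxOfInstancePerClass = 1500
#
#   [model]
#   model = camembert-base
#   tokeniser = camembert-base
#   max_len_sequences = 256
#   batch_size = 32
#   epochs = 4
#   modelName = bert_model.pt
#   path = ./models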
predict_bertFineTuning.py 0 → 100644
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size=32):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        # The multilingual model is cased, so do not lowercase the input.
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                    # Sentence to encode.
            add_special_tokens=True, # Add '[CLS]' and '[SEP]'.
        )
        input_ids_test.append(encoded_sent)

    # Truncate or pad every sequence to exactly max_len tokens.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1 for real tokens, 0 for padding.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader; a SequentialSampler keeps predictions in input order.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
    return prediction_dataloader
def predict_class_bertFineTuning(model, prediction_dataloader):
    # Use the GPU if one is available.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Move the model to the device and put it in evaluation mode.
    model.to(device)
    model.eval()

    # Tracking variables.
    predictions_test, true_labels = [], []

    # Predict.
    for batch in prediction_dataloader:
        # Add batch to the device.
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels.
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print(' DONE.')

    # The predictions for each batch are an ndarray with one column per class.
    # Pick the label with the highest logit for each sample.
    pred_labels = []
    for i in range(len(true_labels)):
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    # Flatten the per-batch results into flat label lists.
    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_
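
# Example usage (an illustrative assumption, not part of the commit), with a
# trained `model` and held-out `sentences` / `labels`:
#
#   dataloader = generate_prediction_dataloader('camembert-base', sentences, labels, max_len=256, batch_size=32)
#   pred_labels_, true_labels_ = predict_class_bertFineTuning(model, dataloader)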
def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    for sent in sentences_to_predict:
        # `encode` tokenizes, adds '[CLS]' and '[SEP]', and maps tokens to IDs.
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
        )
        input_ids_test.append(encoded_sent)

    # Pad to a common length and build the attention mask (1 = token, 0 = pad).
    max_len = max(len(i) for i in input_ids_test)
    padded = [i + [0] * (max_len - len(i)) for i in input_ids_test]
    b_input_ids = torch.tensor(padded)
    b_input_mask = torch.tensor([[float(t > 0) for t in seq] for seq in padded])

    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
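
# Illustrative call (an assumption, not part of the commit): classify two raw
# strings with a trained model kept on the CPU.
#
#   preds = predict_instance_bertFineTuning('camembert-base', model, ["Premier texte.", "Deuxième texte."])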
training_bertFineTuning.py 0 → 100644 (diff collapsed)