Commit a5158391 authored by Khalleud

[ADD] train bert finetuning & predict & evaluate

parent e7f6f159
Merge request !5: Branch dev bert exp
evaluate_bertFineTuning.py 0 → 100644
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
    # classification_report expects (y_true, y_pred)
    report = classification_report(true_labels_, pred_labels_, output_dict=True)

    classes = [str(e) for e in encoder.transform(encoder.classes_)]
    classesName = encoder.classes_

    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    precision = []
    recall = []
    f1 = []
    support = []
    dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])

    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])

    # Per-class error counts derived from the confusion matrix
    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    dff['className'] = classesName
    dff['precision'] = precision
    dff['recall'] = recall
    dff['f1-score'] = f1
    dff['support'] = support
    dff['FP'] = FP
    dff['FN'] = FN
    dff['TP'] = TP
    dff['TN'] = TN

    return dff, accuracy, weighted_avg
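A quick sanity check for the per-class counts computed above: a minimal sketch, using a made-up 3x3 confusion matrix (the numbers are illustrative only, not taken from the dataset).

# Sketch: per-class FP/FN/TP/TN from a confusion matrix (illustrative matrix).
import numpy as np

cnf = np.array([[5, 1, 0],
                [2, 7, 1],
                [0, 1, 4]])       # rows = true class, columns = predicted class

TP = np.diag(cnf)                 # correctly predicted per class -> [5, 7, 4]
FP = cnf.sum(axis=0) - TP         # predicted as the class but wrong -> [2, 2, 1]
FN = cnf.sum(axis=1) - TP         # belonging to the class but missed -> [1, 3, 1]
TN = cnf.sum() - (FP + FN + TP)   # everything else -> [13, 9, 15]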
main.py 0 → 100644
import os

import torch
import pandas as pd
import numpy as np
import configparser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Cap each class at numberOfInstances randomly chosen rows (without replacement).
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def main():
    config = configparser.ConfigParser()
    config.read('bert_settings.conf')

    dataPath = config.get('general', 'dataPath')
    columnText = config.get('general', 'columnText')
    columnClass = config.get('general', 'columnClass')

    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model', 'tokeniser')
    chosen_model = config.get('model', 'model')

    max_len = int(config.get('model', 'max_len_sequences'))
    batch_size = int(config.get('model', 'batch_size'))
    epochs = int(config.get('model', 'epochs'))

    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()

    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)

    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # Call train method
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # Save the model
    model_save_name = config.get('model', 'modelName')
    path = config.get('model', 'path')
    torch.save(model, os.path.join(path, model_save_name))

    # Print the model parameters
    params = list(model.named_parameters())
    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Call predict method on the held-out test split
    sentences_to_predict = test_x[columnText].values
    test_labels = test_y.tolist()
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_labels, max_len, batch_size=32)
    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)

    # Call evaluate
    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)

    print(result_df)
    print(accuracy)
    print(weighted_avg)


if __name__ == "__main__":
    main()
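main.py reads its run parameters from bert_settings.conf. A minimal sketch of a template generator for that file, covering exactly the keys read above; all values (paths, column names, hyper-parameters) are illustrative assumptions, not the project's actual settings.

# Sketch only: writes a template bert_settings.conf with the keys main.py reads.
import configparser

config = configparser.ConfigParser()
config['general'] = {
    'dataPath': 'data/corpus.csv',              # placeholder path
    'columnText': 'text',                       # placeholder column name
    'columnClass': 'label',                     # placeholder column name
    'minOfInstancePerClass': '50',
    'maxOfInstancePerClass': '1500',
}
config['model'] = {
    'tokeniser': 'bert-base-multilingual-cased',
    'model': 'bert-base-multilingual-cased',    # or 'camembert-base'
    'max_len_sequences': '256',
    'batch_size': '32',
    'epochs': '4',
    'modelName': 'bert_finetuned.pt',           # placeholder file name
    'path': 'models',                           # placeholder directory
}

with open('bert_settings.conf', 'w') as f:
    config.write(f)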
predict_bertFineTuning.py 0 → 100644
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer


def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size=32):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # Pad or truncate our input tokens to max_len
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1s for each real token followed by 0s for padding
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader


def predict_class_bertFineTuning(model, prediction_dataloader):

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print(' DONE.')

    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for this batch are a 2-D ndarray (one column per class).
        # Pick the label with the highest logit for each sentence.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    # Flatten the per-batch lists into flat lists of labels.
    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_


def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    for sent in sentences_to_predict:
        # `encode` tokenizes the sentence, adds '[CLS]'/'[SEP]' and maps tokens to their IDs.
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids_test.append(encoded_sent)

    # Pad to the longest sequence in this batch and build the attention masks.
    max_len_batch = max(len(i) for i in input_ids_test)
    padded = [i + [0] * (max_len_batch - len(i)) for i in input_ids_test]
    masks = [[float(tok > 0) for tok in seq] for seq in padded]

    device = next(model.parameters()).device
    b_input_ids = torch.tensor(padded).to(device)
    b_input_mask = torch.tensor(masks).to(device)

    model.eval()
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

    # Return the predicted class index for each sentence.
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
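A minimal usage sketch of the two prediction helpers above, assuming `model` is the fine-tuned model returned by training_bertFineTuning; the sentences, labels and max_len below are illustrative placeholders.

# Sketch only: illustrative inputs for the prediction helpers.
from predict_bertFineTuning import generate_prediction_dataloader, predict_class_bertFineTuning

sentences_to_predict = ["first test sentence", "second test sentence"]  # placeholder texts
labels = [0, 1]                                                         # placeholder encoded labels

dataloader = generate_prediction_dataloader(
    'camembert-base', sentences_to_predict, labels, max_len=128, batch_size=32)

pred_labels_, true_labels_ = predict_class_bertFineTuning(model, dataloader)
print(list(zip(pred_labels_, true_labels_)))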
training_bertFineTuning.py
@@ -2,48 +2,38 @@ import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
-from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
-import os
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-def remove_weak_classes(df, classColumnName, threshold):
-    dictOfClassInstances = create_dict(df,classColumnName)
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-def resample_classes(df, classColumnName, numberOfInstances):
-    #random numberOfInstances elements
-    replace = False # with replacement
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-##############################################################################################################
-########################## Setup GPU #########################################################################
-##############################################################################################################
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs = 4):
# If there's a GPU available...
if torch.cuda.is_available():
@@ -63,62 +53,18 @@ else:
-#############################################################################################################
-########################## parameters ###################################################################
-###########################################################################################################
-config = configparser.ConfigParser()
-config.read('settings.conf')
-dataPath = config.get('general','dataPath')
-columnText = config.get('general','columnText')
-columnClass = config.get('general','columnClass')
-minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
-maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
-chosen_tokeniser = config.get('model','tokeniser')
-chosen_model = config.get('model','model')
-max_len = int(config.get('model','max_len_sequences'))
-#############################################################################################################
-########################## Load Data ###################################################################
-###########################################################################################################
-df = pd.read_csv(dataPath)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-df = resample_classes(df, columnClass, maxOfInstancePerClass)
-df = df[df[columnClass] != 'unclassified']
-y = df[columnClass]
-numberOfClasses = y.nunique()
-encoder = preprocessing.LabelEncoder()
-y = encoder.fit_transform(y)
-sentences = train_x[columnText].values
-labels = train_y.tolist()
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################
-# Load the BERT tokenizer.
-print('Loading BERT tokenizer...')
-tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
+if chosen_model == 'bert-base-multilingual-cased' :
+    print('Loading Bert Tokenizer...')
+    tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+elif chosen_model == 'camembert-base':
+    print('Loading Camembert Tokenizer...')
+    tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
@@ -157,7 +103,7 @@ for i in input_ids:
        padded.extend([i + [0] * (max_len - len(i))])
-padded = input_ids = np.array(padded)
+padded = np.array(padded)
@@ -177,11 +123,9 @@ for sent in padded:
# Use 90% for training and 10% for validation.
-train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
-    random_state=2018, test_size=0.1, stratify = labels )
+train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels )
# Do the same for the masks.
-train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
-    random_state=2018, test_size=0.1, stratify = labels)
+train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels)
# Convert all inputs and labels into torch tensors, the required datatype
@@ -197,13 +141,11 @@ validation_masks = torch.tensor(validation_masks)
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
-batch_size = int(config.get('model','batch_size'))
# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
@@ -218,18 +160,15 @@ validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
-############################################################################################################
-########################## Model: Training ###################################################################
-###########################################################################################################
print(' Selecting a model .....')
+numberOfClasses = len(set(labels))
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
+if chosen_model == 'bert-base-multilingual-cased':
model = BertForSequenceClassification.from_pretrained(
    chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
@@ -237,6 +176,16 @@ model = BertForSequenceClassification.from_pretrained(
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
+elif chosen_model == 'camembert-base':
+    model = CamembertForSequenceClassification.from_pretrained(
+        chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
+        num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+        # You can increase this for multi-class tasks.
+        output_attentions = False, # Whether the model returns attentions weights.
+        output_hidden_states = False, # Whether the model returns all hidden-states.
+    )
# Tell pytorch to run this model on the GPU.
model.cuda()
@@ -251,8 +200,6 @@ optimizer = AdamW(model.parameters(),
-# Number of training epochs (authors recommend between 2 and 4)
-epochs = int(config.get('model','epochs'))
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
@@ -263,26 +210,6 @@ scheduler = get_linear_schedule_with_warmup(optimizer,
    num_training_steps = total_steps)
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round((elapsed)))
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
# This training code is based on the `run_glue.py` script here:
@@ -463,3 +390,11 @@ for epoch_i in range(0, epochs):
print("")
print("Training complete!")
+return model
+'''print('Saving Model....')
+model_save_name = config.get('model','modelName')
+path = config.get('model','path')
+#torch.save(model.state_dict(), os.path.join(path,model_save_name))
+torch.save(model, os.path.join(path,model_save_name))'''
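Since main.py saves the entire model object with torch.save, a later script could reload it for prediction roughly as sketched below; the directory and file name are illustrative values for the config's path and modelName keys, not the project's actual settings.

# Sketch only: reload the fine-tuned model saved by main.py.
import os
import torch

# torch.save stored the full model object, so the transformers classes must be importable here.
model = torch.load(os.path.join('models', 'bert_finetuned.pt'),
                   map_location='cuda' if torch.cuda.is_available() else 'cpu')
model.eval()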