Commit a5158391 authored by Khalleud

[ADD] train bert finetuning & predict & evaluate

parent e7f6f159
Merge request !5: Branch dev bert exp
evaluate_bertFineTuning.py 0 → 100644
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
    # classification_report expects (y_true, y_pred)
    report = classification_report(true_labels_, pred_labels_, output_dict=True)

    classes = [str(e) for e in encoder.transform(encoder.classes_)]
    classesName = encoder.classes_

    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    precision = []
    recall = []
    f1 = []
    support = []
    dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])

    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])

    # Per-class error counts derived from the confusion matrix
    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    dff['className'] = classesName
    dff['precision'] = precision
    dff['recall'] = recall
    dff['f1-score'] = f1
    dff['support'] = support
    dff['FP'] = FP
    dff['FN'] = FN
    dff['TP'] = TP
    dff['TN'] = TN

    return dff, accuracy, weighted_avg
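A quick sanity check for the per-class counts computed above: a minimal sketch, using a made-up 3x3 confusion matrix (the numbers are illustrative only, not taken from the dataset).

# Sketch: per-class FP/FN/TP/TN from a confusion matrix (illustrative matrix).
import numpy as np

cnf = np.array([[5, 1, 0],
                [2, 7, 1],
                [0, 1, 4]])       # rows = true class, columns = predicted class

TP = np.diag(cnf)                 # correctly predicted per class -> [5, 7, 4]
FP = cnf.sum(axis=0) - TP         # predicted as the class but wrong -> [2, 2, 1]
FN = cnf.sum(axis=1) - TP         # belonging to the class but missed -> [1, 3, 1]
TN = cnf.sum() - (FP + FN + TP)   # everything else -> [13, 9, 15]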
main.py 0 → 100644
import os

import torch
import pandas as pd
import numpy as np
import configparser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Cap each class at numberOfInstances randomly chosen rows (without replacement).
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def main():
    config = configparser.ConfigParser()
    config.read('bert_settings.conf')

    dataPath = config.get('general', 'dataPath')
    columnText = config.get('general', 'columnText')
    columnClass = config.get('general', 'columnClass')

    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model', 'tokeniser')
    chosen_model = config.get('model', 'model')

    max_len = int(config.get('model', 'max_len_sequences'))
    batch_size = int(config.get('model', 'batch_size'))
    epochs = int(config.get('model', 'epochs'))

    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()

    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)

    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # Call train method
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # Save the model
    model_save_name = config.get('model', 'modelName')
    path = config.get('model', 'path')
    torch.save(model, os.path.join(path, model_save_name))

    # Print the model parameters
    params = list(model.named_parameters())
    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Call predict method on the held-out test split
    sentences_to_predict = test_x[columnText].values
    test_labels = test_y.tolist()
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_labels, max_len, batch_size=32)
    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)

    # Call evaluate
    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)

    print(result_df)
    print(accuracy)
    print(weighted_avg)


if __name__ == "__main__":
    main()
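main.py reads its run parameters from bert_settings.conf. A minimal sketch of a template generator for that file, covering exactly the keys read above; all values (paths, column names, hyper-parameters) are illustrative assumptions, not the project's actual settings.

# Sketch only: writes a template bert_settings.conf with the keys main.py reads.
import configparser

config = configparser.ConfigParser()
config['general'] = {
    'dataPath': 'data/corpus.csv',              # placeholder path
    'columnText': 'text',                       # placeholder column name
    'columnClass': 'label',                     # placeholder column name
    'minOfInstancePerClass': '50',
    'maxOfInstancePerClass': '1500',
}
config['model'] = {
    'tokeniser': 'bert-base-multilingual-cased',
    'model': 'bert-base-multilingual-cased',    # or 'camembert-base'
    'max_len_sequences': '256',
    'batch_size': '32',
    'epochs': '4',
    'modelName': 'bert_finetuned.pt',           # placeholder file name
    'path': 'models',                           # placeholder directory
}

with open('bert_settings.conf', 'w') as f:
    config.write(f)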
predict_bertFineTuning.py 0 → 100644
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer


def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size=32):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # Pad or truncate our input tokens to max_len
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1s for each real token followed by 0s for padding
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader


def predict_class_bertFineTuning(model, prediction_dataloader):

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print(' DONE.')

    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for this batch are a 2-D ndarray (one column per class).
        # Pick the label with the highest logit for each sentence.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    # Flatten the per-batch lists into flat lists of labels.
    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_


def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    for sent in sentences_to_predict:
        # `encode` tokenizes the sentence, adds '[CLS]'/'[SEP]' and maps tokens to their IDs.
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids_test.append(encoded_sent)

    # Pad to the longest sequence in this batch and build the attention masks.
    max_len_batch = max(len(i) for i in input_ids_test)
    padded = [i + [0] * (max_len_batch - len(i)) for i in input_ids_test]
    masks = [[float(tok > 0) for tok in seq] for seq in padded]

    device = next(model.parameters()).device
    b_input_ids = torch.tensor(padded).to(device)
    b_input_mask = torch.tensor(masks).to(device)

    model.eval()
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

    # Return the predicted class index for each sentence.
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
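A minimal usage sketch of the two prediction helpers above, assuming `model` is the fine-tuned model returned by training_bertFineTuning; the sentences, labels and max_len below are illustrative placeholders.

# Sketch only: illustrative inputs for the prediction helpers.
from predict_bertFineTuning import generate_prediction_dataloader, predict_class_bertFineTuning

sentences_to_predict = ["first test sentence", "second test sentence"]  # placeholder texts
labels = [0, 1]                                                         # placeholder encoded labels

dataloader = generate_prediction_dataloader(
    'camembert-base', sentences_to_predict, labels, max_len=128, batch_size=32)

pred_labels_, true_labels_ = predict_class_bertFineTuning(model, dataloader)
print(list(zip(pred_labels_, true_labels_)))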
training_bertFineTuning.py
@@ -2,48 +2,38 @@ import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
-from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
-import os
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-def remove_weak_classes(df, classColumnName, threshold):
-    dictOfClassInstances = create_dict(df,classColumnName)
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-def resample_classes(df, classColumnName, numberOfInstances):
-    #random numberOfInstances elements
-    replace = False # with replacement
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-##############################################################################################################
-########################## Setup GPU #########################################################################
-##############################################################################################################
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs = 4):
# If there's a GPU available...
if torch.cuda.is_available():
@@ -63,62 +53,18 @@ else:
-#############################################################################################################
-########################## parameters ###################################################################
-###########################################################################################################
-config = configparser.ConfigParser()
-config.read('settings.conf')
-dataPath = config.get('general','dataPath')
-columnText = config.get('general','columnText')
-columnClass = config.get('general','columnClass')
-minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
-maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
-chosen_tokeniser = config.get('model','tokeniser')
-chosen_model = config.get('model','model')
-max_len = int(config.get('model','max_len_sequences'))
-#############################################################################################################
-########################## Load Data ###################################################################
-###########################################################################################################
-df = pd.read_csv(dataPath)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-df = resample_classes(df, columnClass, maxOfInstancePerClass)
-df = df[df[columnClass] != 'unclassified']
-y = df[columnClass]
-numberOfClasses = y.nunique()
-encoder = preprocessing.LabelEncoder()
-y = encoder.fit_transform(y)
-sentences = train_x[columnText].values
-labels = train_y.tolist()
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################
-# Load the BERT tokenizer.
-print('Loading BERT tokenizer...')
-tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
+if chosen_model == 'bert-base-multilingual-cased' :
+    print('Loading Bert Tokenizer...')
+    tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+elif chosen_model == 'camembert-base':
+    print('Loading Camembert Tokenizer...')
+    tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
@@ -157,7 +103,7 @@ for i in input_ids:
        padded.extend([i + [0] * (max_len - len(i))])
-padded = input_ids = np.array(padded)
+padded = np.array(padded)
@@ -177,11 +123,9 @@ for sent in padded:
# Use 90% for training and 10% for validation.
-train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
-    random_state=2018, test_size=0.1, stratify = labels )
+train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels )
# Do the same for the masks.
-train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
-    random_state=2018, test_size=0.1, stratify = labels)
+train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels)
# Convert all inputs and labels into torch tensors, the required datatype
@@ -197,13 +141,11 @@ validation_masks = torch.tensor(validation_masks)
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
-batch_size = int(config.get('model','batch_size'))
# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
@@ -218,18 +160,15 @@ validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
-############################################################################################################
-########################## Model: Training ###################################################################
-###########################################################################################################
print(' Selecting a model .....')
+numberOfClasses = len(set(labels))
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
+if chosen_model == 'bert-base-multilingual-cased':
model = BertForSequenceClassification.from_pretrained(
    chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
@@ -237,6 +176,16 @@ model = BertForSequenceClassification.from_pretrained(
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
+elif chosen_model == 'camembert-base':
+    model = CamembertForSequenceClassification.from_pretrained(
+        chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
+        num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+        # You can increase this for multi-class tasks.
+        output_attentions = False, # Whether the model returns attentions weights.
+        output_hidden_states = False, # Whether the model returns all hidden-states.
+    )
# Tell pytorch to run this model on the GPU.
model.cuda()
@@ -251,8 +200,6 @@ optimizer = AdamW(model.parameters(),
-# Number of training epochs (authors recommend between 2 and 4)
-epochs = int(config.get('model','epochs'))
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
@@ -263,26 +210,6 @@ scheduler = get_linear_schedule_with_warmup(optimizer,
    num_training_steps = total_steps)
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round((elapsed)))
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
# This training code is based on the `run_glue.py` script here:
@@ -463,3 +390,11 @@ for epoch_i in range(0, epochs):
print("")
print("Training complete!")
+return model
+'''print('Saving Model....')
+model_save_name = config.get('model','modelName')
+path = config.get('model','path')
+#torch.save(model.state_dict(), os.path.join(path,model_save_name))
+torch.save(model, os.path.join(path,model_save_name))'''
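Since main.py saves the entire model object with torch.save, a later script could reload it for prediction roughly as sketched below; the directory and file name are illustrative values for the config's path and modelName keys, not the project's actual settings.

# Sketch only: reload the fine-tuned model saved by main.py.
import os
import torch

# torch.save stored the full model object, so the transformers classes must be importable here.
model = torch.load(os.path.join('models', 'bert_finetuned.pt'),
                   map_location='cuda' if torch.cuda.is_available() else 'cpu')
model.eval()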