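"""
Fine-tune a BERT or CamemBERT model for text classification.

Usage:
    python training_bertFineTuning.py <input_dataset> <conf_file> <output_path>

The input dataset is a tab-separated file; column names and training
hyperparameters are read from the configuration file (see the example
layout in the __main__ block below).
"""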
import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import configparser
import csv
import argparse
import os



def create_dict(df, classColumnName):
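    """Return a {class label: number of instances} dictionary for classColumnName."""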
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    """Keep only the rows whose class has at least `threshold` instances."""
    dictOfClassInstances = create_dict(df, classColumnName)
    keptClasses = [k for k, v in dictOfClassInstances.items() if v >= threshold]
    return df[df[classColumnName].isin(keptClasses)]


def resample_classes(df, classColumnName, numberOfInstances):
    """Randomly downsample each class to at most numberOfInstances rows (sampling without replacement)."""
    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace=False), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


def training_bertFineTuning(chosen_model, model_path, sentences, labels, max_len, batch_size, epochs=4):
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################


    if chosen_model == 'bert' :
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
    elif chosen_model == 'camembert':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(model_path , do_lower_case=True)
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            str(sent),                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            # This function also supports truncation and conversion
                            # to pytorch tensors, but I need to do padding, so I
                            # can't use these features.
                            #max_length = 128,          # Truncate all sentences.
                            #return_tensors = 'pt',     # Return pytorch tensors.
                            )
        # Add the encoded sentence to the list.
        input_ids.append(encoded_sent)
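    # Truncate each encoded sentence to max_len, or right-pad it with 0s up to max_len.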
    padded = []
    for i in input_ids:
        if len(i) > max_len:
            padded.extend([i[:max_len]])
        else:
            padded.extend([i + [0] * (max_len - len(i))])
    padded  = np.array(padded)
    # Create attention masks
    attention_masks = []
    # For each sentence...
    for sent in padded:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)
    # Use 70% for training and 30% for validation.
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.3, stratify = labels )
    # Do the same for the masks.
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.3, stratify = labels)
    # Convert all inputs and labels into torch tensors, the required datatype
    # for my model.
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)
    # The DataLoader needs to know the batch size for training, so I specify it here.
    # For fine-tuning BERT on a specific task, the authors recommend a batch size of
    # 16 or 32.
    # Create the DataLoader for training set.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    # Create the DataLoader for validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
    print('Selecting a model...')
    numberOfClasses = len(set(labels))
    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    if chosen_model == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            model_path,                      # Path or name of the pretrained BERT model.
            num_labels = numberOfClasses,    # Number of output labels (one per class).
            output_attentions = False,       # Whether the model returns attention weights.
            output_hidden_states = False,    # Whether the model returns all hidden states.
            )
    elif chosen_model == 'camembert':
        model = CamembertForSequenceClassification.from_pretrained(
            model_path,                      # Path or name of the pretrained CamemBERT model.
            num_labels = numberOfClasses,    # Number of output labels (one per class).
            output_attentions = False,       # Whether the model returns attention weights.
            output_hidden_states = False,    # Whether the model returns all hidden states.
            )
    # Move the model to the selected device (GPU if available, otherwise CPU).
    model.to(device)
    # Note: AdamW is a class from the Hugging Face transformers library (as opposed to PyTorch);
    # the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(model.parameters(),
                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                    )
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Store the average loss after each epoch so I can plot them.
    loss_values = []
    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        # Put the model into training mode.
        model.train()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            # Unpack this training batch from the dataloader.
            #
            # As I unpack the batch, I'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()
            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because I
            # have provided the `labels`.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
            # The call to `model` always returns a tuple, so I need to pull the
            # loss value out of the tuple.
            loss = outputs[0]
            # Accumulate the training loss over all of the batches so that I can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_loss += loss.item()
            #  Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure the performance on
        # the validation set.
        print("")
        print("Running Validation...")
        t0 = time.time()
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()
        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Telling the model not to compute or store gradients, saving memory and
            # speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                # This will return the logits rather than the loss because we have
                # not provided labels.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs[0]
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            # Accumulate the total accuracy.
            eval_accuracy += tmp_eval_accuracy
            # Track the number of batches
            nb_eval_steps += 1
        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))
    print("")
    print("Training complete!")
    return model
'''print('Saving Model....')
model_save_name = config.get('model','modelName')
path = config.get('model','path')
#torch.save(model.state_dict(), os.path.join(path,model_save_name))
torch.save(model, os.path.join(path,model_save_name))'''


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("input_dataset")
    parser.add_argument("conf_file")
    parser.add_argument("output_path")

    args = parser.parse_args()

    INPUT_DATASET = args.input_dataset
    CONF_FILE = args.conf_file
    OUTPUT_PATH = args.output_path

    config = configparser.ConfigParser()
    config.read(CONF_FILE)

    #dataPath = config.get('general','dataPath')
    columnText = config.get('general','columnText')
    columnClass = config.get('general','columnClass')

    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))

    model_path = config.get('model','path')
    chosen_model = config.get('model','model')

    max_len = int(config.get('model','max_len_sequences'))
    batch_size = int(config.get('model','batch_size'))
    epochs = int(config.get('model','epochs'))
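
    # The configuration file is expected to be an INI file readable by configparser.
    # A minimal example (key names taken from the reads above; the values are placeholders):
    #
    #   [general]
    #   columnText = text
    #   columnClass = class
    #   minOfInstancePerClass = 50
    #   maxOfInstancePerClass = 1500
    #
    #   [model]
    #   path = camembert-base
    #   model = camembert
    #   max_len_sequences = 256
    #   batch_size = 16
    #   epochs = 4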


    df = pd.read_csv(INPUT_DATASET, sep="\t")
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    #df = df[df[columnClass] != 'unclassified']


    y  = df[columnClass]
    numberOfClasses = y.nunique()
    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)


    #train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
    #sentences = train_x[columnText].values
    sentences = df[columnText].values
    #labels = train_y.tolist()
    labels = y.tolist()
    model = training_bertFineTuning(chosen_model,model_path, sentences, labels, max_len, batch_size, epochs)


    #save the model
    model_save_name = chosen_model + "_b" + str(batch_size) + "_e" + str(epochs)

    torch.save(model, os.path.join(OUTPUT_PATH,model_save_name))

    #print the model parameters
    params = list(model.named_parameters())

    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')

    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')

    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')

    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))