From e7f6f159fa4b99afbcec38a6c14a8745765f8775 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sun, 12 Sep 2021 00:45:06 +0200
Subject: [PATCH] [ADD] Training mode Bert Fine Tuning

---
 training_bertFineTuning.py | 465 +++++++++++++++++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 training_bertFineTuning.py

diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
new file mode 100644
index 0000000..72a5929
--- /dev/null
+++ b/training_bertFineTuning.py
@@ -0,0 +1,465 @@
+import configparser
+import datetime
+import random
+import time
+
+import numpy as np
+import pandas as pd
+import torch
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizer, CamembertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
+from transformers import get_linear_schedule_with_warmup
+
+
+###########################################################################
+########################## Utils Functions ################################
+###########################################################################
+
+def create_dict(df, classColumnName):
+    # Map each class label to its number of instances.
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+    # Drop every class that has fewer than `threshold` instances.
+    dictOfClassInstances = create_dict(df, classColumnName)
+    keptClasses = [k for k, v in dictOfClassInstances.items() if v >= threshold]
+    return df[df[classColumnName].isin(keptClasses)]
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+    # Randomly keep at most `numberOfInstances` rows per class (sampling without replacement).
+    fn = lambda obj: obj.loc[np.random.choice(obj.index,
+                                              min(len(obj), numberOfInstances),
+                                              replace=False), :]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+##############################################################################################################
+########################## Setup GPU #########################################################################
+##############################################################################################################
+
+# If there's a GPU available...
+if torch.cuda.is_available():
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+#############################################################################################################
+########################## Parameters #######################################################################
+#############################################################################################################
+
+config = configparser.ConfigParser()
+config.read('settings.conf')
+
+dataPath = config.get('general', 'dataPath')
+columnText = config.get('general', 'columnText')
+columnClass = config.get('general', 'columnClass')
+
+minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
+maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))
+
+chosen_tokeniser = config.get('model', 'tokeniser')
+chosen_model = config.get('model', 'model')
+
+max_len = int(config.get('model', 'max_len_sequences'))
+
+
+#############################################################################################################
+########################## Load Data ########################################################################
+#############################################################################################################
+
+df = pd.read_csv(dataPath)
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+df = resample_classes(df, columnClass, maxOfInstancePerClass)
+df = df[df[columnClass] != 'unclassified']
+
+y = df[columnClass]
+numberOfClasses = y.nunique()
+encoder = preprocessing.LabelEncoder()
+y = encoder.fit_transform(y)
+
+sentences = df[columnText].values
+labels = y.tolist()
+
+
+#############################################################################################################
+########################## Model: Tokenization & Input Formatting ###########################################
+#############################################################################################################
+
+# Load the tokenizer matching the chosen checkpoint
+# (use CamembertTokenizer instead for CamemBERT models).
+print('Loading BERT tokenizer...')
+tokenizer = BertTokenizer.from_pretrained(chosen_tokeniser, do_lower_case=True)
+
+# Tokenize all of the sentences and map the tokens to their word IDs.
+input_ids = []
+
+# For every sentence...
+for sent in sentences:
+    # `encode` will:
+    #   (1) Tokenize the sentence.
+    #   (2) Prepend the `[CLS]` token to the start.
+    #   (3) Append the `[SEP]` token to the end.
+    #   (4) Map tokens to their IDs.
+    encoded_sent = tokenizer.encode(
+        sent,                      # Sentence to encode.
+        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
+        # `encode` also supports truncation and conversion to PyTorch
+        # tensors, but padding is done manually below, so those options
+        # are not used here.
+        # max_length = 128,        # Truncate all sentences.
+        # return_tensors = 'pt',   # Return pytorch tensors.
+    )
+
+    # Add the encoded sentence to the list.
+    input_ids.append(encoded_sent)
+
+
+# Truncate or zero-pad every sequence to exactly `max_len` tokens.
+padded = []
+for i in input_ids:
+    if len(i) > max_len:
+        padded.append(i[:max_len])
+    else:
+        padded.append(i + [0] * (max_len - len(i)))
+
+padded = np.array(padded)
+
+
+# Create attention masks.
+attention_masks = []
+
+# For each sentence...
+for sent in padded:
+
+    # Create the attention mask.
+    #   - If a token ID is 0, then it's padding, set the mask to 0.
+    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+    att_mask = [int(token_id > 0) for token_id in sent]
+
+    # Store the attention mask for this sentence.
+    attention_masks.append(att_mask)
+
+
+# Use 90% for training and 10% for validation.
+train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
+    padded, labels, random_state=2018, test_size=0.1, stratify=labels)
+
+# Do the same split for the masks (same random_state and stratify, so the rows stay aligned).
+train_masks, validation_masks, _, _ = train_test_split(
+    attention_masks, labels, random_state=2018, test_size=0.1, stratify=labels)
+
+
+# Convert all inputs and labels into torch tensors, the required datatype
+# for the model.
+train_inputs = torch.tensor(train_inputs)
+validation_inputs = torch.tensor(validation_inputs)
+
+train_labels = torch.tensor(train_labels)
+validation_labels = torch.tensor(validation_labels)
+
+train_masks = torch.tensor(train_masks)
+validation_masks = torch.tensor(validation_masks)
+
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+
+# The DataLoader needs to know the batch size for training, so it is specified here.
+# For fine-tuning BERT on a specific task, the authors recommend a batch size of
+# 16 or 32.
+batch_size = int(config.get('model', 'batch_size'))
+
+# Create the DataLoader for the training set.
+train_data = TensorDataset(train_inputs, train_masks, train_labels)
+train_sampler = RandomSampler(train_data)
+train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+
+# Create the DataLoader for the validation set.
+validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
+validation_sampler = SequentialSampler(validation_data)
+validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
+
+
+############################################################################################################
+########################## Model: Training ##################################################################
+############################################################################################################
+
+print('Selecting a model...')
+
+# Load BertForSequenceClassification: the pretrained BERT model with a single
+# linear classification layer on top (use CamembertForSequenceClassification
+# for CamemBERT checkpoints).
+model = BertForSequenceClassification.from_pretrained(
+    chosen_model,                 # Pretrained checkpoint taken from settings.conf.
+    num_labels=numberOfClasses,   # The number of output labels, i.e. the number of classes.
+    output_attentions=False,      # Whether the model returns attention weights.
+    output_hidden_states=False,   # Whether the model returns all hidden states.
+)
+
+# Move the model to the selected device (GPU if available, otherwise CPU).
+model.to(device)
+
+# Note: AdamW is a class from the huggingface library (as opposed to pytorch);
+# the 'W' stands for 'Weight Decay fix'.
+optimizer = AdamW(model.parameters(),
+                  lr=2e-5,   # args.learning_rate - default is 5e-5, 2e-5 is used here.
+                  eps=1e-8   # args.adam_epsilon - default is 1e-8.
+                  )
+
+# Number of training epochs (the authors recommend between 2 and 4).
+epochs = int(config.get('model', 'epochs'))
+
+# Total number of training steps is number of batches * number of epochs.
+total_steps = len(train_dataloader) * epochs
+
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, # Default value in run_glue.py + num_training_steps = total_steps) + + +def flat_accuracy(preds, labels): + pred_flat = np.argmax(preds, axis=1).flatten() + labels_flat = labels.flatten() + return np.sum(pred_flat == labels_flat) / len(labels_flat) + + + + + +def format_time(elapsed): + ''' + Takes a time in seconds and returns a string hh:mm:ss + ''' + # Round to the nearest second. + elapsed_rounded = int(round((elapsed))) + + # Format as hh:mm:ss + return str(datetime.timedelta(seconds=elapsed_rounded)) + + + + +# This training code is based on the `run_glue.py` script here: +# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 + + +# Set the seed value all over the place to make this reproducible. +seed_val = 42 + +random.seed(seed_val) +np.random.seed(seed_val) +torch.manual_seed(seed_val) +torch.cuda.manual_seed_all(seed_val) + +# Store the average loss after each epoch so I can plot them. +loss_values = [] + +# For each epoch... +for epoch_i in range(0, epochs): + + # ======================================== + # Training + # ======================================== + + # Perform one full pass over the training set. + + print("") + print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) + print('Training...') + + # Measure how long the training epoch takes. + t0 = time.time() + + # Reset the total loss for this epoch. + total_loss = 0 + + # Put the model into training mode. + model.train() + + # For each batch of training data... + for step, batch in enumerate(train_dataloader): + + # Progress update every 40 batches. + if step % 40 == 0 and not step == 0: + # Calculate elapsed time in minutes. + elapsed = format_time(time.time() - t0) + + # Report progress. + print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) + + # Unpack this training batch from the dataloader. + # + # As I unpack the batch, I'll also copy each tensor to the GPU using the + # `to` method. + # + # `batch` contains three pytorch tensors: + # [0]: input ids + # [1]: attention masks + # [2]: labels + b_input_ids = batch[0].to(device) + b_input_mask = batch[1].to(device) + b_labels = batch[2].to(device) + + # Always clear any previously calculated gradients before performing a + # backward pass. PyTorch doesn't do this automatically because + # accumulating the gradients is "convenient while training RNNs". + # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) + model.zero_grad() + + # Perform a forward pass (evaluate the model on this training batch). + # This will return the loss (rather than the model output) because I + # have provided the `labels`. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels) + + # The call to `model` always returns a tuple, so I need to pull the + # loss value out of the tuple. + loss = outputs[0] + + # Accumulate the training loss over all of the batches so that I can + # calculate the average loss at the end. `loss` is a Tensor containing a + # single value; the `.item()` function just returns the Python value + # from the tensor. + total_loss += loss.item() + + # Perform a backward pass to calculate the gradients. 
+ loss.backward() + + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + + # Update parameters and take a step using the computed gradient. + # The optimizer dictates the "update rule"--how the parameters are + # modified based on their gradients, the learning rate, etc. + optimizer.step() + + # Update the learning rate. + scheduler.step() + + # Calculate the average loss over the training data. + avg_train_loss = total_loss / len(train_dataloader) + + # Store the loss value for plotting the learning curve. + loss_values.append(avg_train_loss) + + print("") + print(" Average training loss: {0:.2f}".format(avg_train_loss)) + print(" Training epoch took: {:}".format(format_time(time.time() - t0))) + + # ======================================== + # Validation + # ======================================== + # After the completion of each training epoch, measure the performance on + # the validation set. + + print("") + print("Running Validation...") + + t0 = time.time() + + # Put the model in evaluation mode--the dropout layers behave differently + # during evaluation. + model.eval() + + # Tracking variables + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + + # Evaluate data for one epoch + for batch in validation_dataloader: + + # Add batch to GPU + batch = tuple(t.to(device) for t in batch) + + # Unpack the inputs from dataloader + b_input_ids, b_input_mask, b_labels = batch + + # Telling the model not to compute or store gradients, saving memory and + # speeding up validation + with torch.no_grad(): + + # Forward pass, calculate logit predictions. + # This will return the logits rather than the loss because we have + # not provided labels. + # token_type_ids is the same as the "segment ids", which + # differentiates sentence 1 and 2 in 2-sentence tasks. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask) + + # Get the "logits" output by the model. The "logits" are the output + # values prior to applying an activation function like the softmax. + logits = outputs[0] + + # Move logits and labels to CPU + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + # Calculate the accuracy for this batch of test sentences. + tmp_eval_accuracy = flat_accuracy(logits, label_ids) + + # Accumulate the total accuracy. + eval_accuracy += tmp_eval_accuracy + + # Track the number of batches + nb_eval_steps += 1 + + # Report the final accuracy for this validation run. + print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) + print(" Validation took: {:}".format(format_time(time.time() - t0))) + +print("") +print("Training complete!") -- GitLab
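Note: the script reads its settings from a settings.conf file that is not included in this patch. A minimal sketch of what that file could look like, based only on the sections and keys the script reads; every value below is an illustrative placeholder, not something taken from the repository:

    [general]
    # Path of the CSV corpus and the names of its text / label columns (placeholders).
    dataPath = data/corpus.csv
    columnText = text
    columnClass = class
    # Classes with fewer rows than the minimum are dropped; larger classes are downsampled to the maximum.
    minOfInstancePerClass = 50
    maxOfInstancePerClass = 1500

    [model]
    # Any matching tokenizer / sequence-classification checkpoint pair (placeholder names).
    tokeniser = bert-base-uncased
    model = bert-base-uncased
    max_len_sequences = 256
    batch_size = 16
    epochs = 4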