
Branch dev bert exp

Merged Ludovic Moncla requested to merge branch_dev_bert_exp into master
1 file   +465   −0
 
import configparser  # needed for reading `settings.conf` below
import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random




###########################################################################
########################## Utils Functions ################################
###########################################################################
 
 
def create_dict(df, classColumnName):
    # Map each class label to its number of instances.
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    # Drop every class that has fewer than `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~ df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Randomly keep at most `numberOfInstances` rows per class.
    replace = False  # sample without replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
 
 
##############################################################################################################
########################## Setup GPU #########################################################################
##############################################################################################################

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
 
 
 
 
 
#############################################################################################################
########################## Parameters ######################################################################
###########################################################################################################

config = configparser.ConfigParser()
config.read('settings.conf')

dataPath = config.get('general','dataPath')
columnText = config.get('general','columnText')
columnClass = config.get('general','columnClass')

minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))

chosen_tokeniser = config.get('model','tokeniser')
chosen_model = config.get('model','model')

max_len = int(config.get('model','max_len_sequences'))
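
# Illustrative layout of `settings.conf` (hypothetical values; only the section and
# key names are taken from the config.get calls above):
#
#   [general]
#   dataPath = data/corpus.csv
#   columnText = text
#   columnClass = class
#   minOfInstancePerClass = 50
#   maxOfInstancePerClass = 1500
#
#   [model]
#   tokeniser = bert-base-multilingual-uncased
#   model = bert-base-multilingual-uncased
#   max_len_sequences = 256
#   batch_size = 16
#   epochs = 4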
 
 
 
#############################################################################################################
########################## Load Data #######################################################################
###########################################################################################################


df = pd.read_csv(dataPath)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
df = df[df[columnClass] != 'unclassified']


y = df[columnClass]
numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)

# Keep every (filtered and resampled) document here; the train/validation split
# is done later, after tokenization and padding.
sentences = df[columnText].values
labels = y.tolist()
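# `encoder.classes_` keeps the original class names in id order; predicted integer
# ids can be mapped back to those names with `encoder.inverse_transform` if needed.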
 
 
 
 
############################################################################################################
########################## Model: Tokenization & Input Formatting ##########################################
###########################################################################################################


# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_tokeniser, do_lower_case=True)


# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                       # Sentence to encode.
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'

        # This function also supports truncation and conversion
        # to pytorch tensors, but I need to do padding, so I
        # can't use these features.
        #max_length = 128,          # Truncate all sentences.
        #return_tensors = 'pt',     # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)


# Truncate or zero-pad every sequence to exactly max_len tokens.
padded = []
for i in input_ids:
    if len(i) > max_len:
        padded.append(i[:max_len])
    else:
        padded.append(i + [0] * (max_len - len(i)))

padded = np.array(padded)


# Create attention masks
attention_masks = []

# For each sentence...
for sent in padded:
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)
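
# Alternative tokenization sketch (not used above): `encode_plus` can add the special
# tokens, truncate, pad and build the attention mask in a single call. Exact argument
# names depend on the installed transformers version (older releases use
# `pad_to_max_length=True` instead of `padding='max_length'`).
def encode_with_encode_plus(sentences, tokenizer, max_len):
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tokenizer.encode_plus(sent,
                                        add_special_tokens=True,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)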
 
 
 
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    padded, labels, random_state=2018, test_size=0.1, stratify=labels)

# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, labels, random_state=2018, test_size=0.1, stratify=labels)


# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
 
 
 
 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = int(config.get('model','batch_size'))

# Create the DataLoader for the training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for the validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
 
 
 
 
 
############################################################################################################
########################## Model: Training #################################################################
###########################################################################################################


print(' Selecting a model .....')

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    chosen_model,                    # Name or path of the pretrained model (from settings.conf).
    num_labels = numberOfClasses,    # The number of output labels (one per class).
    output_attentions = False,       # Whether the model returns attentions weights.
    output_hidden_states = False,    # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU (or on the CPU if no GPU is available).
model.to(device)


# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# I believe the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8  # args.adam_epsilon - default is 1e-8.
                  )


# Number of training epochs (authors recommend between 2 and 4)
epochs = int(config.get('model','epochs'))

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
 
 
 
# Calculate the accuracy of predictions vs. labels.
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
 
 
 
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so I can plot them.
loss_values = []
 
 
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and step != 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader.
        #
        # As I unpack the batch, I'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because I
        # have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # The call to `model` always returns a tuple, so I need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that I can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure the performance on
    # the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print(" Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")