
Branch dev bert exp

Merged Ludovic Moncla requested to merge branch_dev_bert_exp into master
1 file   +465   −0
 
import configparser  # needed for reading `settings.conf` below
import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random




###########################################################################
########################## Utils Functions ################################
###########################################################################
 
 
def create_dict(df, classColumnName):
    # Map each class label to its number of instances.
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    # Drop every class that has fewer than `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~ df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Randomly keep at most `numberOfInstances` rows per class.
    replace = False  # sample without replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
 
 
##############################################################################################################
########################## Setup GPU #########################################################################
##############################################################################################################

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
 
 
 
 
 
#############################################################################################################
########################## Parameters ######################################################################
###########################################################################################################

config = configparser.ConfigParser()
config.read('settings.conf')

dataPath = config.get('general','dataPath')
columnText = config.get('general','columnText')
columnClass = config.get('general','columnClass')

minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))

chosen_tokeniser = config.get('model','tokeniser')
chosen_model = config.get('model','model')

max_len = int(config.get('model','max_len_sequences'))
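
# Illustrative layout of `settings.conf` (hypothetical values; only the section and
# key names are taken from the config.get calls above):
#
#   [general]
#   dataPath = data/corpus.csv
#   columnText = text
#   columnClass = class
#   minOfInstancePerClass = 50
#   maxOfInstancePerClass = 1500
#
#   [model]
#   tokeniser = bert-base-multilingual-uncased
#   model = bert-base-multilingual-uncased
#   max_len_sequences = 256
#   batch_size = 16
#   epochs = 4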
 
 
 
#############################################################################################################
########################## Load Data #######################################################################
###########################################################################################################


df = pd.read_csv(dataPath)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
df = df[df[columnClass] != 'unclassified']


y = df[columnClass]
numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)

# Keep every (filtered and resampled) document here; the train/validation split
# is done later, after tokenization and padding.
sentences = df[columnText].values
labels = y.tolist()
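# `encoder.classes_` keeps the original class names in id order; predicted integer
# ids can be mapped back to those names with `encoder.inverse_transform` if needed.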
 
 
 
 
############################################################################################################
########################## Model: Tokenization & Input Formatting ##########################################
###########################################################################################################


# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_tokeniser, do_lower_case=True)


# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                       # Sentence to encode.
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'

        # This function also supports truncation and conversion
        # to pytorch tensors, but I need to do padding, so I
        # can't use these features.
        #max_length = 128,          # Truncate all sentences.
        #return_tensors = 'pt',     # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)


# Truncate or zero-pad every sequence to exactly max_len tokens.
padded = []
for i in input_ids:
    if len(i) > max_len:
        padded.append(i[:max_len])
    else:
        padded.append(i + [0] * (max_len - len(i)))

padded = np.array(padded)


# Create attention masks
attention_masks = []

# For each sentence...
for sent in padded:
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)
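
# Alternative tokenization sketch (not used above): `encode_plus` can add the special
# tokens, truncate, pad and build the attention mask in a single call. Exact argument
# names depend on the installed transformers version (older releases use
# `pad_to_max_length=True` instead of `padding='max_length'`).
def encode_with_encode_plus(sentences, tokenizer, max_len):
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tokenizer.encode_plus(sent,
                                        add_special_tokens=True,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)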
 
 
 
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    padded, labels, random_state=2018, test_size=0.1, stratify=labels)

# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, labels, random_state=2018, test_size=0.1, stratify=labels)


# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
 
 
 
 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = int(config.get('model','batch_size'))

# Create the DataLoader for the training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for the validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
 
 
 
 
 
############################################################################################################
########################## Model: Training #################################################################
###########################################################################################################


print(' Selecting a model .....')

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    chosen_model,                    # Name or path of the pretrained model (from settings.conf).
    num_labels = numberOfClasses,    # The number of output labels (one per class).
    output_attentions = False,       # Whether the model returns attentions weights.
    output_hidden_states = False,    # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU (or on the CPU if no GPU is available).
model.to(device)


# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# I believe the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8  # args.adam_epsilon - default is 1e-8.
                  )


# Number of training epochs (authors recommend between 2 and 4)
epochs = int(config.get('model','epochs'))

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
 
 
 
# Calculate the accuracy of predictions vs. labels.
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
 
 
 
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so I can plot them.
loss_values = []
 
 
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and step != 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader.
        #
        # As I unpack the batch, I'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because I
        # have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # The call to `model` always returns a tuple, so I need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that I can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure the performance on
    # the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print(" Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")