diff --git a/evaluate_bertFineTuning.py b/evaluate_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c9b52bb3dac78506110b0d49716fe97e18e10bf
--- /dev/null
+++ b/evaluate_bertFineTuning.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+
+
+def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
+    # classification_report expects (y_true, y_pred), so the true labels go first.
+    report = classification_report(true_labels_, pred_labels_, output_dict=True)
+
+    classes = [str(e) for e in encoder.transform(encoder.classes_)]
+    classesName = encoder.classes_
+
+    accuracy = report['accuracy']
+    weighted_avg = report['weighted avg']
+
+    precision = []
+    recall = []
+    f1 = []
+    support = []
+    dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+    for c in classes:
+        precision.append(report[c]['precision'])
+        recall.append(report[c]['recall'])
+        f1.append(report[c]['f1-score'])
+        support.append(report[c]['support'])
+
+    # Per-class FP, FN, TP and TN derived from the confusion matrix.
+    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
+    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+    TP = np.diag(cnf_matrix)
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+
+    dff['className'] = classesName
+    dff['precision'] = precision
+    dff['recall'] = recall
+    dff['f1-score'] = f1
+    dff['support'] = support
+    dff['FP'] = FP
+    dff['FN'] = FN
+    dff['TP'] = TP
+    dff['TN'] = TN
+
+    return dff, accuracy, weighted_avg
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..8301acc2f929d750e0cea915a905a721ab8150fb
--- /dev/null
+++ b/main.py
@@ -0,0 +1,120 @@
+import os
+
+import pandas as pd
+import numpy as np
+import torch
+import configparser
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+
+from training_bertFineTuning import training_bertFineTuning
+from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
+from evaluate_bertFineTuning import evaluate_bertFineTuning
+
+
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+    # Drop every class that has fewer than `threshold` instances.
+    dictOfClassInstances = create_dict(df, classColumnName)
+    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
+    keys = [*dictionary]
+    df_tmp = df[~df[classColumnName].isin(keys)]
+    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
+    return df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+    # Cap every class at `numberOfInstances` randomly chosen rows, without replacement.
+    replace = False
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+def main():
+
+    config = configparser.ConfigParser()
+    config.read('bert_settings.conf')
+
+    dataPath = config.get('general', 'dataPath')
+    columnText = config.get('general', 'columnText')
+    columnClass = config.get('general', 'columnClass')
+
+    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))
+
+    chosen_tokeniser = config.get('model', 'tokeniser')
+    chosen_model = config.get('model', 'model')
+
+    max_len = int(config.get('model', 'max_len_sequences'))
+    batch_size = int(config.get('model', 'batch_size'))
+    epochs = int(config.get('model', 'epochs'))
+
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    df = df[df[columnClass] != 'unclassified']
+
+    y = df[columnClass]
+    numberOfClasses = y.nunique()
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+    # call train method
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+
+    # save the model
+    model_save_name = config.get('model', 'modelName')
+    path = config.get('model', 'path')
+    torch.save(model, os.path.join(path, model_save_name))
+
+    # print the model parameters
+    params = list(model.named_parameters())
+
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    print('==== Embedding Layer ====\n')
+
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    # call predict method on the held-out test split
+    sentences_to_predict = test_x[columnText].values
+    test_labels = test_y.tolist()
+    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_labels, max_len, batch_size=32)
+    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
+
+    # call Evaluate
+    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
+
+    print(result_df)
+    print(accuracy)
+    print(weighted_avg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/predict_bertFineTuning.py b/predict_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..4276122d0b88159c6631fe2dd2db9d14603558c3
--- /dev/null
+++ b/predict_bertFineTuning.py
@@ -0,0 +1,168 @@
+import torch
+import pandas as pd
+import numpy as np
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertTokenizer, CamembertTokenizer
+
+
+def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size=32):
+
+    if chosen_model == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids_test = []
+    # For every sentence...
+    for sent in sentences_to_predict:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+                            sent,                      # Sentence to encode.
+                            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+                       )
+
+        input_ids_test.append(encoded_sent)
+
+    # Pad (or truncate) our input tokens to max_len
+    padded_test = []
+    for i in input_ids_test:
+        if len(i) > max_len:
+            padded_test.extend([i[:max_len]])
+        else:
+            padded_test.extend([i + [0] * (max_len - len(i))])
+    input_ids_test = np.array(padded_test)
+
+    # Create attention masks
+    attention_masks = []
+
+    # Create a mask of 1s for each token followed by 0s for padding
+    for seq in input_ids_test:
+        seq_mask = [float(i > 0) for i in seq]
+        attention_masks.append(seq_mask)
+
+    # Convert to tensors.
+    prediction_inputs = torch.tensor(input_ids_test)
+    prediction_masks = torch.tensor(attention_masks)
+    prediction_labels = torch.tensor(labels)
+
+    # Create the DataLoader (the batch size comes from the function argument).
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+    return prediction_dataloader
+
+
+def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
+
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+    # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
+
+    # Make sure the model lives on the same device as the batches.
+    model.to(device)
+
+    # Put model in evaluation mode
+    model.eval()
+
+    # Tracking variables
+    predictions_test, true_labels = [], []
+
+    # Predict
+    for batch in sentences_to_predict_dataloader:
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from the dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        logits = outputs[0]
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Store predictions and true labels
+        predictions_test.append(logits)
+        true_labels.append(label_ids)
+
+    print('    DONE.')
+
+    pred_labels = []
+
+    for i in range(len(true_labels)):
+        # The predictions for this batch are a (batch_size, num_labels) ndarray of
+        # logits; pick the label with the highest value for each sentence.
+        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
+        pred_labels.append(pred_labels_i)
+
+    # Flatten the per-batch lists and return the predicted and true label ids.
+    pred_labels_ = [item for sublist in pred_labels for item in sublist]
+    true_labels_ = [item for sublist in true_labels for item in sublist]
+    return pred_labels_, true_labels_
+
+
+def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
+
+    if chosen_model == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids_test = []
+    # For every sentence...
+    for sent in sentences_to_predict:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+                            sent,                      # Sentence to encode.
+                            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
+                       )
+
+        input_ids_test.append(encoded_sent)
+
+    # Run each encoded sentence through the model and collect the predicted
+    # class index. A single sentence needs no padding, so the attention mask
+    # is all ones.
+    device = next(model.parameters()).device
+    model.eval()
+    predictions = []
+    with torch.no_grad():
+        for encoded_sent in input_ids_test:
+            b_input_ids = torch.tensor([encoded_sent]).to(device)
+            b_input_mask = torch.tensor([[1] * len(encoded_sent)]).to(device)
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+            logits = outputs[0]
+            predictions.append(int(torch.argmax(logits, dim=1).item()))
+
+    return predictions
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
index 72a5929c733d95aebeab139af781661c184e984b..285be2d9a72d13d6cc693ad2a1c373b571e5fe86 100644
--- a/training_bertFineTuning.py
+++ b/training_bertFineTuning.py
@@ -2,464 +2,399 @@ import torch
 import pandas as pd
 import numpy as np
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from transformers import BertTokenizer, CamembertTokenizer
 from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
 from transformers import get_linear_schedule_with_warmup
 import time
 import datetime
 import random
+import os
+
+
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-
-def remove_weak_classes(df, classColumnName, threshold):
-
-    dictOfClassInstances = create_dict(df,classColumnName)
-
-
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-
-
-def resample_classes(df, classColumnName, numberOfInstances):
-
-    #random numberOfInstances elements
-    replace = False  # with replacement
-
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-
-##############################################################################################################
-########################## Setup GPU #########################################################################
-##############################################################################################################
-
-# If there's a GPU available...
-if torch.cuda.is_available():
-    # Tell PyTorch to use the GPU.
-    device = torch.device("cuda")
-    print('There are %d GPU(s) available.' % torch.cuda.device_count())
-    print('We will use the GPU:', torch.cuda.get_device_name(0))
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
-# If not...
-else: - print('No GPU available, using the CPU instead.') - device = torch.device("cpu") + # Format as hh:mm:ss + return str(datetime.timedelta(seconds=elapsed_rounded)) +def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs = 4): + # If there's a GPU available... + if torch.cuda.is_available(): -############################################################################################################# -########################## parameters ################################################################### -########################################################################################################### + # Tell PyTorch to use the GPU. + device = torch.device("cuda") -config = configparser.ConfigParser() -config.read('settings.conf') + print('There are %d GPU(s) available.' % torch.cuda.device_count()) -dataPath = config.get('general','dataPath') -columnText = config.get('general','columnText') -columnClass = config.get('general','columnClass') + print('We will use the GPU:', torch.cuda.get_device_name(0)) -minOfInstancePerClass = int(config.get('general','minOfInstancePerClass')) -maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass')) + # If not... + else: + print('No GPU available, using the CPU instead.') + device = torch.device("cpu") -chosen_tokeniser = config.get('model','tokeniser') -chosen_model = config.get('model','model') -max_len = int(config.get('model','max_len_sequences')) -############################################################################################################# -########################## Load Data ################################################################### +############################################################################################################ +########################## Model: Tokenization & Input Formatting ################################################################### ########################################################################################################### + if chosen_model == 'bert-base-multilingual-cased' : + print('Loading Bert Tokenizer...') + tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True) + elif chosen_model == 'camembert-base': + print('Loading Camembert Tokenizer...') + tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True) -df = pd.read_csv(dataPath) -df = remove_weak_classes(df, columnClass, minOfInstancePerClass) -df = resample_classes(df, columnClass, maxOfInstancePerClass) -df = df[df[columnClass] != 'unclassified'] - + # Tokenize all of the sentences and map the tokens to thier word IDs. + input_ids = [] + # For every sentence... + for sent in sentences: + # `encode` will: + # (1) Tokenize the sentence. + # (2) Prepend the `[CLS]` token to the start. + # (3) Append the `[SEP]` token to the end. + # (4) Map tokens to their IDs. + encoded_sent = tokenizer.encode( + sent, # Sentence to encode. + add_special_tokens = True, # Add '[CLS]' and '[SEP]' + # This function also supports truncation and conversion + # to pytorch tensors, but I need to do padding, so I + # can't use these features. + #max_length = 128, # Truncate all sentences. + #return_tensors = 'pt', # Return pytorch tensors. + ) -y = df[columnClass] -numberOfClasses = y.nunique() -encoder = preprocessing.LabelEncoder() -y = encoder.fit_transform(y) + # Add the encoded sentence to the list. 
+ input_ids.append(encoded_sent) -sentences = train_x[columnText].values -labels = train_y.tolist() + padded = [] + for i in input_ids: + if len(i) > max_len: + padded.extend([i[:max_len]]) + else: + padded.extend([i + [0] * (max_len - len(i))]) -############################################################################################################ -########################## Model: Tokenization & Input Formatting ################################################################### -########################################################################################################### - - -# Load the BERT tokenizer. -print('Loading BERT tokenizer...') -tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True) + padded = np.array(padded) - # Tokenize all of the sentences and map the tokens to thier word IDs. -input_ids = [] -# For every sentence... -for sent in sentences: - # `encode` will: - # (1) Tokenize the sentence. - # (2) Prepend the `[CLS]` token to the start. - # (3) Append the `[SEP]` token to the end. - # (4) Map tokens to their IDs. - encoded_sent = tokenizer.encode( - sent, # Sentence to encode. - add_special_tokens = True, # Add '[CLS]' and '[SEP]' - # This function also supports truncation and conversion - # to pytorch tensors, but I need to do padding, so I - # can't use these features. - #max_length = 128, # Truncate all sentences. - #return_tensors = 'pt', # Return pytorch tensors. - ) + # Create attention masks + attention_masks = [] - # Add the encoded sentence to the list. - input_ids.append(encoded_sent) + # For each sentence... + for sent in padded: + # Create the attention mask. + # - If a token ID is 0, then it's padding, set the mask to 0. + # - If a token ID is > 0, then it's a real token, set the mask to 1. + att_mask = [int(token_id > 0) for token_id in sent] + # Store the attention mask for this sentence. + attention_masks.append(att_mask) -padded = [] -for i in input_ids: + # Use 90% for training and 10% for validation. + train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels ) + # Do the same for the masks. + train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels) - if len(i) > max_len: - padded.extend([i[:max_len]]) - else: - padded.extend([i + [0] * (max_len - len(i))]) + # Convert all inputs and labels into torch tensors, the required datatype + # for my model. + train_inputs = torch.tensor(train_inputs) + validation_inputs = torch.tensor(validation_inputs) -padded = input_ids = np.array(padded) + train_labels = torch.tensor(train_labels) + validation_labels = torch.tensor(validation_labels) + train_masks = torch.tensor(train_masks) + validation_masks = torch.tensor(validation_masks) - # Create attention masks -attention_masks = [] -# For each sentence... -for sent in padded: - # Create the attention mask. - # - If a token ID is 0, then it's padding, set the mask to 0. - # - If a token ID is > 0, then it's a real token, set the mask to 1. - att_mask = [int(token_id > 0) for token_id in sent] + # The DataLoader needs to know the batch size for training, so I specify it here. + # For fine-tuning BERT on a specific task, the authors recommend a batch size of + # 16 or 32. - # Store the attention mask for this sentence. - attention_masks.append(att_mask) + # Create the DataLoader for training set. 
+ train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) -# Use 90% for training and 10% for validation. -train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, - random_state=2018, test_size=0.1, stratify = labels ) -# Do the same for the masks. -train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, - random_state=2018, test_size=0.1, stratify = labels) + # Create the DataLoader for validation set. + validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) + validation_sampler = SequentialSampler(validation_data) + validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) -# Convert all inputs and labels into torch tensors, the required datatype -# for my model. -train_inputs = torch.tensor(train_inputs) -validation_inputs = torch.tensor(validation_inputs) -train_labels = torch.tensor(train_labels) -validation_labels = torch.tensor(validation_labels) -train_masks = torch.tensor(train_masks) -validation_masks = torch.tensor(validation_masks) + print(' Selecting a model .....') + numberOfClasses = len(set(labels)) -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -# The DataLoader needs to know the batch size for training, so I specify it here. -# For fine-tuning BERT on a specific task, the authors recommend a batch size of -# 16 or 32. + # Load BertForSequenceClassification, the pretrained BERT model with a single + # linear classification layer on top. + if chosen_model == 'bert-base-multilingual-cased': + model = BertForSequenceClassification.from_pretrained( + chosen_model, # Use the 12-layer BERT model, with an uncased vocab. + num_labels = numberOfClasses, # The number of output labels--2 for binary classification. + # You can increase this for multi-class tasks. + output_attentions = False, # Whether the model returns attentions weights. + output_hidden_states = False, # Whether the model returns all hidden-states. + ) + elif chosen_model == 'camembert-base': -batch_size = int(config.get('model','batch_size')) + model = CamembertForSequenceClassification.from_pretrained( + chosen_model, # Use the 12-layer BERT model, with an uncased vocab. + num_labels = numberOfClasses, # The number of output labels--2 for binary classification. + # You can increase this for multi-class tasks. + output_attentions = False, # Whether the model returns attentions weights. + output_hidden_states = False, # Whether the model returns all hidden-states. + ) -# Create the DataLoader for training set. -train_data = TensorDataset(train_inputs, train_masks, train_labels) -train_sampler = RandomSampler(train_data) -train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) -# Create the DataLoader for validation set. -validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) -validation_sampler = SequentialSampler(validation_data) -validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) + # Tell pytorch to run this model on the GPU. 
+ model.cuda() + #Note: AdamW is a class from the huggingface library (as opposed to pytorch) + # I believe the 'W' stands for 'Weight Decay fix" + optimizer = AdamW(model.parameters(), + lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 + eps = 1e-8 # args.adam_epsilon - default is 1e-8. + ) -############################################################################################################ -########################## Model: Training ################################################################### -########################################################################################################### -print(' Selecting a model .....') + # Total number of training steps is number of batches * number of epochs. + total_steps = len(train_dataloader) * epochs + # Create the learning rate scheduler. + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, # Default value in run_glue.py + num_training_steps = total_steps) -# Load BertForSequenceClassification, the pretrained BERT model with a single -# linear classification layer on top. -model = BertForSequenceClassification.from_pretrained( - chosen_model, # Use the 12-layer BERT model, with an uncased vocab. - num_labels = numberOfClasses, # The number of output labels--2 for binary classification. - # You can increase this for multi-class tasks. - output_attentions = False, # Whether the model returns attentions weights. - output_hidden_states = False, # Whether the model returns all hidden-states. -) -# Tell pytorch to run this model on the GPU. -model.cuda() + # This training code is based on the `run_glue.py` script here: + # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 -#Note: AdamW is a class from the huggingface library (as opposed to pytorch) -# I believe the 'W' stands for 'Weight Decay fix" -optimizer = AdamW(model.parameters(), - lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 - eps = 1e-8 # args.adam_epsilon - default is 1e-8. - ) + # Set the seed value all over the place to make this reproducible. + seed_val = 42 + random.seed(seed_val) + np.random.seed(seed_val) + torch.manual_seed(seed_val) + torch.cuda.manual_seed_all(seed_val) + # Store the average loss after each epoch so I can plot them. + loss_values = [] -# Number of training epochs (authors recommend between 2 and 4) -epochs = int(config.get('model','epochs')) + # For each epoch... + for epoch_i in range(0, epochs): -# Total number of training steps is number of batches * number of epochs. -total_steps = len(train_dataloader) * epochs + # ======================================== + # Training + # ======================================== -# Create the learning rate scheduler. -scheduler = get_linear_schedule_with_warmup(optimizer, - num_warmup_steps = 0, # Default value in run_glue.py - num_training_steps = total_steps) + # Perform one full pass over the training set. + print("") + print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) + print('Training...') -def flat_accuracy(preds, labels): - pred_flat = np.argmax(preds, axis=1).flatten() - labels_flat = labels.flatten() - return np.sum(pred_flat == labels_flat) / len(labels_flat) + # Measure how long the training epoch takes. + t0 = time.time() + # Reset the total loss for this epoch. + total_loss = 0 + # Put the model into training mode. + model.train() + # For each batch of training data... 
+ for step, batch in enumerate(train_dataloader): + # Progress update every 40 batches. + if step % 40 == 0 and not step == 0: + # Calculate elapsed time in minutes. + elapsed = format_time(time.time() - t0) -def format_time(elapsed): - ''' - Takes a time in seconds and returns a string hh:mm:ss - ''' - # Round to the nearest second. - elapsed_rounded = int(round((elapsed))) + # Report progress. + print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) + # Unpack this training batch from the dataloader. + # + # As I unpack the batch, I'll also copy each tensor to the GPU using the + # `to` method. + # + # `batch` contains three pytorch tensors: + # [0]: input ids + # [1]: attention masks + # [2]: labels + b_input_ids = batch[0].to(device) + b_input_mask = batch[1].to(device) + b_labels = batch[2].to(device) + # Always clear any previously calculated gradients before performing a + # backward pass. PyTorch doesn't do this automatically because + # accumulating the gradients is "convenient while training RNNs". + # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) + model.zero_grad() + # Perform a forward pass (evaluate the model on this training batch). + # This will return the loss (rather than the model output) because I + # have provided the `labels`. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels) + # The call to `model` always returns a tuple, so I need to pull the + # loss value out of the tuple. + loss = outputs[0] -# This training code is based on the `run_glue.py` script here: -# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 + # Accumulate the training loss over all of the batches so that I can + # calculate the average loss at the end. `loss` is a Tensor containing a + # single value; the `.item()` function just returns the Python value + # from the tensor. + total_loss += loss.item() + # Perform a backward pass to calculate the gradients. + loss.backward() -# Set the seed value all over the place to make this reproducible. -seed_val = 42 + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) -random.seed(seed_val) -np.random.seed(seed_val) -torch.manual_seed(seed_val) -torch.cuda.manual_seed_all(seed_val) + # Update parameters and take a step using the computed gradient. + # The optimizer dictates the "update rule"--how the parameters are + # modified based on their gradients, the learning rate, etc. + optimizer.step() -# Store the average loss after each epoch so I can plot them. -loss_values = [] + # Update the learning rate. + scheduler.step() -# For each epoch... -for epoch_i in range(0, epochs): + # Calculate the average loss over the training data. + avg_train_loss = total_loss / len(train_dataloader) - # ======================================== - # Training - # ======================================== + # Store the loss value for plotting the learning curve. + loss_values.append(avg_train_loss) - # Perform one full pass over the training set. 
+ print("") + print(" Average training loss: {0:.2f}".format(avg_train_loss)) + print(" Training epoch took: {:}".format(format_time(time.time() - t0))) - print("") - print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) - print('Training...') - - # Measure how long the training epoch takes. - t0 = time.time() - - # Reset the total loss for this epoch. - total_loss = 0 - - # Put the model into training mode. - model.train() - - # For each batch of training data... - for step, batch in enumerate(train_dataloader): - - # Progress update every 40 batches. - if step % 40 == 0 and not step == 0: - # Calculate elapsed time in minutes. - elapsed = format_time(time.time() - t0) - - # Report progress. - print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) - - # Unpack this training batch from the dataloader. - # - # As I unpack the batch, I'll also copy each tensor to the GPU using the - # `to` method. - # - # `batch` contains three pytorch tensors: - # [0]: input ids - # [1]: attention masks - # [2]: labels - b_input_ids = batch[0].to(device) - b_input_mask = batch[1].to(device) - b_labels = batch[2].to(device) - - # Always clear any previously calculated gradients before performing a - # backward pass. PyTorch doesn't do this automatically because - # accumulating the gradients is "convenient while training RNNs". - # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) - model.zero_grad() - - # Perform a forward pass (evaluate the model on this training batch). - # This will return the loss (rather than the model output) because I - # have provided the `labels`. - # The documentation for this `model` function is here: - # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification - outputs = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - labels=b_labels) - - # The call to `model` always returns a tuple, so I need to pull the - # loss value out of the tuple. - loss = outputs[0] - - # Accumulate the training loss over all of the batches so that I can - # calculate the average loss at the end. `loss` is a Tensor containing a - # single value; the `.item()` function just returns the Python value - # from the tensor. - total_loss += loss.item() - - # Perform a backward pass to calculate the gradients. - loss.backward() - - # Clip the norm of the gradients to 1.0. - # This is to help prevent the "exploding gradients" problem. - torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) - - # Update parameters and take a step using the computed gradient. - # The optimizer dictates the "update rule"--how the parameters are - # modified based on their gradients, the learning rate, etc. - optimizer.step() - - # Update the learning rate. - scheduler.step() - - # Calculate the average loss over the training data. - avg_train_loss = total_loss / len(train_dataloader) - - # Store the loss value for plotting the learning curve. - loss_values.append(avg_train_loss) + # ======================================== + # Validation + # ======================================== + # After the completion of each training epoch, measure the performance on + # the validation set. 
- print("") - print(" Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:}".format(format_time(time.time() - t0))) + print("") + print("Running Validation...") - # ======================================== - # Validation - # ======================================== - # After the completion of each training epoch, measure the performance on - # the validation set. + t0 = time.time() - print("") - print("Running Validation...") + # Put the model in evaluation mode--the dropout layers behave differently + # during evaluation. + model.eval() - t0 = time.time() + # Tracking variables + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 - # Put the model in evaluation mode--the dropout layers behave differently - # during evaluation. - model.eval() + # Evaluate data for one epoch + for batch in validation_dataloader: - # Tracking variables - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 + # Add batch to GPU + batch = tuple(t.to(device) for t in batch) - # Evaluate data for one epoch - for batch in validation_dataloader: + # Unpack the inputs from dataloader + b_input_ids, b_input_mask, b_labels = batch - # Add batch to GPU - batch = tuple(t.to(device) for t in batch) + # Telling the model not to compute or store gradients, saving memory and + # speeding up validation + with torch.no_grad(): - # Unpack the inputs from dataloader - b_input_ids, b_input_mask, b_labels = batch + # Forward pass, calculate logit predictions. + # This will return the logits rather than the loss because we have + # not provided labels. + # token_type_ids is the same as the "segment ids", which + # differentiates sentence 1 and 2 in 2-sentence tasks. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask) - # Telling the model not to compute or store gradients, saving memory and - # speeding up validation - with torch.no_grad(): + # Get the "logits" output by the model. The "logits" are the output + # values prior to applying an activation function like the softmax. + logits = outputs[0] - # Forward pass, calculate logit predictions. - # This will return the logits rather than the loss because we have - # not provided labels. - # token_type_ids is the same as the "segment ids", which - # differentiates sentence 1 and 2 in 2-sentence tasks. - # The documentation for this `model` function is here: - # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification - outputs = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask) + # Move logits and labels to CPU + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() - # Get the "logits" output by the model. The "logits" are the output - # values prior to applying an activation function like the softmax. - logits = outputs[0] + # Calculate the accuracy for this batch of test sentences. + tmp_eval_accuracy = flat_accuracy(logits, label_ids) - # Move logits and labels to CPU - logits = logits.detach().cpu().numpy() - label_ids = b_labels.to('cpu').numpy() + # Accumulate the total accuracy. + eval_accuracy += tmp_eval_accuracy - # Calculate the accuracy for this batch of test sentences. 
- tmp_eval_accuracy = flat_accuracy(logits, label_ids) + # Track the number of batches + nb_eval_steps += 1 - # Accumulate the total accuracy. - eval_accuracy += tmp_eval_accuracy + # Report the final accuracy for this validation run. + print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) + print(" Validation took: {:}".format(format_time(time.time() - t0))) - # Track the number of batches - nb_eval_steps += 1 + print("") + print("Training complete!") + return model - # Report the final accuracy for this validation run. - print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) - print(" Validation took: {:}".format(format_time(time.time() - t0))) -print("") -print("Training complete!") +'''print('Saving Model....') +model_save_name = config.get('model','modelName') +path = config.get('model','path') +#torch.save(model.state_dict(), os.path.join(path,model_save_name)) +torch.save(model, os.path.join(path,model_save_name))'''
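
For reference, main.py pulls every run-time setting from bert_settings.conf through configparser. The sketch below lists only the sections and option names that the code actually queries; every value shown is an illustrative assumption, not part of the patch.

[general]
dataPath = data/corpus.csv
columnText = text
columnClass = class
minOfInstancePerClass = 50
maxOfInstancePerClass = 1500

[model]
tokeniser = camembert-base
model = camembert-base
max_len_sequences = 256
batch_size = 32
epochs = 4
modelName = camembert_finetuned.pt
path = models/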
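
Once main.py has fine-tuned and saved the model with torch.save, it can be reloaded for ad-hoc predictions through predict_instance_bertFineTuning. The following is a minimal sketch, not part of the patch: the file paths and the column name 'class' are placeholder assumptions, and the LabelEncoder is refitted here because the patch does not persist it (the index-to-name mapping only lines up if the encoder sees exactly the same set of classes as in main.py).

import torch
import pandas as pd
from sklearn import preprocessing

from predict_bertFineTuning import predict_instance_bertFineTuning

chosen_model = 'camembert-base'                       # assumption: value of [model] model
model = torch.load('models/camembert_finetuned.pt',   # assumption: [model] path + modelName
                   map_location='cpu')

# Assumption: refit the label encoder on the same (filtered) class column used in main.py
# so that predicted indices can be mapped back to class names.
df = pd.read_csv('data/corpus.csv')                   # assumption: value of [general] dataPath
encoder = preprocessing.LabelEncoder().fit(df['class'])

sentences = ["Une phrase d'exemple à classer."]
pred_ids = predict_instance_bertFineTuning(chosen_model, model, sentences)
print(encoder.inverse_transform(pred_ids))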