import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import os
import argparse
import configparser


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    # Drop every row whose class has fewer than `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # Keep at most `numberOfInstances` randomly chosen rows per class.
    replace = False  # sample without replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index,
                                              numberOfInstances if len(obj) > numberOfInstances else len(obj),
                                              replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


def training_bertFineTuning(chosen_model, model_path, sentences, labels, max_len, batch_size, epochs=4):

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    ############################################################################################################
    ########################## Model: Tokenization & Input Formatting #########################################
    ############################################################################################################

    if chosen_model == 'bert':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
    elif chosen_model == 'camembert':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(model_path, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []

    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            str(sent),                # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            # This function also supports truncation and conversion
            # to pytorch tensors, but I need to do padding, so I
            # can't use these features.
            #max_length = 128,        # Truncate all sentences.
            #return_tensors = 'pt',   # Return pytorch tensors.
        )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_sent)

    # Pad or truncate every sequence to exactly `max_len` tokens (0 is the padding id).
    padded = []
    for i in input_ids:
        if len(i) > max_len:
            padded.extend([i[:max_len]])
        else:
            padded.extend([i + [0] * (max_len - len(i))])

    padded = np.array(padded)

    # Create attention masks
    attention_masks = []

    # For each sentence...
    for sent in padded:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]

        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)

    # Use 70% for training and 30% for validation (test_size=0.3).
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        padded, labels, random_state=2018, test_size=0.3, stratify=labels)
    # Do the same for the masks.
    train_masks, validation_masks, _, _ = train_test_split(
        attention_masks, labels, random_state=2018, test_size=0.3, stratify=labels)

    # Convert all inputs and labels into torch tensors, the required datatype
    # for the model.
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)

    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)

    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # The DataLoader needs to know the batch size for training, so I specify it here.
    # For fine-tuning BERT on a specific task, the authors recommend a batch size of
    # 16 or 32.

    # Create the DataLoader for the training set.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for the validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    print(' Selecting a model .....')
    numberOfClasses = len(set(labels))

    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    if chosen_model == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            model_path,                   # Path or name of the pretrained BERT model.
            num_labels=numberOfClasses,   # The number of output labels (one per class).
            output_attentions=False,      # Whether the model returns attentions weights.
            output_hidden_states=False,   # Whether the model returns all hidden-states.
        )
    elif chosen_model == 'camembert':
        model = CamembertForSequenceClassification.from_pretrained(
            model_path,                   # Path or name of the pretrained CamemBERT model.
            num_labels=numberOfClasses,   # The number of output labels (one per class).
            output_attentions=False,      # Whether the model returns attentions weights.
            output_hidden_states=False,   # Whether the model returns all hidden-states.
        )

    # Run the model on the selected device (GPU if available, CPU otherwise).
    model.to(device)

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    # The 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(model.parameters(),
                      lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5.
                      eps=1e-8   # args.adam_epsilon - default is 1e-8.
                      )

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)

    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Store the average loss after each epoch so I can plot them.
    loss_values = []

    # For each epoch...
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0

        # Put the model into training mode.
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from the dataloader.
            #
            # As I unpack the batch, I'll also copy each tensor to the device using the
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because I
            # have provided the `labels`.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # The call to `model` always returns a tuple, so I need to pull the
            # loss value out of the tuple.
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that I can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print(" Average training loss: {0:.2f}".format(avg_train_loss))
        print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================

        # After the completion of each training epoch, measure the performance on
        # the validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:

            # Add batch to the device
            batch = tuple(t.to(device) for t in batch)

            # Unpack the inputs from the dataloader
            b_input_ids, b_input_mask, b_labels = batch

            # Telling the model not to compute or store gradients, saving memory and
            # speeding up validation
            with torch.no_grad():

                # Forward pass, calculate logit predictions.
                # This will return the logits rather than the loss because we have
                # not provided labels.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs[0]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            # Accumulate the total accuracy.
            eval_accuracy += tmp_eval_accuracy

            # Track the number of batches
            nb_eval_steps += 1

        # Report the final accuracy for this validation run.
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) print(" Validation took: {:}".format(format_time(time.time() - t0))) print("") print("Training complete!") return model '''print('Saving Model....') model_save_name = config.get('model','modelName') path = config.get('model','path') #torch.save(model.state_dict(), os.path.join(path,model_save_name)) torch.save(model, os.path.join(path,model_save_name))''' if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("input_dataset") parser.add_argument("conf_file") parser.add_argument("output_path") args = parser.parse_args() INPUT_DATASET = args.input_dataset CONF_FILE = args.conf_file OUTPUT_PATH = args.output_path config = configparser.ConfigParser() config.read(CONF_FILE) #dataPath = config.get('general','dataPath') columnText = config.get('general','columnText') columnClass = config.get('general','columnClass') minOfInstancePerClass = int(config.get('general','minOfInstancePerClass')) maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass')) model_path = config.get('model','path') chosen_model = config.get('model','model') max_len = int(config.get('model','max_len_sequences')) batch_size = int(config.get('model','batch_size')) epochs = int(config.get('model','epochs')) df = pd.read_csv(INPUT_DATASET, sep="\t") df = remove_weak_classes(df, columnClass, minOfInstancePerClass) df = resample_classes(df, columnClass, maxOfInstancePerClass) #df = df[df[columnClass] != 'unclassified'] y = df[columnClass] numberOfClasses = y.nunique() encoder = preprocessing.LabelEncoder() y = encoder.fit_transform(y) #train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y ) #sentences = train_x[columnText].values sentences = df[columnText].values #labels = train_y.tolist() labels = y.tolist() #call train method model = training_bertFineTuning(chosen_model,model_path, sentences, labels, max_len, batch_size, epochs) #save the model model_save_name = chosen_model+"_b"+batch_size+"_e"+epochs torch.save(model, os.path.join(OUTPUT_PATH,model_save_name)) #print the model parameters params = list(model.named_parameters()) print('The BERT model has {:} different named parameters.\n'.format(len(params))) print('==== Embedding Layer ====\n') for p in params[0:5]: print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) print('\n==== First Transformer ====\n') for p in params[5:21]: print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) print('\n==== Output Layer ====\n') for p in params[-4:]: print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))