import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer


def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels,
                                   max_len=128, batch_size=32):
    # `max_len` is the maximum sequence length used for truncation/padding.
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        # This checkpoint is cased, so the input must not be lower-cased.
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
        )
        input_ids_test.append(encoded_sent)

    # Truncate long sequences and pad short ones to exactly `max_len` tokens.
    padded_test = []
    for ids in input_ids_test:
        if len(ids) > max_len:
            padded_test.append(ids[:max_len])
        else:
            padded_test.append(ids + [0] * (max_len - len(ids)))
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1 for each real token, 0 for padding.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(tok_id > 0) for tok_id in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader. Sequential sampling keeps the predictions
    # aligned with the input sentence order.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)
    return prediction_dataloader
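
# --- Usage sketch -----------------------------------------------------------
# A minimal, hypothetical illustration of building a prediction DataLoader;
# the sentences, labels, max_len and batch_size below are made-up placeholder
# values, not part of the original module:
#
#   sentences = ["Ce film est excellent.", "Je n'ai pas aimé ce livre."]
#   labels = [1, 0]
#   dataloader = generate_prediction_dataloader('camembert-base',
#                                               sentences, labels,
#                                               max_len=64, batch_size=2)
#   # Each batch yields (input_ids, attention_mask, labels) tensors of shape
#   # (batch_size, max_len), (batch_size, max_len) and (batch_size,).
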
def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    # If there's a GPU available, use it; otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Move the model to the chosen device and put it in evaluation mode.
    model.to(device)
    model.eval()

    # Tracking variables.
    predictions_test, true_labels = [], []

    # Predict.
    for batch in sentences_to_predict_dataloader:
        # Add batch to the device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels.
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print('    DONE.')

    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for this batch are a 2-column ndarray (one column
        # for "0" and one column for "1"). Pick the label with the highest
        # value and turn this into a list of 0s and 1s.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    # Flatten the per-batch lists into flat lists of labels.
    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_


def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict, max_len=128):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        # This checkpoint is cased, so the input must not be lower-cased.
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    for sent in sentences_to_predict:
        # `encode` tokenizes the sentence, prepends '[CLS]', appends '[SEP]'
        # and maps tokens to their IDs.
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids_test.append(encoded_sent)

    # Pad/truncate to `max_len` and build the attention masks, exactly as in
    # generate_prediction_dataloader.
    padded_test = [ids[:max_len] if len(ids) > max_len
                   else ids + [0] * (max_len - len(ids))
                   for ids in input_ids_test]
    attention_masks = [[float(tok_id > 0) for tok_id in seq] for seq in padded_test]

    # Build input tensors on the same device as the model.
    device = next(model.parameters()).device
    b_input_ids = torch.tensor(padded_test).to(device)
    b_input_mask = torch.tensor(attention_masks).to(device)

    model.eval()
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
    logits = outputs[0]

    # Return the predicted class for each sentence.
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
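
# --- End-to-end sketch --------------------------------------------------------
# A minimal sketch of running the full prediction pipeline. The checkpoint
# directory './fine_tuned_camembert', the sentences and the labels are
# hypothetical placeholders, not part of the original module; any fine-tuned
# sequence-classification checkpoint saved with save_pretrained() would do.
if __name__ == '__main__':
    from transformers import CamembertForSequenceClassification

    sentences = ["Ce film est excellent.", "Je n'ai pas aimé ce livre."]
    labels = [1, 0]

    # Load a fine-tuned model (hypothetical local path).
    model = CamembertForSequenceClassification.from_pretrained('./fine_tuned_camembert')

    # Batch prediction via the DataLoader helper...
    dataloader = generate_prediction_dataloader('camembert-base', sentences, labels,
                                                max_len=64, batch_size=2)
    pred_labels, true_labels = predict_class_bertFineTuning(model, dataloader)
    print('Predicted:', pred_labels)
    print('True:     ', true_labels)

    # ...or direct, single-shot prediction on a list of sentences.
    print('Instance predictions:',
          predict_instance_bertFineTuning('camembert-base', model, sentences, max_len=64))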