# predict_bertFineTuning.py
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels,
                                   batch_size=32, max_len=128):
    # NOTE: max_len used to be an undefined global; it is now a parameter.
    # The default of 128 is an assumption -- use the same sequence length
    # as at fine-tuning time.
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    else:
        raise ValueError('Unsupported model: %s' % chosen_model)
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)
    # Pad (or truncate) our input tokens to max_len.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)
    # Create attention masks:
    # a mask of 1s for each real token followed by 0s for padding.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)
    # Create the DataLoader (sequential sampling, since order matters for
    # matching predictions back to sentences).
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler,
                                       batch_size=batch_size)
    return prediction_dataloader
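# Minimal usage sketch for generate_prediction_dataloader. The sentences and
# labels below are hypothetical placeholders; max_len should match the
# sequence length used when the model was fine-tuned.
def _example_generate_dataloader():
    test_sentences = ["Ceci est une phrase.", "En voici une autre."]
    test_labels = [0, 1]
    return generate_prediction_dataloader('camembert-base', test_sentences,
                                          test_labels, batch_size=32, max_len=128)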
def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    # Move the model to the chosen device (it must live on the same device
    # as the input batches) and put it in evaluation mode.
    model.to(device)
    model.eval()
    # Tracking variables
    predictions_test, true_labels = [], []
    # Predict
    for batch in sentences_to_predict_dataloader:
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs[0]
        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels.
        predictions_test.append(logits)
        true_labels.append(label_ids)
    print(' DONE.')
    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for this batch are a 2-column ndarray (one column
        # for "0" and one column for "1"). Pick the label with the highest
        # value and turn this into a list of 0s and 1s.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)
    # Flatten the per-batch lists into flat lists of predicted and true labels.
    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_
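# Hedged end-to-end sketch: load a fine-tuned classifier and run
# predict_class_bertFineTuning on a prediction DataLoader. The checkpoint
# path './fine_tuned_model' and the sentences are hypothetical placeholders.
def _example_predict_class():
    from transformers import CamembertForSequenceClassification
    model = CamembertForSequenceClassification.from_pretrained('./fine_tuned_model')
    dataloader = generate_prediction_dataloader('camembert-base',
                                                ["Phrase à classer.", "Encore une."],
                                                [0, 1])
    pred_labels_, true_labels_ = predict_class_bertFineTuning(model, dataloader)
    print(pred_labels_, true_labels_)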
def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict,
                                    max_len=128):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    else:
        raise ValueError('Unsupported model: %s' % chosen_model)
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)
    # Pad (or truncate) to a fixed length and build the attention masks,
    # exactly as in generate_prediction_dataloader.
    padded = [ids[:max_len] if len(ids) > max_len
              else ids + [0] * (max_len - len(ids))
              for ids in input_ids_test]
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device
    b_input_ids = torch.tensor(padded).to(device)
    b_input_mask = torch.tensor([[float(i > 0) for i in seq]
                                 for seq in padded]).to(device)
    # Telling the model not to compute or store gradients, saving memory
    # and speeding up prediction.
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs[0]
    # Return the predicted class index for each sentence.
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
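# Hedged single-call sketch for predict_instance_bertFineTuning: classify raw
# sentences without building a DataLoader. The model path and sentence are
# hypothetical placeholders.
def _example_predict_instance():
    from transformers import CamembertForSequenceClassification
    model = CamembertForSequenceClassification.from_pretrained('./fine_tuned_model')
    model.eval()
    preds = predict_instance_bertFineTuning('camembert-base', model,
                                            ["Une phrase à classer."])
    print(preds)  # e.g. array([0]) or array([1]) for a binary classifier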