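"""Prediction helpers for classifiers fine-tuned with BERT / CamemBERT."""
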
import torch
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer

def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, batch_size=32, max_len=128):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    else:
        raise ValueError('Unsupported model: %s' % chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    )

        input_ids_test.append(encoded_sent)

    # Pad or truncate every sequence to exactly `max_len` tokens.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Use the batch size passed in as an argument (default 32).

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader
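
# A minimal usage sketch for the helper above, assuming a CamemBERT setup;
# the sentences and labels below are toy placeholders, not data from the
# original project.
def _demo_generate_prediction_dataloader():
    sentences = ["Première phrase à classer.", "Deuxième phrase à classer."]
    labels = [0, 1]
    dataloader = generate_prediction_dataloader('camembert-base', sentences,
                                                labels, batch_size=2, max_len=32)
    for input_ids, attention_mask, batch_labels in dataloader:
        print(input_ids.shape, attention_mask.shape, batch_labels.shape)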



def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):

    # If there's a GPU available, tell PyTorch to use it.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Move the model to the chosen device and put it in evaluation mode.
    model.to(device)
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []

    # Predict
    for batch in sentences_to_predict_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print('    DONE.')

    # Combine the per-batch logits into one flat list of predicted labels.
    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for each batch are an ndarray with one column per
        # class. Pick the label with the highest logit for every sentence.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_

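# A hedged evaluation sketch for the function above, assuming scikit-learn is
# available; `accuracy_score` compares the flattened predicted and true labels.
def _demo_evaluate_predictions(model, dataloader):
    from sklearn.metrics import accuracy_score
    pred_labels, true_labels = predict_class_bertFineTuning(model, dataloader)
    print('Accuracy: %.3f' % accuracy_score(true_labels, pred_labels))
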

def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    else:
        raise ValueError('Unsupported model: %s' % chosen_model)

    model.eval()

    # Tokenize each sentence, map the tokens to their word IDs, and run the
    # model on it immediately.
    logits_list = []
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    )

        # Batch of one sentence; no padding, so the attention mask is all ones.
        input_ids = torch.tensor([encoded_sent])
        attention_mask = torch.ones_like(input_ids)

        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(input_ids, token_type_ids=None,
                            attention_mask=attention_mask)

        logits_list.append(outputs[0])

    return logits_list
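

# End-to-end sketch: load a fine-tuned CamemBERT classifier and score a few
# sentences. The checkpoint path './model_camembert' is a hypothetical
# placeholder for wherever the fine-tuned weights were saved.
if __name__ == '__main__':
    from transformers import CamembertForSequenceClassification

    model = CamembertForSequenceClassification.from_pretrained('./model_camembert')
    sentences = ["Une phrase de test."]
    print(predict_instance_bertFineTuning('camembert-base', model, sentences))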