Commit a7d08239 authored by Ludovic Moncla
Update Classification_BertFineTuning.ipynb

%% Cell type:markdown id: tags:
# BERT fine-tuning for EDdA classification
%% Cell type:markdown id: tags:
## Setup colab environment
%% Cell type:code id: tags:
``` python
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
    print('Not using a high-RAM runtime')
else:
    print('You are using a high-RAM runtime!')
```
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
## Setup GPU
%% Cell type:code id: tags:
``` python
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# For macOS (Apple Silicon), use the MPS backend.
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
```
%% Output
We will use the GPU
%% Cell type:markdown id: tags:
## Install packages
%% Cell type:code id: tags:
``` python
!pip install transformers==4.10.3
!pip install sentencepiece
```
%% Cell type:markdown id: tags:
## Import libraries
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import csv
import os
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
```
%% Cell type:markdown id: tags:
## Utils functions
%% Cell type:code id: tags:
``` python
def resample_classes(df, classColumnName, numberOfInstances):
    # Keep at most numberOfInstances randomly chosen rows per class.
    replace = False  # sample without replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
```
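%% Cell type:markdown id: tags:
A quick sanity check of `resample_classes` on a hypothetical toy DataFrame (the rows below are made up purely for illustration): each class should be capped at the requested number of instances.
%% Cell type:code id: tags:
``` python
# Hypothetical toy data, only to illustrate the per-class cap.
toy = pd.DataFrame({'super_domain': ['A'] * 5 + ['B'] * 2,
                    'contentWithoutClass': ['some text'] * 7})
print(resample_classes(toy, 'super_domain', 3)['super_domain'].value_counts())
# Expected: class 'A' is capped at 3 rows, class 'B' keeps its 2 rows.
```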
%% Cell type:markdown id: tags:
## Load Data
%% Cell type:code id: tags:
``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv
```
%% Cell type:code id: tags:
``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set_superdomains.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set_superdomains.tsv
```
%% Cell type:markdown id: tags:
### Loading dataset
%% Cell type:code id: tags:
``` python
#train_path = '../data/training_set.tsv'
#test_path = '../data/test_set.tsv'
train_path = '../data/training_set_superdomains.tsv'
test_path = '../data/test_set_superdomains.tsv'
```
%% Cell type:code id: tags:
``` python
df_train = pd.read_csv(train_path, sep="\t")
df_train.head()
```
%% Cell type:code id: tags:
``` python
print(df_train.shape)
```
%% Cell type:markdown id: tags:
## Configuration
%% Cell type:code id: tags:
``` python
columnText = 'contentWithoutClass'
#columnClass = 'ensemble_domaine_enccre'
columnClass = 'super_domain'
maxOfInstancePerClass = 10000
model_chosen = "bert"
#model_chosen = "camembert"
batch_size = 16 # 16 or 32 recommended
max_len = 512
#path = "drive/MyDrive/Classification-EDdA/"
path = "../models/new/"
encoder_filename = "label_encoder.pkl"
```
%% Cell type:markdown id: tags:
## Preprocessing
%% Cell type:code id: tags:
``` python
if maxOfInstancePerClass != 10000:
    df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)
```
%% Cell type:code id: tags:
``` python
labels = df_train[columnClass]
numberOfClasses = labels.nunique()

if os.path.isfile(path + encoder_filename):
    # Load the existing label encoder.
    with open(path + encoder_filename, 'rb') as file:
        encoder = pickle.load(file)
else:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(labels)
    with open(path + encoder_filename, 'wb') as file:
        pickle.dump(encoder, file)

labels = encoder.transform(labels)
```
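%% Cell type:markdown id: tags:
Optional check of the label encoding (a minimal sketch using the fitted `encoder` from the previous cell): it prints the class name behind each integer id.
%% Cell type:code id: tags:
``` python
# Map each integer label back to its class name.
for class_id, class_name in enumerate(encoder.classes_):
    print(class_id, '->', class_name)
```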
%% Cell type:code id: tags:
``` python
sentences_train = df_train[columnText].values
labels_train = labels.tolist()
```
%% Cell type:code id: tags:
``` python
sentences_train
```
%% Cell type:markdown id: tags:
# Model
## Tokenisation & Input Formatting
%% Cell type:code id: tags:
``` python
if model_chosen == "bert":
    tokeniser_bert = 'bert-base-multilingual-cased'
    model_bert = "bert-base-multilingual-cased"
elif model_chosen == "camembert":
    tokeniser_bert = 'camembert-base'
    model_bert = 'camembert-base'
```
%% Cell type:code id: tags:
``` python
# Load the BERT tokenizer.
if model_chosen == "bert":
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)
elif model_chosen == "camembert":
    print('Loading CamemBERT tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)
```
%% Cell type:code id: tags:
``` python
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids_train = []

# For every sentence...
for sent in sentences_train:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent_train = tokenizer.encode(
        str(sent),                  # Sentence to encode.
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        # `encode` also supports truncation and conversion to PyTorch
        # tensors, but padding is done manually below, so these
        # options are not used here.
        #max_length = 128,          # Truncate all sentences.
        #return_tensors = 'pt',     # Return PyTorch tensors.
    )
    # Add the encoded sentence to the list.
    input_ids_train.append(encoded_sent_train)
```
%% Cell type:code id: tags:
``` python
print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))
```
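%% Cell type:markdown id: tags:
Since BERT inputs are capped at `max_len` tokens (512 here), it can be useful to see how many encoded articles will be truncated (a minimal sketch using the `input_ids_train` and `max_len` defined above):
%% Cell type:code id: tags:
``` python
# Count how many encoded articles exceed max_len and will be truncated below.
n_truncated = sum(1 for ids in input_ids_train if len(ids) > max_len)
print('{} of {} articles are longer than {} tokens and will be truncated.'.format(
    n_truncated, len(input_ids_train), max_len))
```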
%% Cell type:code id: tags:
``` python
# Truncate to max_len, or pad with zeros up to max_len.
padded_train = []
for i in input_ids_train:
    if len(i) > max_len:
        padded_train.extend([i[:max_len]])
    else:
        padded_train.extend([i + [0] * (max_len - len(i))])

padded_train = input_ids_train = np.array(padded_train)
```
%% Cell type:code id: tags:
``` python
# Create attention masks
attention_masks_train = []

# For each sentence...
for sent in padded_train:
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    # Store the attention mask for this sentence.
    attention_masks_train.append(att_mask)
```
%% Cell type:code id: tags:
``` python
# Use 70% for training and 30% for validation.
#train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
# random_state=2018, test_size=0.3, stratify = labels)
# Do the same for the masks.
#train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
# random_state=2018, test_size=0.3, stratify = labels)
```
%% Cell type:code id: tags:
``` python
# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(padded_train)
train_labels = torch.tensor(labels_train)
train_masks = torch.tensor(attention_masks_train)
```
%% Cell type:code id: tags:
``` python
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
```
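%% Cell type:markdown id: tags:
Optional sanity check of the DataLoader (a minimal sketch): one batch should yield input ids and attention masks of shape `[batch_size, max_len]`, and labels of shape `[batch_size]`.
%% Cell type:code id: tags:
``` python
# Inspect the tensor shapes of the first training batch.
batch_ids, batch_masks, batch_labels = next(iter(train_dataloader))
print(batch_ids.shape, batch_masks.shape, batch_labels.shape)
```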
%% Cell type:markdown id: tags:
## Training
%% Cell type:code id: tags:
``` python
# Load BertForSequenceClassification or CamembertForSequenceClassification:
# the pretrained model with a single linear classification layer on top.
if model_chosen == "bert":
    model = BertForSequenceClassification.from_pretrained(
        model_bert,                    # The 12-layer multilingual cased BERT model.
        num_labels = numberOfClasses,  # The number of output labels (one per class).
        output_attentions = False,     # Whether the model returns attention weights.
        output_hidden_states = False,  # Whether the model returns all hidden states.
    )
elif model_chosen == "camembert":
    model = CamembertForSequenceClassification.from_pretrained(
        model_bert,                    # The 12-layer CamemBERT model.
        num_labels = numberOfClasses,  # The number of output labels (one per class).
        output_attentions = False,     # Whether the model returns attention weights.
        output_hidden_states = False,  # Whether the model returns all hidden states.
    )

# Tell PyTorch to run this model on the selected device.
#model.cuda()
model.to(device)
```
%% Cell type:code id: tags:
``` python
# Note: AdamW is a class from the Hugging Face transformers library
# (as opposed to PyTorch); the 'W' stands for 'weight decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,   # args.learning_rate - default is 5e-5; this notebook uses 2e-5
                  eps = 1e-8   # args.adam_epsilon - default is 1e-8.
                  )
```
%% Cell type:code id: tags:
``` python
# Number of training epochs (the BERT authors recommend between 2 and 4).
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
```
%% Cell type:code id: tags:
``` python
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so the learning curve can be plotted.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 5 batches.
        if step % 5 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader and copy each tensor
        # to the device using the `to` method.
        #
        # `batch` contains three PyTorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This returns the loss (rather than just the model output) because
        # the `labels` are provided.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # The call to `model` returns a tuple, so the loss value has to be
        # pulled out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that the
        # average loss can be calculated at the end. `loss` is a Tensor
        # containing a single value; the `.item()` function just returns the
        # Python value from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
```
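%% Cell type:markdown id: tags:
The average loss per epoch stored in `loss_values` can be plotted as a learning curve (a minimal sketch using the matplotlib and seaborn imports from earlier):
%% Cell type:code id: tags:
``` python
# Plot the average training loss per epoch.
sns.set(style='darkgrid')
plt.plot(range(1, len(loss_values) + 1), loss_values, 'b-o')
plt.title('Training loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
```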
%% Cell type:markdown id: tags:
## Saving model
%% Cell type:code id: tags:
``` python
name = model_bert + "_s" + str(maxOfInstancePerClass)
model_path = path + "model_"+name+".pt"
```
%% Cell type:code id: tags:
``` python
#torch.save(model, model_path)
```
%% Cell type:code id: tags:
``` python
model.save_pretrained(model_path)
# ludo: changed the way the model is saved (save_pretrained instead of torch.save)
```
%% Cell type:markdown id: tags:
## Loading model
%% Cell type:code id: tags:
``` python
#model = torch.load(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
```
%% Cell type:markdown id: tags:
## Evaluation
%% Cell type:code id: tags:
``` python
def evaluate_bert(data, labels, model, batch_size):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    # For every sentence...
    for sent in data:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            str(sent),                  # Sentence to encode.
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids.append(encoded_sent)

    # Pad or truncate our input tokens to max_len.
    padded = []
    for i in input_ids:
        if len(i) > max_len:
            padded.extend([i[:max_len]])
        else:
            padded.extend([i + [0] * (max_len - len(i))])
    input_ids = np.array(padded)

    # Create attention masks: a mask of 1s for each real token, 0s for padding.
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to the device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print('    DONE.')

    pred_labels = []
    # Convert the logits of each batch into label predictions.
    print('Calculating the metrics for each batch...')
    for i in range(len(true_labels)):
        # The predictions for this batch are an [n_samples x n_classes]
        # ndarray of logits; pick the label with the highest score.
        pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_
```
%% Cell type:code id: tags:
``` python
dataset = "test"
df_eval = pd.read_csv(test_path, sep="\t")
data_eval = df_eval[columnText].values
y = df_eval[columnClass]
y = encoder.transform(y)
labels = y.tolist()
model_path = path+"/model_"+model_bert+"_s"+str(maxOfInstancePerClass)+".pt"
model = torch.load(model_path)
if model_bert == "bert-base-multilingual-cased":
tokenizer = BertTokenizer.from_pretrained(model_bert)
elif model_bert == "camembert-base":
tokenizer = CamembertTokenizer.from_pretrained(model_bert)
pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
report = classification_report(true_labels_, pred_labels_, output_dict = True)
classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_
precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in classes:
precision.append(report[c]['precision'])
recall.append(report[c]['recall'])
f1.append(report[c]['f1-score'])
support.append(report[c]['support'])
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN
print(name)
name = "test_"+ name
content = name + "\n"
print(name)
content += str(weighted_avg) + "\n"
print(weighted_avg)
print(accuracy)
print(dff)
dff.to_csv(path+"/report_"+name+".csv", index=False)
# enregistrer les predictions
pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(path+"/predictions/predictions_"+name+".csv")
with open(path+"reports/report_"+name+".txt", 'w') as f:
f.write(content)
```
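%% Cell type:markdown id: tags:
The confusion matrix computed above can also be visualised as a heatmap (a minimal sketch using seaborn and the `cnf_matrix` / `classesName` variables from the previous cell):
%% Cell type:code id: tags:
``` python
# Heatmap of the confusion matrix with the decoded class names.
plt.figure(figsize=(10, 8))
sns.heatmap(cnf_matrix, annot=True, fmt='d',
            xticklabels=classesName, yticklabels=classesName)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()
```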
%% Cell type:code id: tags:
``` python
model_path = "drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt"
```
%% Cell type:code id: tags:
``` python
model = torch.load(model_path)
```
%% Cell type:code id: tags:
``` python
!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
```
%% Cell type:code id: tags:
``` python
df_LGE = pd.read_csv("LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Cell type:code id: tags:
``` python
df_LGE.shape
```
%% Cell type:code id: tags:
``` python
def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size=8, max_len=512):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading BERT tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model)
    elif chosen_model == 'camembert-base':
        print('Loading CamemBERT tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                       # Sentence to encode.
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # Pad or truncate our input tokens to max_len.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks: a mask of 1s for each real token, 0s for padding.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
    return prediction_dataloader


def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # For macOS (Apple Silicon), use the MPS backend, as in the setup cell.
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print('We will use the GPU')
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []
    pred_labels_ = []

    # Predict
    for batch in sentences_to_predict_dataloader:
        # Add batch to the device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits to CPU
        logits = logits.detach().cpu().numpy()
        # Store predictions
        predictions_test.append(logits)

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for this batch are an [n_samples x n_classes]
        # ndarray of logits; pick the label with the highest score.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_
```
%% Cell type:code id: tags:
``` python
data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)
#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)
```
%% Cell type:code id: tags:
``` python
p = predict_class_bertFineTuning( model, data_loader )
```
%% Cell type:code id: tags:
``` python
len(p)
```
%% Cell type:code id: tags:
``` python
# The encoder should be saved; otherwise it has to be rebuilt from the
# training set in order to recover the class names.
encoder
```
%% Cell type:code id: tags:
``` python
p2 = list(encoder.inverse_transform(p))
```
%% Cell type:code id: tags:
``` python
p2
```
%% Cell type:code id: tags:
``` python
df_LGE['class_bert'] = p2
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Cell type:code id: tags:
``` python
df_LGE.to_csv("drive/MyDrive/Classification-EDdA/classification_LGE.tsv", sep="\t")
```