Skip to content
Snippets Groups Projects
Commit 90192d15 authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update Classification_BertFineTuning.ipynb

parent 13110f08
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# BERT fine-tuning for EDdA classification # BERT fine-tuning for EDdA classification
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Setup colab environment ## Setup colab environment
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Report how much memory the runtime has, to tell whether a high-RAM
# (e.g. Colab Pro) runtime is in use.
from psutil import virtual_memory

# `total` is the total physical memory in bytes; convert to gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
    print('Not using a high-RAM runtime')
else:
    print('You are using a high-RAM runtime!')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Mount Google Drive when running on Colab. The `google.colab` package only
# exists there, so skip the mount instead of crashing on a local machine.
try:
    from google.colab import drive
except ImportError:
    print('Not running on Google Colab, skipping Google Drive mount.')
else:
    drive.mount('/content/drive')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Setup GPU ## Setup GPU
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import torch

# Pick the best available device: CUDA GPU, then Apple-Silicon GPU (MPS),
# then CPU. The result is stored in `device` for the rest of the notebook.
# Older PyTorch builds have no `torch.backends.mps`, hence the getattr.
_mps_backend = getattr(torch.backends, "mps", None)

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# for MacOS
elif _mps_backend is not None and _mps_backend.is_available() and _mps_backend.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Install packages ## Install packages
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
!pip install transformers==4.10.3 !pip install transformers==4.10.3
!pip install sentencepiece !pip install sentencepiece
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Import libraries
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import csv
import datetime
import os
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import *
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # Recent scikit-learn releases no longer ship this helper; keep the
    # notebook importable and fail only if the helper is actually called.
    plot_confusion_matrix = None
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Utils functions ## Utils functions
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def resample_classes(df, classColumnName, numberOfInstances):
    """Down-sample each class of ``df`` to at most ``numberOfInstances`` rows.

    Rows are drawn at random WITHOUT replacement using NumPy's global random
    state. Classes that have ``numberOfInstances`` rows or fewer are kept
    whole.

    Args:
        df: DataFrame to sample from.
        classColumnName: name of the column holding the class of each row.
        numberOfInstances: maximum number of rows to keep per class.

    Returns:
        A DataFrame with every column of ``df`` and a two-level index
        ``(group position, original index label)``.
    """
    samples = []
    for _, group in df.groupby(classColumnName):
        size = min(len(group), numberOfInstances)
        # Sample row positions rather than index labels so that duplicate
        # index labels in `df` cannot select extra rows.
        picked = np.random.choice(len(group), size, replace=False)
        samples.append(group.iloc[picked])

    if not samples:
        # Nothing to group (empty frame or only missing class values).
        return df.iloc[:0].copy()

    return pd.concat(samples, keys=range(len(samples)))
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: array-like of true class ids, one per example.

    Returns:
        The accuracy as a float between 0 and 1.
    """
    pred_flat = np.argmax(np.asarray(preds), axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string h:mm:ss
    (e.g. 3661.4 -> '1:01:01').
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as h:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Load Data ## Load Data
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv !wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv
!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv !wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Loading dataset ### Loading dataset
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Default location of the datasets in the project layout.
train_path = '../data/training_set.tsv'
test_path = '../data/test_set.tsv'

# The download cell saves the files in the working directory, so fall back
# to that copy when the project-layout file is missing.
if not os.path.isfile(train_path) and os.path.isfile('training_set.tsv'):
    train_path = 'training_set.tsv'
if not os.path.isfile(test_path) and os.path.isfile('test_set.tsv'):
    test_path = 'test_set.tsv'
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Read the tab-separated training set and preview its first rows.
df_train = pd.read_csv(train_path, sep="\t")
df_train.head()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# (number of rows, number of columns) of the training set.
print(df_train.shape)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Configuration ## Configuration
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Columns of the dataset holding the text to classify and its class.
columnText = 'contentWithoutClass'
columnClass = 'ensemble_domaine_enccre'

# Maximum number of training rows kept per class. The value 10000 is treated
# by the preprocessing cell as "no resampling".
maxOfInstancePerClass = 10000

# Which pretrained model to fine-tune: "bert" or "camembert".
model_chosen = "bert"
#model_chosen = "camembert"

batch_size = 16  # 16 or 32 recommended
# Sequences are truncated / padded to this many token ids.
max_len = 512

# Directory where the label encoder and the fine-tuned model are stored.
#path = "drive/MyDrive/Classification-EDdA/"
path = "../models/new/"
encoder_filename = "label_encoder.pkl"
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Preprocessing ## Preprocessing
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# 10000 is the "keep everything" sentinel from the configuration cell; any
# other value caps the number of training rows per class.
if maxOfInstancePerClass != 10000:
    df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
labels = df_train[columnClass]

encoder_path = path + encoder_filename
if os.path.isfile(encoder_path):
    # load existing encoder
    # NOTE: unpickling runs arbitrary code; only load a file you created.
    with open(encoder_path, 'rb') as file:
        encoder = pickle.load(file)
else:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(labels)
    # Create the output directory on first use, otherwise `open` fails.
    os.makedirs(os.path.dirname(encoder_path) or ".", exist_ok=True)
    with open(encoder_path, 'wb') as file:
        pickle.dump(encoder, file)

# Size the classifier from the encoder, not from the labels present in this
# (possibly resampled) frame: a reused encoder may know more classes, and the
# ids it produces must all be valid output indices of the model.
numberOfClasses = len(encoder.classes_)

# Replace the class names by their integer ids.
labels = encoder.transform(labels)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Raw texts (array) and their encoded class ids (plain Python list).
sentences_train = df_train[columnText].values
labels_train = labels.tolist()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the training texts.
sentences_train
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Model # Model
## Tokenisation & Input Formatting ## Tokenisation & Input Formatting
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Name of the pretrained checkpoint used for both the tokenizer and the model.
if model_chosen == "bert":
    tokeniser_bert = 'bert-base-multilingual-cased'
    model_bert = "bert-base-multilingual-cased"
elif model_chosen == "camembert":
    tokeniser_bert = 'camembert-base'
    model_bert = 'camembert-base'
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("Unknown model_chosen: {!r} (expected 'bert' or 'camembert')".format(model_chosen))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the tokenizer matching the chosen model.
if model_chosen == "bert":
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)
elif model_chosen == "camembert":
    print('Loading CamemBERT tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("Unknown model_chosen: {!r} (expected 'bert' or 'camembert')".format(model_chosen))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Tokenize all of the sentences and map the tokens to their word IDs.
# For every sentence, `encode` will:
#   (1) Tokenize the sentence.
#   (2) Add the model's special start token (`[CLS]` for BERT).
#   (3) Add the model's special end token (`[SEP]` for BERT).
#   (4) Map tokens to their IDs.
# No truncation or padding happens here: sequences are cut / padded to
# `max_len` in a later cell.
input_ids_train = [
    tokenizer.encode(
        str(sent),                  # Sentence to encode (str() guards against NaN / non-string cells).
        add_special_tokens = True,  # Add the special start / end tokens.
    )
    for sent in sentences_train
]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Longest tokenised sequence, before truncation to `max_len`.
print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Bring every sequence to exactly `max_len` ids: longer ones are cut
# (dropping their trailing tokens, including the final special token),
# shorter ones are right-padded with 0. The attention-mask cell relies on
# 0 being the padding value.
padded_train = [
    ids[:max_len] + [0] * (max_len - len(ids))  # a negative count adds nothing
    for ids in input_ids_train
]
padded_train = input_ids_train = np.array(padded_train)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Create attention masks.
#   - If a token ID is 0, then it's padding, set the mask to 0.
#   - If a token ID is > 0, then it's a real token, set the mask to 1.
# One vectorised comparison replaces the per-token Python loop; the result is
# still a list of lists of ints.
attention_masks_train = (padded_train > 0).astype(np.int64).tolist()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Use 70% for training and 30% for validation. # Use 70% for training and 30% for validation.
#train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, #train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
# random_state=2018, test_size=0.3, stratify = labels) # random_state=2018, test_size=0.3, stratify = labels)
# Do the same for the masks. # Do the same for the masks.
#train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, #train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
# random_state=2018, test_size=0.3, stratify = labels) # random_state=2018, test_size=0.3, stratify = labels)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Convert all inputs and labels into torch tensors, the required datatype
# for the model.
train_inputs = torch.tensor(padded_train)
train_labels = torch.tensor(labels_train)
train_masks = torch.tensor(attention_masks_train)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The DataLoader needs to know the batch size for training, so it is given
# here. For fine-tuning BERT on a specific task, the authors recommend a
# batch size of 16 or 32.

# Create the DataLoader for the training set; RandomSampler shuffles the
# examples at every epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Training ## Training
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the pretrained model with a single linear classification layer on
# top. The class depends on the chosen model family.
model_classes = {
    "bert": BertForSequenceClassification,
    "camembert": CamembertForSequenceClassification,
}
if model_chosen not in model_classes:
    raise ValueError("Unknown model_chosen: {!r} (expected 'bert' or 'camembert')".format(model_chosen))

model = model_classes[model_chosen].from_pretrained(
    model_bert,                    # Name of the pretrained checkpoint.
    num_labels = numberOfClasses,  # One output per class of the dataset.
    output_attentions = False,     # Whether the model returns attentions weights.
    output_hidden_states = False,  # Whether the model returns all hidden-states.
)

# Run the model on the device selected in the "Setup GPU" cell. The training
# loop sends every batch to `device`, so the model must be on that device.
model.to(device)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# The 'W' refers to its decoupled weight-decay handling.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8  # args.adam_epsilon  - default is 1e-8.
                  )
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
# NOTE: the model was created in an earlier cell, so any random
# initialisation done there is not covered by this seed.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Print a progress line every `log_every` batches.
log_every = 5

# Store the average loss after each epoch so it can be plotted.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every `log_every` batches (never on the first one).
        if step % log_every == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted by format_time.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader and copy each tensor
        # to the selected device.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass: PyTorch accumulates gradients by default.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # Because `labels` is provided, the first element of the output is
        # the loss.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Pull the loss value out of the model output.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches to compute the
        # average at the end. `loss` is a Tensor containing a single value;
        # `.item()` returns it as a Python number.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Saving model ## Saving model
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Where the fine-tuned model is stored: the name combines the pretrained
# checkpoint and the per-class sample cap used for training.
name = model_bert + "_s" + str(maxOfInstancePerClass)
model_path = path + "model_"+name+".pt"
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#torch.save(model, model_path) #torch.save(model, model_path)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# ludo: changed the way the model is saved (`save_pretrained` instead of
# `torch.save`), so it can be reloaded with `from_pretrained`.
model.save_pretrained(model_path)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Loading model ## Loading model
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#model = torch.load(model_path)
# Reload the fine-tuned weights with the class matching the chosen model
# family, and put the model on the device selected in the "Setup GPU" cell.
if model_chosen == "camembert":
    model = CamembertForSequenceClassification.from_pretrained(model_path).to(device)
else:
    model = BertForSequenceClassification.from_pretrained(model_path).to(device)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Evaluation ## Evaluation
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def evaluate_bert(data, labels, model, batch_size):
    """Predict a class id for every text in ``data``.

    Uses the notebook-level ``tokenizer``, ``max_len`` and ``device``. Texts
    are tokenised, cut / zero-padded to ``max_len`` and fed to ``model`` in
    batches of ``batch_size``, in their original order.

    Args:
        data: iterable of texts; each item is converted with ``str()``.
        labels: true class ids, one per text, convertible by ``torch.tensor``.
        model: sequence-classification model whose first output is the logits.
        batch_size: number of texts per forward pass.

    Returns:
        ``(pred_labels_, true_labels_)``: two flat lists with the predicted
        and the true class id of every text.
    """
    # Tokenize all of the sentences and map the tokens to their word IDs.
    # `encode` tokenizes, adds the special start / end tokens and maps the
    # tokens to their IDs.
    input_ids = [
        tokenizer.encode(
            str(sent),                  # Sentence to encode.
            add_special_tokens = True,  # Add the special start / end tokens.
        )
        for sent in data
    ]

    # Cut long sequences and right-pad short ones with 0 so that all rows
    # have exactly `max_len` ids (a negative repeat count adds nothing).
    input_ids = np.array([
        ids[:max_len] + [0] * (max_len - len(ids)) for ids in input_ids
    ])

    # Attention mask: 1.0 for each real token, 0.0 for padding (id 0).
    attention_masks = (input_ids > 0).astype(np.float32)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader; SequentialSampler keeps the input order.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print(' DONE.')

    print('Calculating the matrics for each batch...')
    # Each batch of logits has one column per class; the predicted class is
    # the column with the highest value. Flatten the batches into one list.
    pred_labels_ = [
        label
        for batch_logits in predictions
        for label in np.argmax(batch_logits, axis=1).flatten()
    ]
    true_labels_ = [item for sublist in true_labels for item in sublist]

    return pred_labels_, true_labels_
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import os

# Evaluate the saved fine-tuned model on the held-out split, then write the
# per-class report, the raw predictions and a short text summary under `path`.
dataset = "test"
df_eval = pd.read_csv(dataset + "_set.tsv", sep="\t")

data_eval = df_eval[columnText].values

# Gold classes mapped to integer ids by the label encoder defined earlier.
y = df_eval[columnClass]
y = encoder.transform(y)
labels = y.tolist()

model_path = os.path.join(path, "model_" + model_bert + "_s" + str(maxOfInstancePerClass) + ".pt")
model = torch.load(model_path)

# NOTE(review): `tokenizer` is not passed to evaluate_bert, so it is presumably
# read there as a notebook global -- confirm before removing this assignment.
if model_bert == "bert-base-multilingual-cased":
    tokenizer = BertTokenizer.from_pretrained(model_bert)
elif model_bert == "camembert-base":
    tokenizer = CamembertTokenizer.from_pretrained(model_bert)

pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)

report = classification_report(true_labels_, pred_labels_, output_dict=True)

# The report is keyed by the stringified integer label of each class.
classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_

precision = [report[c]['precision'] for c in classes]
recall = [report[c]['recall'] for c in classes]
f1 = [report[c]['f1-score'] for c in classes]
support = [report[c]['support'] for c in classes]

accuracy = report['accuracy']
weighted_avg = report['weighted avg']

# Per-class one-vs-rest counts derived from the confusion matrix:
# column sums minus the diagonal are false positives, row sums minus the
# diagonal are false negatives.
cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
TP = np.diag(cnf_matrix)
FP = cnf_matrix.sum(axis=0) - TP
FN = cnf_matrix.sum(axis=1) - TP
TN = cnf_matrix.sum() - (FP + FN + TP)

dff = pd.DataFrame({
    'className': classesName,
    'precision': precision,
    'recall': recall,
    'f1-score': f1,
    'support': support,
    'FP': FP,
    'FN': FN,
    'TP': TP,
    'TN': TN,
})

print(name)

# NOTE: `name` is rebound here, so re-running this cell prepends "test_" again.
name = "test_" + name
content = name + "\n"
print(name)
content += str(weighted_avg) + "\n"
print(weighted_avg)
print(accuracy)
print(dff)

dff.to_csv(os.path.join(path, "report_" + name + ".csv"), index=False)

# Save the predictions next to the gold labels.
predictions_dir = os.path.join(path, "predictions")
os.makedirs(predictions_dir, exist_ok=True)
pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(os.path.join(predictions_dir, "predictions_" + name + ".csv"))

# The original built this path as path+"reports/..." (no separator), unlike
# every other output path in this cell; os.path.join keeps them consistent.
reports_dir = os.path.join(path, "reports")
os.makedirs(reports_dir, exist_ok=True)
with open(os.path.join(reports_dir, "report_" + name + ".txt"), 'w') as f:
    f.write(content)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Hard-coded location on the mounted Google Drive of the saved
# bert-base-multilingual-cased checkpoint (the "_s10000" suffix follows the
# "_s" + maxOfInstancePerClass naming used when building model paths above).
model_path = "drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt"
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the object stored at `model_path`; it is used below as a callable model.
# NOTE(review): no map_location is given, so this presumably needs the same
# kind of device the checkpoint was saved from -- confirm on CPU-only runtimes.
model = torch.load(model_path)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv !wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the LGE dataset downloaded above (tab-separated) and keep the raw text
# of the "content" column as the inputs to classify.
df_LGE = pd.read_csv("LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
# evaluate_bert is intentionally not called here (left disabled):
#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the first rows of the LGE dataframe.
df_LGE.head()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the (rows, columns) size of the LGE dataframe.
df_LGE.shape
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size=8, max_len=512):
    """Tokenize raw texts and wrap them in a sequential DataLoader for inference.

    Args:
        chosen_model: Name of the pretrained tokenizer to load; either
            'bert-base-multilingual-cased' or 'camembert-base'.
        sentences_to_predict: Iterable of texts to encode.
        batch_size: Number of sequences per batch.
        max_len: Fixed length every sequence is truncated or padded to.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask)`` batches in the
        same order as ``sentences_to_predict``.

    Raises:
        ValueError: If ``chosen_model`` is not one of the supported names
            (previously this surfaced later as an UnboundLocalError).
    """
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)
    else:
        raise ValueError(
            "Unsupported model %r: expected 'bert-base-multilingual-cased' "
            "or 'camembert-base'" % (chosen_model,)
        )

    # Tokenize every text, add the special tokens, map tokens to ids, then
    # force each id sequence to exactly `max_len` entries.
    padded_test = []
    for sent in sentences_to_predict:
        encoded_sent = tokenizer.encode(
            sent,                       # Text to encode.
            add_special_tokens=True,    # Add the start/end special tokens.
        )
        if len(encoded_sent) > max_len:
            # Plain truncation: the trailing special token is cut off too.
            # NOTE(review): kept as-is so inputs stay identical to what this
            # notebook has always fed the model -- confirm before changing.
            padded_test.append(encoded_sent[:max_len])
        else:
            # Right-pad with id 0.
            padded_test.append(encoded_sent + [0] * (max_len - len(encoded_sent)))

    input_ids_test = np.array(padded_test)

    # Convert to tensors. The mask is 1.0 wherever the id is > 0 and 0.0 on
    # the zero padding, computed in one vectorized step.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = (prediction_inputs > 0).float()

    # Sequential sampling keeps predictions aligned with the input order.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader
def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    """Run the classifier over a DataLoader and return the predicted class ids.

    Args:
        model: Callable model accepting ``(input_ids, token_type_ids=None,
            attention_mask=...)`` whose first output holds the logits.
        sentences_to_predict_dataloader: Iterable of ``(input_ids,
            attention_mask)`` tensor batches.

    Returns:
        A flat list with one predicted label id (argmax over the logits of
        each row) per input sequence, in dataloader order.
    """
    # Use the GPU when there is one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # The batches are moved to `device` below, so the model must live there
    # too; otherwise the forward pass fails with a device mismatch.
    model.to(device)

    # Put model in evaluation mode.
    model.eval()

    pred_labels_ = []

    # No gradients are needed for prediction: saves memory and time.
    with torch.no_grad():
        for batch in sentences_to_predict_dataloader:
            b_input_ids, b_input_mask = (t.to(device) for t in batch)

            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs[0].detach().cpu().numpy()

            # Reduce each batch to label ids right away instead of keeping
            # every logit array around until the end.
            pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Encode the LGE texts with the multilingual BERT tokenizer (default batch
# size and maximum length); switch to the line below for CamemBERT.
data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)
#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Predicted class ids, one per LGE text, in dataframe order.
p = predict_class_bertFineTuning( model, data_loader )
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Number of predictions produced.
len(p)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The encoder should be saved to disk; otherwise it has to be rebuilt from
# the training set in order to recover the class names.
encoder
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Map the predicted integer ids back to their original class values.
p2 = list(encoder.inverse_transform(p))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the decoded predictions.
p2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Store the decoded prediction of each row in a new 'class_bert' column.
df_LGE['class_bert'] = p2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the first rows again, now including the 'class_bert' column.
df_LGE.head()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Write the classified dataset to Google Drive as a tab-separated file.
# No `index` argument is passed, so pandas also writes the row index as the
# first column.
df_LGE.to_csv("drive/MyDrive/Classification-EDdA/classification_LGE.tsv", sep="\t")
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment