%% Cell type:markdown id: tags:

# BERT fine-tuning for EDdA classification

%% Cell type:markdown id: tags:

## Setup Colab environment

%% Cell type:code id: tags:

``` python
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')
```

%% Cell type:code id: tags:

``` python
from google.colab import drive
drive.mount('/content/drive')
```

%% Cell type:markdown id: tags:

## Setup GPU

%% Cell type:code id: tags:

``` python
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# for MacOS
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
```

%% Output

    We will use the GPU

%% Cell type:markdown id: tags:

## Install packages

%% Cell type:code id: tags:

``` python
!pip install transformers==4.10.3
!pip install sentencepiece
```

%% Cell type:markdown id: tags:

## Import libraries

%% Cell type:code id: tags:

``` python
import pandas as pd
import numpy as np
import csv
import os
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

import time
import datetime

import random

import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
```

%% Cell type:markdown id: tags:

## Utility functions

%% Cell type:code id: tags:

``` python
def resample_classes(df, classColumnName, numberOfInstances):
  # Keep at most numberOfInstances randomly chosen rows per class
  replace = False  # sample without replacement
  fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
  return df.groupby(classColumnName, as_index=False).apply(fn)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
  '''
  Takes a time in seconds and returns a string hh:mm:ss
  '''
  # Round to the nearest second.
  elapsed_rounded = int(round((elapsed)))

  # Format as hh:mm:ss
  return str(datetime.timedelta(seconds=elapsed_rounded))
```
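
%% Cell type:markdown id: tags:

A minimal usage sketch of the helpers above (the toy DataFrame and logits below are illustrative only, not part of the EDdA data):

%% Cell type:code id: tags:

``` python
# Hypothetical toy data to illustrate resample_classes and flat_accuracy.
toy = pd.DataFrame({'label': ['A'] * 5 + ['B'] * 2,
                    'text': list('abcdefg')})
balanced = resample_classes(toy, 'label', 2)  # keep at most 2 rows per class
print(balanced['label'].value_counts())

# flat_accuracy expects raw logits (one column per class) and integer labels.
logits = np.array([[0.1, 0.9], [0.8, 0.2]])
print(flat_accuracy(logits, np.array([1, 0])))  # -> 1.0
```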

%% Cell type:markdown id: tags:

## Load Data

%% Cell type:code id: tags:

``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv
```

%% Cell type:code id: tags:

``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set_superdomains.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set_superdomains.tsv
```

%% Cell type:markdown id: tags:

### Loading dataset

%% Cell type:code id: tags:

``` python
#train_path = '../data/training_set.tsv'
#test_path =  '../data/test_set.tsv'

train_path = '../data/training_set_superdomains.tsv'
test_path =  '../data/test_set_superdomains.tsv'
```

%% Cell type:code id: tags:

``` python
df_train = pd.read_csv(train_path, sep="\t")
df_train.head()
```

%% Cell type:code id: tags:

``` python
print(df_train.shape)
```

%% Cell type:markdown id: tags:

## Configuration

%% Cell type:code id: tags:

``` python
columnText = 'contentWithoutClass'
#columnClass = 'ensemble_domaine_enccre'
columnClass = 'super_domain'

maxOfInstancePerClass = 10000

model_chosen = "bert"
#model_chosen = "camembert"

batch_size = 16  # 16 or 32 recommended
max_len = 512

#path = "drive/MyDrive/Classification-EDdA/"
path = "../models/new/"
encoder_filename = "label_encoder.pkl"
```

%% Cell type:markdown id: tags:

## Preprocessing

%% Cell type:code id: tags:

``` python
if maxOfInstancePerClass != 10000:
  df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)
```

%% Cell type:code id: tags:

``` python
labels  = df_train[columnClass]
numberOfClasses = labels.nunique()


if os.path.isfile(path+encoder_filename):
    # load existing encoder
    with open(path+encoder_filename, 'rb') as file:
      encoder = pickle.load(file)

else:
  encoder = preprocessing.LabelEncoder()
  encoder.fit(labels)

  with open(path+encoder_filename, 'wb') as file:
      pickle.dump(encoder, file)


labels = encoder.transform(labels)
```
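
%% Cell type:markdown id: tags:

A quick sanity check of the label mapping (assumes the encoder above is fitted):

%% Cell type:code id: tags:

``` python
# Integer id -> class name mapping produced by the LabelEncoder.
for idx, cls in enumerate(encoder.classes_):
    print(idx, cls)

# inverse_transform goes back from integer ids to class names.
print(encoder.inverse_transform([0]))
```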

%% Cell type:code id: tags:

``` python
sentences_train = df_train[columnText].values
labels_train = labels.tolist()
```

%% Cell type:code id: tags:

``` python
sentences_train
```

%% Cell type:markdown id: tags:

# Model
## Tokenisation & Input Formatting

%% Cell type:code id: tags:

``` python
if model_chosen == "bert":
  tokeniser_bert = 'bert-base-multilingual-cased'
  model_bert =  "bert-base-multilingual-cased"
elif model_chosen == "camembert":
  tokeniser_bert = 'camembert-base'
  model_bert = 'camembert-base'
```

%% Cell type:code id: tags:

``` python
# Load the BERT tokenizer.
if model_chosen == "bert":
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)
elif model_chosen == "camembert":
  print('Loading CamemBERT tokenizer...')
  tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)
```

%% Cell type:code id: tags:

``` python
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids_train = []

# For every sentence...
for sent in sentences_train:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent_train = tokenizer.encode(
                        str(sent),                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # `encode` also supports truncation and conversion to
                        # PyTorch tensors, but padding is handled manually
                        # below, so those options are not used here.
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids_train.append(encoded_sent_train)

```

%% Cell type:code id: tags:

``` python
print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))
```

%% Cell type:code id: tags:

``` python

padded_train = []
for i in input_ids_train:

  if len(i) > max_len:
    padded_train.extend([i[:max_len]])
  else:
    padded_train.extend([i + [0] * (max_len - len(i))])


padded_train = input_ids_train = np.array(padded_train)
```

%% Cell type:code id: tags:

``` python
 # Create attention masks
attention_masks_train = []

# For each sentence...
for sent in padded_train:

    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks_train.append(att_mask)

```
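
%% Cell type:markdown id: tags:

As an aside, the manual padding and masking above could also be delegated to the tokenizer itself. A sketch, assuming the tokenizer's `__call__` interface with `padding`/`truncation` (the variable name `encoded` is illustrative):

%% Cell type:code id: tags:

``` python
# Alternative sketch: let the tokenizer pad/truncate and build the attention
# masks in one call (equivalent in spirit to the loops above).
encoded = tokenizer(
    [str(s) for s in sentences_train],
    add_special_tokens=True,
    padding='max_length',   # pad every sequence to max_len
    truncation=True,        # cut sequences longer than max_len
    max_length=max_len,
    return_tensors='pt',    # return PyTorch tensors directly
)
# encoded['input_ids'] and encoded['attention_mask'] could then replace
# padded_train and attention_masks_train.
```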

%% Cell type:code id: tags:

``` python
# Use 70% for training and 30% for validation.
#train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
#                                                            random_state=2018, test_size=0.3, stratify = labels)
# Do the same for the masks.
#train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
#                                             random_state=2018, test_size=0.3, stratify = labels)
```

%% Cell type:code id: tags:

``` python
# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(padded_train)

train_labels = torch.tensor(labels_train)

train_masks = torch.tensor(attention_masks_train)
```

%% Cell type:code id: tags:

``` python
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

```
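
%% Cell type:markdown id: tags:

Optional sanity check on one batch (shapes should be `(batch_size, max_len)` for the inputs and masks):

%% Cell type:code id: tags:

``` python
# Peek at a single training batch to verify tensor shapes.
b_ids, b_mask, b_lbl = next(iter(train_dataloader))
print(b_ids.shape, b_mask.shape, b_lbl.shape)  # e.g. (16, 512), (16, 512), (16,)
```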

%% Cell type:markdown id: tags:

## Training

%% Cell type:code id: tags:

``` python
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.

if model_chosen == "bert":
  model = BertForSequenceClassification.from_pretrained(
      model_bert, # The 12-layer multilingual cased BERT model.
      num_labels = numberOfClasses, # One output label per class.
      output_attentions = False, # Whether the model returns attentions weights.
      output_hidden_states = False, # Whether the model returns all hidden-states.
  )
elif model_chosen == "camembert":
  model = CamembertForSequenceClassification.from_pretrained(
      model_bert, # The 12-layer CamemBERT model.
      num_labels = numberOfClasses, # One output label per class.
      output_attentions = False, # Whether the model returns attentions weights.
      output_hidden_states = False, # Whether the model returns all hidden-states.
  )

# Move the model to the device selected in the "Setup GPU" cell (CUDA, MPS or CPU).
model.to(device)
```

%% Cell type:code id: tags:

``` python
# Note: AdamW here is the class from the Hugging Face transformers library
# (as opposed to torch.optim); the 'W' stands for the 'weight decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
```

%% Cell type:code id: tags:

``` python
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
```

%% Cell type:code id: tags:

``` python
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so I can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 5 batches.
        if step % 5 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader.
        #
        # As I unpack the batch, I'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because I
        # have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # The call to `model` always returns a tuple, so I need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that I can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
```
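
%% Cell type:markdown id: tags:

Optional sketch: plot the learning curve from `loss_values` (matplotlib is imported in the "Import libraries" cell):

%% Cell type:code id: tags:

``` python
# Average training loss per epoch.
plt.plot(range(1, epochs + 1), loss_values, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Average training loss')
plt.title('Learning curve')
plt.show()
```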

%% Cell type:markdown id: tags:

## Saving model

%% Cell type:code id: tags:

``` python
name = model_bert + "_s" + str(maxOfInstancePerClass)
model_path = path + "model_"+name+".pt"
```

%% Cell type:code id: tags:

``` python
#torch.save(model, model_path)
```

%% Cell type:code id: tags:

``` python
model.save_pretrained(model_path)
# ludo: changed the way the model is saved (save_pretrained instead of torch.save)
```
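
%% Cell type:markdown id: tags:

Optional: keep the tokenizer next to the model weights so evaluation can reload both from the same directory (`save_pretrained` treats `model_path` as a folder):

%% Cell type:code id: tags:

``` python
# Save the tokenizer alongside the model (optional sketch).
tokenizer.save_pretrained(model_path)
```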

%% Cell type:markdown id: tags:

## Loading model

%% Cell type:code id: tags:

``` python
#model = torch.load(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
```

%% Cell type:markdown id: tags:

## Evaluation

%% Cell type:code id: tags:

``` python
def evaluate_bert(data, labels, model, batch_size):
  # Tokenize all of the sentences and map the tokens to their word IDs.
  input_ids = []
  # For every sentence...
  for sent in data:
      # `encode` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      encoded_sent = tokenizer.encode(
                          str(sent),                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                  )

      input_ids.append(encoded_sent)

  # Pad our input tokens
  padded = []
  for i in input_ids:

    if len(i) > max_len:
      padded.extend([i[:max_len]])
    else:
      padded.extend([i + [0] * (max_len - len(i))])
  input_ids = np.array(padded)

  # Create attention masks
  attention_masks = []

  # Create a mask of 1s for each token followed by 0s for padding
  for seq in input_ids:
      seq_mask = [float(i>0) for i in seq]
      attention_masks.append(seq_mask)

  # Convert to tensors.
  prediction_inputs = torch.tensor(input_ids)
  prediction_masks = torch.tensor(attention_masks)
  prediction_labels = torch.tensor(labels)

  # Create the DataLoader.
  prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

  # Put model in evaluation mode
  model.eval()

  # Tracking variables
  predictions , true_labels = [], []

  # Predict
  for batch in prediction_dataloader:
  # Add batch to GPU
      batch = tuple(t.to(device) for t in batch)

      # Unpack the inputs from the dataloader
      b_input_ids, b_input_mask, b_labels = batch

      # Telling the model not to compute or store gradients, saving memory and
      # speeding up prediction
      with torch.no_grad():
          # Forward pass, calculate logit predictions
          outputs = model(b_input_ids, token_type_ids=None,
                          attention_mask=b_input_mask)

      logits = outputs[0]
      #print(logits)

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      #print(logits)

      # Store predictions and true labels
      predictions.append(logits)
      true_labels.append(label_ids)

  print('    DONE.')


  pred_labels = []

  # Evaluate each test batch using several metrics
  print('Calculating the metrics for each batch...')

  for i in range(len(true_labels)):

    # The predictions for this batch are an ndarray with one column per class.
    # Pick the class with the highest logit for each example.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    pred_labels.append(pred_labels_i)


  pred_labels_ = [item for sublist in pred_labels for item in sublist]
  true_labels_ = [item for sublist in true_labels for item in sublist]

  return pred_labels_, true_labels_
```

%% Cell type:code id: tags:

``` python
dataset = "test"


df_eval = pd.read_csv(test_path, sep="\t")

data_eval = df_eval[columnText].values

y = df_eval[columnClass]



y = encoder.transform(y)
labels = y.tolist()


model_path = path+"/model_"+model_bert+"_s"+str(maxOfInstancePerClass)+".pt"
# The model was saved with save_pretrained, so reload it the same way
# (see the "Loading model" section above).
if model_bert == "bert-base-multilingual-cased":
  model = BertForSequenceClassification.from_pretrained(model_path).to(device)
  tokenizer = BertTokenizer.from_pretrained(model_bert)
elif model_bert == "camembert-base":
  model = CamembertForSequenceClassification.from_pretrained(model_path).to(device)
  tokenizer = CamembertTokenizer.from_pretrained(model_bert)

pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)


report = classification_report(true_labels_, pred_labels_,  output_dict = True)

classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_

precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in classes:
  precision.append(report[c]['precision'])
  recall.append(report[c]['recall'])
  f1.append(report[c]['f1-score'])
  support.append(report[c]['support'])

accuracy = report['accuracy']
weighted_avg = report['weighted avg']
cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN

print(name)

name = "test_"+ name
content = name + "\n"
print(name)
content += str(weighted_avg) + "\n"

print(weighted_avg)
print(accuracy)
print(dff)

dff.to_csv(path+"/report_"+name+".csv", index=False)
# save the predictions
pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(path+"/predictions/predictions_"+name+".csv")

with open(path+"reports/report_"+name+".txt", 'w') as f:
  f.write(content)
```
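
%% Cell type:markdown id: tags:

Optional visualisation sketch: the confusion matrix computed above as a heatmap (seaborn and matplotlib are imported in the "Import libraries" cell):

%% Cell type:code id: tags:

``` python
# Confusion matrix heatmap with class names on both axes.
plt.figure(figsize=(10, 8))
sns.heatmap(cnf_matrix, annot=True, fmt='d',
            xticklabels=classesName, yticklabels=classesName)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion matrix - ' + name)
plt.show()
```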

%% Cell type:code id: tags:

``` python
model_path = "drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt"
```

%% Cell type:code id: tags:

``` python
model = torch.load(model_path)
```

%% Cell type:code id: tags:

``` python
!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
```

%% Cell type:code id: tags:

``` python
df_LGE = pd.read_csv("LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values


#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
```

%% Cell type:code id: tags:

``` python
df_LGE.head()
```

%% Cell type:code id: tags:

``` python
df_LGE.shape
```

%% Cell type:code id: tags:

``` python
def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):

    if chosen_model == 'bert-base-multilingual-cased' :
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    )

        input_ids_test.append(encoded_sent)

    # Pad our input tokens
    padded_test = []
    for i in input_ids_test:

        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:

            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    #set batch size


    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader



def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):


    # Select the same kind of device as in the "Setup GPU" cell above.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # for MacOS
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print('We will use the GPU')
    else:
        device = torch.device("cpu")
        print('No GPU available, using the CPU instead.')

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test , true_labels = [], []
    pred_labels_ = []
    # Predict
    for batch in sentences_to_predict_dataloader:
    # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]
        #print(logits)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        #print(logits)

        # Store the logits for this batch
        predictions_test.append(logits)

    #print('    DONE.')

    # After the loop, pick for each example the class with the highest logit
    # (the logits array has one column per class).
    pred_labels = []
    for i in range(len(predictions_test)):
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_
```

%% Cell type:code id: tags:

``` python
data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)
#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)
```

%% Cell type:code id: tags:

``` python
p = predict_class_bertFineTuning( model, data_loader )
```

%% Cell type:code id: tags:

``` python
len(p)
```

%% Cell type:code id: tags:

``` python
# The encoder should be saved; otherwise it has to be rebuilt from the
# training set to recover the class names (see the pickle sketch in the next cell).
encoder
```
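
%% Cell type:markdown id: tags:

A sketch of persisting and reloading the encoder with pickle, mirroring the "Preprocessing" section (`path` and `encoder_filename` come from the Configuration cell):

%% Cell type:code id: tags:

``` python
# Save the fitted encoder...
with open(path + encoder_filename, 'wb') as file:
    pickle.dump(encoder, file)

# ...and reload it later without refitting on the training set.
with open(path + encoder_filename, 'rb') as file:
    encoder = pickle.load(file)
```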

%% Cell type:code id: tags:

``` python
p2 = list(encoder.inverse_transform(p))
```

%% Cell type:code id: tags:

``` python
p2
```

%% Cell type:code id: tags:

``` python
df_LGE['class_bert'] = p2
```

%% Cell type:code id: tags:

``` python
df_LGE.head()
```

%% Cell type:code id: tags:

``` python
df_LGE.to_csv("drive/MyDrive/Classification-EDdA/classification_LGE.tsv", sep="\t")
```