Update Classification_BertFineTuning.ipynb

90192d15 · Ludovic Moncla · 13110f08 · 90192d15
Commit 90192d15 authored 2 years ago by Ludovic Moncla
--- a/notebooks/Classification_BertFineTuning.ipynb
+++ b/notebooks/Classification_BertFineTuning.ipynb
@@ -223,8 +223,8 @@
      },
      "outputs": [],
      "source": [
-        "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n",
+        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv\n",
-        "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv"
+        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv"
      ]
    },
    {
@@ -1553,7 +1553,7 @@
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:01:00) \n[Clang 13.0.1 ]"
    },
    "vscode": {
      "interpreter": {

 %% Cell type:markdown id: tags:
 # BERT fine-tuning for EDdA classification
 %% Cell type:markdown id: tags:
 ## Setup colab environment
 %% Cell type:code id: tags:
 ``` python
 from psutil import virtual_memory
 # Total memory reported for this runtime, converted from bytes to decimal GB.
 ram_gb = virtual_memory().total / 1e9
 print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
 # 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
 print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')
 ```
 %% Cell type:code id: tags:
 ``` python
 from google.colab import drive
 drive.mount('/content/drive')
 ```
 %% Cell type:markdown id: tags:
 ## Setup GPU
 %% Cell type:code id: tags:
 ``` python
 import torch
 # Select the compute device once; later cells send the batches to `device`.
 # `torch.backends.mps` is missing from PyTorch builds that predate the MPS
 # backend, so look it up defensively instead of raising AttributeError there.
 _mps_backend = getattr(torch.backends, "mps", None)
 if torch.cuda.is_available():
     # NVIDIA GPU.
     device = torch.device("cuda")
     print('There are %d GPU(s) available.' % torch.cuda.device_count())
     print('We will use the GPU:', torch.cuda.get_device_name(0))
 elif _mps_backend is not None and _mps_backend.is_available() and _mps_backend.is_built():
     # for MacOS
     device = torch.device("mps")
     print('We will use the GPU')
 else:
     device = torch.device("cpu")
     print('No GPU available, using the CPU instead.')
 ```
 %% Cell type:markdown id: tags:
 ## Install packages
 %% Cell type:code id: tags:
 ``` python
 !pip install transformers==4.10.3
 !pip install sentencepiece
 ```
 %% Cell type:markdown id: tags:
 ## Import libraries
 %% Cell type:code id: tags:
 ``` python
 import pandas as pd
 import numpy as np
 import csv
 import os
 import pickle
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import *
 from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from transformers import get_linear_schedule_with_warmup
 import time
 import datetime
 import random
 import matplotlib.pyplot as plt
 from sklearn.metrics import plot_confusion_matrix
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import classification_report
 import seaborn as sns
 ```
 %% Cell type:markdown id: tags:
 ## Utility functions
 %% Cell type:code id: tags:
 ``` python
 def resample_classes(df, classColumnName, numberOfInstances):
     """Randomly cap every class of `df` at `numberOfInstances` rows.
     Rows are grouped by `classColumnName`; from each group at most
     `numberOfInstances` row labels are drawn at random without replacement
     (a smaller group keeps all of its rows, in shuffled order).
     Returns the result of `groupby(...).apply(...)` over those samples.
     """
     def _draw_rows(group):
         # Never ask for more rows than the group actually holds.
         sample_size = min(len(group), numberOfInstances)
         # Third argument False => sampling WITHOUT replacement.
         chosen = np.random.choice(group.index, sample_size, False)
         return group.loc[chosen, :]
     return df.groupby(classColumnName, as_index=False).apply(_draw_rows)
 # Fraction of rows whose highest-scoring class equals the reference label.
 def flat_accuracy(preds, labels):
     """Return the share of correct arg-max predictions.
     `preds` holds one row of class scores per example (arg-max is taken
     along axis 1); `labels` holds the reference class ids and is flattened
     before the comparison.
     """
     predicted = np.argmax(preds, axis=1).flatten()
     expected = labels.flatten()
     hits = np.sum(predicted == expected)
     return hits / len(expected)
 def format_time(elapsed):
     """Format a duration in seconds as an ``h:mm:ss`` string.
     The value is rounded to the nearest whole second first; the text is the
     standard string form of `datetime.timedelta`.
     """
     whole_seconds = int(round(elapsed))
     duration = datetime.timedelta(seconds=whole_seconds)
     return str(duration)
 ```
 %% Cell type:markdown id: tags:
 ## Load Data
 %% Cell type:code id: tags:
 ``` python
-!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv
+!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv
-!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv
+!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv
 ```
 %% Cell type:markdown id: tags:
 ### Loading dataset
 %% Cell type:code id: tags:
 ``` python
 train_path = '../data/training_set.tsv'
 test_path =  '../data/test_set.tsv'
 ```
 %% Cell type:code id: tags:
 ``` python
 df_train = pd.read_csv(train_path, sep="\t")
 df_train.head()
 ```
 %% Cell type:code id: tags:
 ``` python
 print(df_train.shape)
 ```
 %% Cell type:markdown id: tags:
 ## Configuration
 %% Cell type:code id: tags:
 ``` python
 # Column of the TSV holding the text fed to the model.
 columnText = 'contentWithoutClass'
 # Column of the TSV holding the class label to predict.
 columnClass = 'ensemble_domaine_enccre'
 # Cap on rows kept per class; the preprocessing cell only resamples when this
 # differs from 10000.
 maxOfInstancePerClass = 10000
 # Model family: "bert" or "camembert" (see the tokenisation cells below).
 model_chosen = "bert"
 #model_chosen = "camembert"
 batch_size = 16  # 16 or 32 recommended
 # Every token-id sequence is truncated or zero-padded to this length.
 max_len = 512
 # Folder prefix used for the label encoder, the saved model and the reports.
 #path = "drive/MyDrive/Classification-EDdA/"
 path = "../models/new/"
 # File name (inside `path`) of the pickled label encoder.
 encoder_filename = "label_encoder.pkl"
 ```
 %% Cell type:markdown id: tags:
 ## Preprocessing
 %% Cell type:code id: tags:
 ``` python
 # 10000 acts as the "keep the full training set" setting: classes are only
 # down-sampled when a different cap was configured.
 if maxOfInstancePerClass != 10000:
  df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)
 ```
 %% Cell type:code id: tags:
 ``` python
 labels = df_train[columnClass]
 encoder_path = path + encoder_filename
 if os.path.isfile(encoder_path):
     # Reuse the encoder written by an earlier run so that class ids stay
     # identical across runs.
     # NOTE: pickle.load can execute arbitrary code; only load a file that this
     # notebook produced itself.
     with open(encoder_path, 'rb') as fh:
         encoder = pickle.load(fh)
 else:
     encoder = preprocessing.LabelEncoder()
     encoder.fit(labels)
     # The output folder may not exist yet; create it before writing.
     os.makedirs(os.path.dirname(encoder_path) or ".", exist_ok=True)
     with open(encoder_path, 'wb') as fh:
         pickle.dump(encoder, fh)
 # Size the classification head from the encoder, not from the data: a reloaded
 # encoder may know more classes than this training set contains, and its ids
 # run up to len(encoder.classes_) - 1.
 numberOfClasses = len(encoder.classes_)
 labels = encoder.transform(labels)
 ```
 %% Cell type:code id: tags:
 ``` python
 sentences_train = df_train[columnText].values
 labels_train = labels.tolist()
 ```
 %% Cell type:code id: tags:
 ``` python
 sentences_train
 ```
 %% Cell type:markdown id: tags:
 # Model
 ## Tokenisation & Input Formatting
 %% Cell type:code id: tags:
 ``` python
 # Checkpoint names used to load the tokenizer and the pretrained weights.
 if model_chosen == "bert":
     tokeniser_bert = 'bert-base-multilingual-cased'
     model_bert = "bert-base-multilingual-cased"
 elif model_chosen == "camembert":
     tokeniser_bert = 'camembert-base'
     model_bert = 'camembert-base'
 else:
     # Fail here with a clear message rather than with a NameError in a later
     # cell that reads tokeniser_bert / model_bert.
     raise ValueError("model_chosen must be 'bert' or 'camembert', got {!r}".format(model_chosen))
 ```
 %% Cell type:code id: tags:
 ``` python
 # Load the tokenizer that matches the selected model family. For any other
 # value of model_chosen neither branch runs and `tokenizer` is left unset.
 if model_chosen == "bert":
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)
 elif model_chosen == "camembert":
  print('Loading CamemBERT tokenizer...')
  tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Turn every training text into a list of token ids. `encode` is called with
 # add_special_tokens=True so the tokenizer adds its special tokens around the
 # text. Each value goes through str() first, so non-string cells are encoded
 # as their text form.
 # No truncation or padding happens here; sequences keep their own length and
 # are brought to a common size in a later step.
 input_ids_train = [
     tokenizer.encode(str(text), add_special_tokens=True)
     for text in sentences_train
 ]
 ```
 %% Cell type:code id: tags:
 ``` python
 print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))
 ```
 %% Cell type:code id: tags:
 ``` python
 # Bring every id sequence to exactly max_len entries: longer sequences lose
 # their tail, shorter ones are right-filled with id 0.
 # NOTE(review): cutting at max_len also removes whatever special token was
 # appended at the end of an over-long sequence.
 padded_train = [
     ids[:max_len] if len(ids) > max_len else ids + [0] * (max_len - len(ids))
     for ids in input_ids_train
 ]
 padded_train = input_ids_train = np.array(padded_train)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Build one attention mask per padded sequence: 1 where the token id is
 # greater than 0 (treated as a real token), 0 where it is 0 (padding).
 attention_masks_train = [
     [int(token_id > 0) for token_id in row]
     for row in padded_train
 ]
 ```
 %% Cell type:code id: tags:
 ``` python
 # Use 70% for training and 30% for validation.
 #train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
 #                                                            random_state=2018, test_size=0.3, stratify = labels)
 # Do the same for the masks.
 #train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
 #                                             random_state=2018, test_size=0.3, stratify = labels)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Convert all inputs and labels into torch tensors, the required datatype
 # for my model.
 train_inputs = torch.tensor(padded_train)
 train_labels = torch.tensor(labels_train)
 train_masks = torch.tensor(attention_masks_train)
 ```
 %% Cell type:code id: tags:
 ``` python
 # The DataLoader needs to know the batch size for training, so I specify it here.
 # For fine-tuning BERT on a specific task, the authors recommend a batch size of
 # 16 or 32.
 # Create the DataLoader for training set.
 train_data = TensorDataset(train_inputs, train_masks, train_labels)
 train_sampler = RandomSampler(train_data)
 train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 ```
 %% Cell type:markdown id: tags:
 ## Training
 %% Cell type:code id: tags:
 ``` python
 # Load the pretrained model with a sequence-classification head on top, using
 # the class that matches the selected model family.
 if model_chosen == "bert":
     model_class = BertForSequenceClassification
 elif model_chosen == "camembert":
     model_class = CamembertForSequenceClassification
 else:
     raise ValueError("model_chosen must be 'bert' or 'camembert', got {!r}".format(model_chosen))
 model = model_class.from_pretrained(
     model_bert,                     # Checkpoint name selected earlier.
     num_labels=numberOfClasses,     # One output per class.
     output_attentions=False,        # Whether the model returns attentions weights.
     output_hidden_states=False,     # Whether the model returns all hidden-states.
 )
 # Move the model to the device chosen in the "Setup GPU" cell. The training
 # loop sends every batch to `device`, so a hard-coded "mps" would fail on any
 # machine where `device` is not the Apple GPU.
 model.to(device)
 ```
 %% Cell type:code id: tags:
 ``` python
 #Note: AdamW is a class from the huggingface library (as opposed to pytorch)
 # I believe the 'W' stands for 'Weight Decay fix"
 optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
 ```
 %% Cell type:code id: tags:
 ``` python
 # Number of training epochs (authors recommend between 2 and 4)
 epochs = 4
 # Total number of training steps is number of batches * number of epochs.
 total_steps = len(train_dataloader) * epochs
 # Create the learning rate scheduler.
 scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
 ```
 %% Cell type:code id: tags:
 ``` python
 # This training code is based on the `run_glue.py` script here:
 # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
 # Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value to
 # make runs repeatable.
 seed_val = 42
 random.seed(seed_val)
 np.random.seed(seed_val)
 torch.manual_seed(seed_val)
 torch.cuda.manual_seed_all(seed_val)
 # Average training loss of each epoch, kept for plotting.
 loss_values = []
 # For each epoch...
 for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Start time of this epoch, used for the progress and summary messages.
    t0 = time.time()
    # Running sum of the batch losses for this epoch.
    total_loss = 0
    # Put the model into training mode.
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 5 batches (never on the very first one).
        if step % 5 == 0 and not step == 0:
            # Time elapsed since the epoch started, formatted as h:mm:ss.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack this training batch and copy each tensor to `device`.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Clear the gradients left over from the previous step; PyTorch
        # accumulates them across backward passes otherwise.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()
        # Forward pass on this batch. `labels` is passed so that the model
        # also computes the loss.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)
        # The loss is the first element of the model output.
        loss = outputs[0]
        # Accumulate the loss as a plain Python number (`.item()`) so the
        # epoch average can be computed afterwards.
        total_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Apply the parameter update computed from the clipped gradients.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Average loss over all batches of this epoch.
    # NOTE(review): this divides by zero if train_dataloader yields no batch.
    avg_train_loss = total_loss / len(train_dataloader)
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
 print("")
 print("Training complete!")
 ```
 %% Cell type:markdown id: tags:
 ## Saving model
 %% Cell type:code id: tags:
 ``` python
 name = model_bert + "_s" + str(maxOfInstancePerClass)
 model_path = path + "model_"+name+".pt"
 ```
 %% Cell type:code id: tags:
 ``` python
 #torch.save(model, model_path)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Save through the model's own save_pretrained() rather than the torch.save()
 # call that is commented out in the previous cell.
 model.save_pretrained(model_path)
 # ludo: changed the way the model is saved
 ```
 %% Cell type:markdown id: tags:
 ## Loading model
 %% Cell type:code id: tags:
 ``` python
 #model = torch.load(model_path)
 # Reload the fine-tuned weights with the class that matches the selected model
 # family, and place them on the device chosen in the "Setup GPU" cell instead
 # of a hard-coded "mps".
 _saved_model_class = CamembertForSequenceClassification if model_chosen == "camembert" else BertForSequenceClassification
 model = _saved_model_class.from_pretrained(model_path).to(device)
 ```
 %% Cell type:markdown id: tags:
 ## Evaluation
 %% Cell type:code id: tags:
 ``` python
 def evaluate_bert(data, labels, model, batch_size):
     """Predict a class id for every text in `data` and pair it with its label.
     Relies on the notebook-level `tokenizer`, `max_len` and `device`.
     `data` is an iterable of texts (each passed through str()), `labels` the
     matching class ids, `model` a callable taking input ids and an attention
     mask, and `batch_size` the number of texts scored per forward pass.
     Returns `(predicted_ids, reference_ids)` as two flat lists in input order.
     """
     # Encode each text into token ids, special tokens included.
     encoded = [tokenizer.encode(str(text), add_special_tokens=True) for text in data]
     # Force every sequence to max_len: truncate long ones, zero-fill short ones.
     fixed_length = [
         ids[:max_len] if len(ids) > max_len else ids + [0] * (max_len - len(ids))
         for ids in encoded
     ]
     input_ids = np.array(fixed_length)
     # Mask value is 1.0 for ids greater than 0 and 0.0 for the zero padding.
     attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]
     # Convert to tensors.
     prediction_inputs = torch.tensor(input_ids)
     prediction_masks = torch.tensor(attention_masks)
     prediction_labels = torch.tensor(labels)
     # Sequential sampling keeps the predictions aligned with the input order.
     prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
     prediction_dataloader = DataLoader(
         prediction_data,
         sampler=SequentialSampler(prediction_data),
         batch_size=batch_size,
     )
     print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
     # Put model in evaluation mode
     model.eval()
     predictions, true_labels = [], []
     for batch in prediction_dataloader:
         # Copy the three tensors of the batch to the compute device.
         b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
         # No gradients are needed for prediction.
         with torch.no_grad():
             outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
         # First output element holds the scores; bring scores and labels back
         # to the CPU as NumPy arrays.
         predictions.append(outputs[0].detach().cpu().numpy())
         true_labels.append(b_labels.to('cpu').numpy())
     print('    DONE.')
     print('Calculating the matrics for each batch...')
     # For each batch keep the index of the highest score per row, then flatten
     # the per-batch results into one list.
     pred_labels_ = [
         label
         for batch_scores in predictions
         for label in np.argmax(batch_scores, axis=1).flatten()
     ]
     true_labels_ = [label for batch_labels in true_labels for label in batch_labels]
     return pred_labels_, true_labels_
 ```
 %% Cell type:code id: tags:
 ``` python
 dataset = "test"
 df_eval = pd.read_csv(dataset+"_set.tsv", sep="\t")
 data_eval = df_eval[columnText].values
 # Encode the reference labels with the training encoder so that the ids are
 # the ones the model was trained on.
 y = df_eval[columnClass]
 y = encoder.transform(y)
 labels = y.tolist()
 model_path = os.path.join(path, "model_"+model_bert+"_s"+str(maxOfInstancePerClass)+".pt")
 # The model is saved with save_pretrained(), so it has to be reloaded with
 # from_pretrained(); torch.load() cannot read that output. The model is also
 # moved to `device`, where evaluate_bert() sends its batches.
 if model_bert == "camembert-base":
     model = CamembertForSequenceClassification.from_pretrained(model_path).to(device)
 else:
     model = BertForSequenceClassification.from_pretrained(model_path).to(device)
 if model_bert == "bert-base-multilingual-cased":
     tokenizer = BertTokenizer.from_pretrained(model_bert)
 elif model_bert == "camembert-base":
     tokenizer = CamembertTokenizer.from_pretrained(model_bert)
 pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
 classesName = encoder.classes_
 # Report on every class known to the encoder, even those that are absent from
 # the test labels and predictions. Without an explicit `labels` list such a
 # class is missing from the report (KeyError below) and the confusion matrix
 # no longer lines up with classesName.
 class_ids = list(range(len(classesName)))
 classes = [str(e) for e in class_ids]
 report = classification_report(true_labels_, pred_labels_, labels=class_ids, output_dict=True)
 precision = [report[c]['precision'] for c in classes]
 recall = [report[c]['recall'] for c in classes]
 f1 = [report[c]['f1-score'] for c in classes]
 support = [report[c]['support'] for c in classes]
 # Computed directly rather than read from the report, whose summary entries
 # can differ once an explicit `labels` list is passed. accuracy_score comes
 # from the notebook's `from sklearn.metrics import *`.
 accuracy = accuracy_score(true_labels_, pred_labels_)
 weighted_avg = report['weighted avg']
 cnf_matrix = confusion_matrix(true_labels_, pred_labels_, labels=class_ids)
 # Per-class counts derived from the confusion matrix (rows: true class,
 # columns: predicted class).
 TP = np.diag(cnf_matrix)
 FP = cnf_matrix.sum(axis=0) - TP
 FN = cnf_matrix.sum(axis=1) - TP
 TN = cnf_matrix.sum() - (FP + FN + TP)
 dff = pd.DataFrame({
     'className': classesName,
     'precision': precision,
     'recall': recall,
     'f1-score': f1,
     'support': support,
     'FP': FP,
     'FN': FN,
     'TP': TP,
     'TN': TN,
 })
 print(name)
 # Use a separate variable so that re-running this cell does not keep adding
 # "test_" prefixes to `name`.
 report_name = dataset + "_" + name
 content = report_name + "\n"
 print(report_name)
 content += str(weighted_avg) + "\n"
 print(weighted_avg)
 print(accuracy)
 print(dff)
 # Create the output folders before writing into them.
 predictions_dir = os.path.join(path, "predictions")
 reports_dir = os.path.join(path, "reports")
 os.makedirs(predictions_dir, exist_ok=True)
 os.makedirs(reports_dir, exist_ok=True)
 dff.to_csv(os.path.join(path, "report_"+report_name+".csv"), index=False)
 # Save the predictions next to the reference labels.
 pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(os.path.join(predictions_dir, "predictions_"+report_name+".csv"))
 with open(os.path.join(reports_dir, "report_"+report_name+".txt"), 'w') as f:
     f.write(content)
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 model_path = "drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt"
 ```
 %% Cell type:code id: tags:
 ``` python
 model = torch.load(model_path)
 ```
 %% Cell type:code id: tags:
 ``` python
 !wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE = pd.read_csv("LGE_withContent.tsv", sep="\t")
 data_LGE = df_LGE["content"].values
 #pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE.head()
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE.shape
 ```
 %% Cell type:code id: tags:
 ``` python
 def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):
     """Tokenise raw texts and wrap them in a sequential DataLoader.
     Args:
         chosen_model: 'bert-base-multilingual-cased' or 'camembert-base';
             selects which tokenizer is loaded.
         sentences_to_predict: iterable of texts; each item is passed through
             str() first, as in training and evaluation.
         batch_size: number of texts per batch.
         max_len: every id sequence is truncated or zero-padded to this length.
     Returns:
         A DataLoader that yields `(input_ids, attention_mask)` batches in
         input order.
     Raises:
         ValueError: if `chosen_model` is not one of the two supported names.
     """
     if chosen_model == 'bert-base-multilingual-cased':
         print('Loading Bert Tokenizer...')
         tokenizer = BertTokenizer.from_pretrained(chosen_model)
     elif chosen_model == 'camembert-base':
         print('Loading Camembert Tokenizer...')
         tokenizer = CamembertTokenizer.from_pretrained(chosen_model)
     else:
         # Previously this fell through and failed later with UnboundLocalError.
         raise ValueError(
             "Unsupported model {!r}: expected 'bert-base-multilingual-cased' "
             "or 'camembert-base'".format(chosen_model)
         )
     # Encode each text into token ids, special tokens included.
     encoded = [
         tokenizer.encode(str(sent), add_special_tokens=True)
         for sent in sentences_to_predict
     ]
     # Truncate long sequences and right-fill short ones with id 0, the same
     # scheme as the training cells.
     padded_test = [
         ids[:max_len] if len(ids) > max_len else ids + [0] * (max_len - len(ids))
         for ids in encoded
     ]
     input_ids_test = np.array(padded_test)
     # Mask value is 1.0 for ids greater than 0 and 0.0 for the zero padding.
     attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids_test]
     # Convert to tensors.
     prediction_inputs = torch.tensor(input_ids_test)
     prediction_masks = torch.tensor(attention_masks)
     # Sequential sampling keeps the batches in input order.
     prediction_data = TensorDataset(prediction_inputs, prediction_masks)
     prediction_sampler = SequentialSampler(prediction_data)
     return DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
 def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
     """Return the predicted class id of every example in the dataloader.
     Args:
         model: callable taking `(input_ids, token_type_ids=None,
             attention_mask=...)` whose first output element holds one row of
             class scores per example.
         sentences_to_predict_dataloader: iterable of `(input_ids,
             attention_mask)` tensor batches.
     Returns:
         A flat list with one arg-max class id per example, in iteration
         order; an empty list when the dataloader yields nothing.
     """
     # Run on the device that already holds the model's parameters, so inputs
     # and weights always sit together (CUDA, MPS or CPU). Fall back to the
     # CUDA-or-CPU choice when the model exposes no parameters.
     try:
         device = next(model.parameters()).device
     except (AttributeError, StopIteration):
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if device.type == "cuda":
         print('There are %d GPU(s) available.' % torch.cuda.device_count())
         print('We will use the GPU:', torch.cuda.get_device_name(0))
     else:
         print('Running predictions on device: {}'.format(device))
     # Put model in evaluation mode
     model.eval()
     pred_labels_ = []
     for batch in sentences_to_predict_dataloader:
         # Copy both tensors of the batch to the model's device.
         b_input_ids, b_input_mask = (t.to(device) for t in batch)
         # No gradients are needed for prediction.
         with torch.no_grad():
             outputs = model(b_input_ids, token_type_ids=None,
                             attention_mask=b_input_mask)
         # Move the scores to the CPU and keep the index of the highest score
         # per row. Each batch is reduced once, as soon as it is available.
         logits = outputs[0].detach().cpu().numpy()
         pred_labels_.extend(np.argmax(logits, axis=1).flatten())
     return pred_labels_
 ```
 %% Cell type:code id: tags:
 ``` python
 data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)
 #data_loader = generate_prediction_dataloader('camembert-base', data_LGE)
 ```
 %% Cell type:code id: tags:
 ``` python
 p = predict_class_bertFineTuning( model, data_loader )
 ```
 %% Cell type:code id: tags:
 ``` python
 len(p)
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 # The encoder should be saved; otherwise it has to be rebuilt from the
 # training set in order to recover the class names.
 encoder
 ```
 %% Cell type:code id: tags:
 ``` python
 p2 = list(encoder.inverse_transform(p))
 ```
 %% Cell type:code id: tags:
 ``` python
 p2
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE['class_bert'] = p2
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE.head()
 ```
 %% Cell type:code id: tags:
 ``` python
 df_LGE.to_csv("drive/MyDrive/Classification-EDdA/classification_LGE.tsv", sep="\t")
 ```