# BERT Predict classification

## 1. Setup the environment

### 1.1 Setup colab environment

#### 1.1.1 Install packages

In [None]:
!pip install transformers==4.10.3
!pip install sentencepiece

#### 1.1.2 Check the runtime RAM

In [None]:
from psutil import virtual_memory

# Total memory reported for this runtime, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
runtime_message = (
    'You are using a high-RAM runtime!'
    if ram_gb >= 20
    else 'Not using a high-RAM runtime'
)
print(runtime_message)

#### 1.1.3 Mount GoogleDrive

In [None]:
# Mount Google Drive when the notebook runs on Colab. On any other runtime the
# `google.colab` package is not importable, so the mount is skipped instead of
# stopping a full "Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available: skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

### 1.2 Setup GPU

In [1]:
import torch

# Pick the best available accelerator and fall back to the CPU.
if torch.cuda.is_available():
    # CUDA GPU: tell PyTorch to use it.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # `torch.backends.mps` is missing from older PyTorch builds; look it up
    # defensively so those installs fall through to the CPU branch instead of
    # raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
        # for MacOS
        device = torch.device("mps")
        print('We will use the GPU')
    else:
        device = torch.device("cpu")
        print('No GPU available, using the CPU instead.')



No GPU available, using the CPU instead.


### 1.3 Import libraries

In [12]:
import pandas as pd 
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, TextClassificationPipeline
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

import pickle 

## 2. Load Data

In [3]:
#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv
#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv
#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv

In [5]:
# --- Locations ---------------------------------------------------------------
# Root folder under which this notebook reads `models/` and writes `predictions/`.
# Colab alternative: "drive/MyDrive/Classification-EDdA/"
drive_path = "../"

# Folder that contains the input TSV file.
# NOTE(review): absolute, machine-specific path; adapt it before running elsewhere.
# Alternative: "/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/"
path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/"

# --- Dataset -----------------------------------------------------------------
# Prefix of the dataframe columns read and written below (e.g. 'lge-content').
# Alternative: ''
corpus = 'lge'

# Input file name, relative to `path`.
# Alternatives: "Parallel_datatset_articles_230215.tsv", "EDdA_dataset_articles.tsv"
filepath = 'LGE_dataset_articles_230314.tsv'

In [6]:
# Read the tab-separated article table, then preview its first rows.
tsv_location = path + filepath
df = pd.read_csv(tsv_location, sep="\t")
df.head()

Unnamed: 0,uid,lge-volume,lge-numero,lge-head,lge-page,lge-id,lge-content,lge-nbWords
0,lge_1_a-0,1,1,A,0,a-0,A(Ling.). Son vocal et première lettre de notr...,1761.0
1,lge_1_a-1,1,2,A,1,a-1,"A(Paléogr.). C’est à l’alphabet phénicien, on ...",839.0
2,lge_1_a-2,1,3,A,4,a-2,A(Log.). Cette voyelle désigne les proposition...,56.0
3,lge_1_a-3,1,4,A,4,a-3,A(Mus.). La lettre a est employée par les musi...,267.0
4,lge_1_a-4,1,5,A,4,a-4,"A(Numis.). Dans la numismatique grecque, la le...",67.0


In [16]:
# Raw article texts, taken from the '<corpus>-content' column as an array.
content_column = corpus + '-content'
dataset = df[content_column].values

## 3. Load model and predict

### 3.1 BERT / CamemBERT

In [7]:
# Pretrained checkpoint name; it also selects the tokenizer class further down.
# Alternative: "camembert-base"
model_name = "bert-base-multilingual-cased"

# Location of the classifier checkpoint, under `drive_path`.
# Alternative checkpoint: path + "models/model_" + model_name + "_s10000.pt"
model_path = f"{drive_path}models/model_{model_name}_s10000_superdomains.pt"

In [8]:
def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):
    """Encode texts into fixed-length id/mask tensors and wrap them in a DataLoader.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=True)`` that
            returns a list of token ids. Its ``pad_token_id`` attribute is used
            for padding when it exists and is not None; otherwise 0 is used.
        sentences: iterable of texts to encode.
        batch_size: number of rows per batch produced by the DataLoader.
        max_len: exact length of every encoded row (longer inputs are truncated,
            shorter ones are padded on the right).

    Returns:
        A DataLoader that yields ``(input_ids, attention_mask)`` batches in the
        same order as ``sentences``. ``input_ids`` is an int64 tensor and
        ``attention_mask`` a float tensor holding 1.0 for real tokens and 0.0
        for padding; both have ``max_len`` columns.

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    # Pad with the tokenizer's own padding id rather than assuming it is 0.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    id_rows, mask_rows = [], []
    for sent in sentences:
        # `encode` tokenizes the text, adds the special start/end tokens and
        # maps every token to its id.
        ids = tokenizer.encode(sent, add_special_tokens=True)

        if len(ids) > max_len:
            # Truncate the content but keep the closing special token, which a
            # plain `ids[:max_len]` slice would cut off.
            ids = ids[:max_len - 1] + ids[-1:]

        n_real = len(ids)
        n_pad = max_len - n_real
        id_rows.append(ids + [pad_id] * n_pad)
        # Build the mask from the known lengths instead of testing `id > 0`,
        # so it stays correct whatever ids the vocabulary uses.
        mask_rows.append([1.0] * n_real + [0.0] * n_pad)

    # The reshape keeps a (0, max_len) shape when `sentences` is empty.
    inputs = torch.tensor(id_rows, dtype=torch.long).reshape(-1, max_len)
    masks = torch.tensor(mask_rows, dtype=torch.float).reshape(-1, max_len)

    # Sequential sampling keeps predictions aligned with the input order.
    tensor_data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(tensor_data)

    return DataLoader(tensor_data, sampler=prediction_sampler, batch_size=batch_size)



def predict(model, dataloader, device):
    """Run the model over every batch and return the predicted class indices.

    Args:
        model: callable module accepting ``(input_ids, token_type_ids=None,
            attention_mask=...)`` whose first output element holds the logits.
        dataloader: iterable of ``(input_ids, attention_mask)`` tensor batches.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A flat list with one predicted class index (numpy integer) per input
        row, in dataloader order. The list is empty for an empty dataloader.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    for batch in dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

        # No gradients are needed for prediction; skipping them saves memory
        # and speeds up the forward pass.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a numpy array.
        logits = outputs[0].detach().cpu().numpy()

        # Reduce each batch exactly once: the index of the highest logit of
        # every row is the predicted class.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_



#https://discuss.huggingface.co/t/i-have-trained-my-classifier-now-how-do-i-do-predictions/3625/3


In [9]:
# Load the tokenizer class that matches the selected checkpoint name.
if model_name == 'bert-base-multilingual-cased' :
    print('Loading Bert Tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base':
    print('Loading Camembert Tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
else:
    # Fail here rather than leaving `tokenizer` undefined (or stale from a
    # previous run of this cell) for the cells below.
    raise ValueError(
        "Unsupported model_name %r: expected 'bert-base-multilingual-cased' "
        "or 'camembert-base'" % (model_name,)
    )

Loading Bert Tokenizer...



https://discuss.huggingface.co/t/an-efficient-way-of-loading-a-model-that-was-saved-with-torch-save/9814

https://github.com/huggingface/transformers/issues/2094


In [11]:
# Load the sequence-classification checkpoint stored at `model_path`, then move
# the module to the selected device type.
# NOTE(review): the Bert class is used whatever `model_name` is set to; confirm
# this is intended when "camembert-base" is selected.
model = BertForSequenceClassification.from_pretrained(model_path)
model = model.to(device.type)

In [10]:
#data_loader = generate_dataloader(tokenizer, data)

Token indices sequence length is longer than the specified maximum sequence length for this model (3408 > 512). Running this sequence through the model will result in indexing errors


In [12]:
#pred = predict(model, data_loader, device)

In [32]:
# https://huggingface.co/docs/transformers/main_classes/pipelines

def data(): #TODO :
    """Yield every entry of the global `dataset`, formatted as a string."""
    for article in dataset:
        yield f"{article}"

# Classification pipeline configured to return the score of every label.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=device,
)

# Tokenizer options forwarded on each pipeline call: pad, and truncate to 512 tokens.
# https://stackoverflow.com/questions/67849833/how-to-truncate-input-in-the-huggingface-pipeline
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)

In [75]:
# Number of best-scoring labels kept per article.
TOP_K = 3
# Only the first rows are printed, to avoid flooding the cell output.
PREVIEW_ROWS = 6
# Optional cap for a quick smoke test. Keep it at None for a real run: the
# cells below assign the predictions to `df`, which needs one row per article.
MAX_ROWS = None
# Labels are parsed by dropping this prefix and reading the rest as the class
# index (presumably the default `LABEL_<n>` naming; confirm against the model config).
LABEL_PREFIX = "LABEL_"

pred = []
for out in pipe(data(), **tokenizer_kwargs):
    # Rank the labels of this article by decreasing score and keep the best ones.
    ranked = sorted(out, key=lambda d: d['score'], reverse=True)[:TOP_K]

    # Flatten to [class1, score1, class2, score2, class3, score3].
    row = []
    for entry in ranked:
        row.extend([int(entry['label'][len(LABEL_PREFIX):]), entry['score']])

    if len(pred) < PREVIEW_ROWS:
        print(*row)
    pred.append(row)

    if MAX_ROWS is not None and len(pred) >= MAX_ROWS:
        break

pred = np.array(pred)

13 0.9375858902931213 2 0.02119206078350544 7 0.012656883336603642
6 0.9926056861877441 7 0.0029342961497604847 8 0.0010190310422331095
13 0.9823671579360962 2 0.004123885650187731 1 0.0022031611297279596
10 0.9058954119682312 2 0.029458750039339066 7 0.014979332685470581
7 0.9861114025115967 2 0.003949114587157965 6 0.0015271356096491218
4 0.9868665933609009 5 0.0021403145510703325 15 0.0018120042514055967


In [34]:
# Restore the label encoder used below to turn class indices back into labels.
# Only unpickle files you trust: `pickle.load` can execute arbitrary code.
# Alternative encoder: "models/label_encoder.pkl"
encoder_filename = "models/label_encoder_superdomains.pkl"
encoder_path = drive_path + encoder_filename
with open(encoder_path, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [72]:
# Columns 0, 2 and 4 of `pred` hold the class indices ranked 1st, 2nd and 3rd;
# decode each of them back to its label with the encoder.
pred1, pred2, pred3 = [
    list(encoder.inverse_transform(pred[:, rank_col].astype(int)))
    for rank_col in (0, 2, 4)
]


In [73]:
#print(pred1)
#print(pred[:,1])

['Philosophie', 'Géographie', 'Philosophie', 'Musique', 'Histoire', 'Commerce']
[0.93758589 0.99260569 0.98236716 0.90589541 0.9861114  0.98686659]


In [16]:
# Add one label column and one score column per rank (1 to 3). The score of
# rank r sits in column 2*r - 1 of `pred`.
for rank, labels in enumerate((pred1, pred2, pred3), start=1):
    df[corpus + '-superdomainPred' + str(rank)] = labels
    df[corpus + '-superdomainProba' + str(rank)] = pred[:, 2 * rank - 1]

In [18]:
# Preview the first ten rows of the dataframe.
df.head(10)

Unnamed: 0,uid,lge-volume,lge-numero,lge-head,lge-page,lge-id,lge-content,lge-nbWords,lge-superdomainBert
0,lge_1_a-0,1,1,A,0,a-0,A(Ling.). Son vocal et première lettre de notr...,1761.0,Philosophie
1,lge_1_a-1,1,2,A,1,a-1,"A(Paléogr.). C’est à l’alphabet phénicien, on ...",839.0,Géographie
2,lge_1_a-2,1,3,A,4,a-2,A(Log.). Cette voyelle désigne les proposition...,56.0,Philosophie
3,lge_1_a-3,1,4,A,4,a-3,A(Mus.). La lettre a est employée par les musi...,267.0,Musique
4,lge_1_a-4,1,5,A,4,a-4,"A(Numis.). Dans la numismatique grecque, la le...",67.0,Histoire
5,lge_1_aa-0,1,6,AA,4,aa-0,AA. Ces deux lettres désignent l’atelier monét...,14.0,Commerce
6,lge_1_aa-1,1,7,AA,4,aa-1,AA. Nom de plusieurs cours d’eau de l’Europe o...,75.0,Géographie
7,lge_1_aa-2,1,8,AA,5,aa-2,"AA. Rivière de France, prend sa source aux Tro...",165.0,Géographie
8,lge_1_aa-3,1,9,AA,5,aa-3,"AA. Rivière de Hollande, affluent de la Dommel...",17.0,Géographie
9,lge_1_aa-4,1,10,AA,5,aa-4,AA. Nom de deux fleuves de la Russie. Le premi...,71.0,Géographie


In [19]:
# Write the dataframe as a tab-separated file, without the index.
# Previous export target: drive_path + "predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv"
output_tsv = drive_path + "predictions/LGE_dataset_articles_superdomainBERT_230321.tsv"
df.to_csv(output_tsv, sep="\t", index=False)

In [20]:
#df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)

In [21]:
# Rows whose top-ranked predicted label is 'Géographie'. The filter must use
# the label column (`-superdomainPred1`): the `-superdomainProba1` column holds
# scores, so comparing it to a label string never matches.
df.loc[df[corpus+'-superdomainPred1'] == 'Géographie']

Unnamed: 0,uid,lge-volume,lge-numero,lge-head,lge-page,lge-id,lge-content,lge-nbWords,lge-superdomainBert
1,lge_1_a-1,1,2,A,1,a-1,"A(Paléogr.). C’est à l’alphabet phénicien, on ...",839.0,Géographie
6,lge_1_aa-1,1,7,AA,4,aa-1,AA. Nom de plusieurs cours d’eau de l’Europe o...,75.0,Géographie
7,lge_1_aa-2,1,8,AA,5,aa-2,"AA. Rivière de France, prend sa source aux Tro...",165.0,Géographie
8,lge_1_aa-3,1,9,AA,5,aa-3,"AA. Rivière de Hollande, affluent de la Dommel...",17.0,Géographie
9,lge_1_aa-4,1,10,AA,5,aa-4,AA. Nom de deux fleuves de la Russie. Le premi...,71.0,Géographie
...,...,...,...,...,...,...,...,...,...
134800,lge_31_zvornix-0,31,7757,ZVORNIX,1370,zvornix-0,"ZVORNIX. Ville de Bosnie, sur la r. g. de la D...",27.0,Géographie
134801,lge_31_zweibrücken-0,31,7758,ZWEIBRÜCKEN,1370,zweibrücken-0,ZWEIBRÜCKEN. Ville de Bavière (V. Deux-Ponts).\n,6.0,Géographie
134803,lge_31_zwickau-0,31,7760,ZWICKAU,1370,zwickau-0,"ZWICKAU. Ville de Saxe, ch.-l. d’un cercle, su...",92.0,Géographie
134806,lge_31_zwolle-0,31,7763,ZWOLLE,1371,zwolle-0,"ZWOLLE. Ville des Pays-Bas, ch.-l. de la prov....",115.0,Géographie


In [22]:
# (rows, columns) of the dataframe.
df.shape

(134820, 9)