Skip to content
Snippets Groups Projects
Commit e6ada53c authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update Predict.ipynb

parent 3bba3205
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# BERT Predict classification
## 1. Setup the environment
### 1.1 Setup colab environment
#### 1.1.1 Install packages
%% Cell type:code id: tags:
``` python
# Notebook shell commands: install a pinned transformers release and
# sentencepiece (unpinned).
!pip install transformers==4.10.3
!pip install sentencepiece
```
%% Cell type:markdown id: tags:
#### 1.1.2 Use more RAM
%% Cell type:code id: tags:
``` python
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used to tell a high-RAM runtime from a standard one.
print(
    'You are using a high-RAM runtime!'
    if ram_gb >= 20
    else 'Not using a high-RAM runtime'
)
```
%% Cell type:markdown id: tags:
#### 1.1.3 Mount GoogleDrive
%% Cell type:code id: tags:
``` python
# Mount Google Drive at /content/drive (uses the google.colab package).
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### 1.2 Setup GPU
%% Cell type:code id: tags:
``` python
import torch

# Select the compute device once; later cells pass `device` to predict().
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # for MacOS
    # Older PyTorch builds have no `torch.backends.mps` attribute at all, so
    # look it up defensively instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if (
        mps_backend is not None
        and mps_backend.is_available()
        and mps_backend.is_built()
    ):
        device = torch.device("mps")
        print('We will use the GPU')
    else:
        device = torch.device("cpu")
        print('No GPU available, using the CPU instead.')
```
%% Output
We will use the GPU
%% Cell type:markdown id: tags:
### 1.3 Import libraries
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
```
%% Cell type:markdown id: tags:
## 2. Load Data
%% Cell type:code id: tags:
``` python
# drive_path: base directory for the models/ and predictions/ folders used below.
# path: directory the input dataset is read from (machine-specific absolute path).
#drive_path = "drive/MyDrive/Classification-EDdA/"
drive_path = "../"
path = "/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/"
```
%% Cell type:code id: tags:
``` python
# Download a dataset into the current working directory.
# NOTE(review): the dataset actually read below is `path + filepath`
# (EDdA_dataset_articles.tsv), not the file fetched here - confirm which one
# is intended.
#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv
#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv
```
%% Cell type:code id: tags:
``` python
# Name of the TSV file to classify, resolved relative to `path`.
# The commented lines are alternative datasets.
#filepath = "data/LGE_withContent.tsv"
#filepath = "EDdA_dataset_articles_no_superdomain.tsv"
#filepath = "Parallel_datatset_articles_230215.tsv"
filepath = "EDdA_dataset_articles.tsv"
```
%% Cell type:code id: tags:
``` python
# Load the tab-separated dataset and preview its first rows.
df = pd.read_csv(path + filepath, sep="\t")
df.head()
```
%% Output
volume numero head author \
0 1 1 Title Page unsigned
1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert
2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert
3 1 5 A, a & a Dumarsais5
4 1 6 A Dumarsais5
edda_class enccre_id enccre_class \
0 unclassified NaN NaN
1 unclassified NaN NaN
2 unclassified NaN NaN
3 Grammaire v1-1-0 Grammaire
4 unclassified v1-1-1 Grammaire
content \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés...
content_without_designant \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés...
first_paragraph nb_words super_domain
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 151 Unclassified
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 208 Unclassified
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n 44669 Unclassified
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 711 Philosophie
4 \nA, mot, est 1. la troisieme personne du prés... 238 Unclassified
%% Cell type:code id: tags:
``` python
# `corpus` is a column-name suffix: '' reads the 'content' column, 'LGE' would
# read 'contentLGE'. The same suffix names the prediction column further down.
#corpus = 'LGE'
corpus = ''
# Array of article texts to classify.
data = df['content'+corpus].values
```
%% Cell type:markdown id: tags:
## 3. Load model and predict
### 3.1 BERT / CamemBERT
%% Cell type:code id: tags:
``` python
# Pretrained architecture name; it also selects the tokenizer loaded below.
model_name = "bert-base-multilingual-cased"
#model_name = "camembert-base"
# Location of the fine-tuned checkpoint (super-domain variant), under drive_path.
#model_path = path + "models/model_" + model_name + "_s10000.pt"
model_path = drive_path + "models/model_" + model_name + "_s10000_superdomains.pt"
```
%% Cell type:code id: tags:
``` python
def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):
    """Tokenize texts and wrap them in a sequential, fixed-length DataLoader.

    Each text is encoded with ``tokenizer.encode`` (special tokens included),
    cut to at most ``max_len`` ids and right-padded to exactly ``max_len``.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of token ids. Its ``pad_token_id`` attribute,
            when present and not None, is used as the padding id; otherwise 0.
        sentences: iterable of texts to encode.
        batch_size: number of rows per batch.
        max_len: fixed sequence length of every row.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask)`` batches in
        input order. Ids are int64; masks are float32 with 1.0 on encoded
        tokens and 0.0 on padding.
    """
    # Use the tokenizer's own padding id rather than assuming 0, so that
    # vocabularies whose pad token is not id 0 are padded correctly.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # `encode` tokenizes, adds the special tokens and maps tokens to ids.
    # NOTE: over-long texts are cut by slicing the id list, which drops the
    # trailing special token for those texts. This is preserved deliberately:
    # switching to tokenizer-side truncation would change the model inputs.
    encoded_rows = [
        tokenizer.encode(sent, add_special_tokens=True)[:max_len]
        for sent in sentences
    ]

    # Pre-filled arrays: every position is padding until a row overwrites it.
    n_rows = len(encoded_rows)
    input_ids = np.full((n_rows, max_len), pad_id, dtype=np.int64)
    attention = np.zeros((n_rows, max_len), dtype=np.float32)
    for row, ids in enumerate(encoded_rows):
        # The mask comes from the real length, not from `id > 0`, so it stays
        # correct whatever the padding id is.
        input_ids[row, :len(ids)] = ids
        attention[row, :len(ids)] = 1.0

    # Convert to tensors and serve the rows in their original order.
    dataset = TensorDataset(torch.from_numpy(input_ids), torch.from_numpy(attention))
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
def predict(model, dataloader, device):
    """Run the classifier over every batch and return the predicted class ids.

    Args:
        model: callable model exposing ``eval()``; it is called as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)`` and
            element 0 of its output is taken as the logits.
        dataloader: iterable of ``(input_ids, attention_mask)`` tensor pairs.
        device: torch device the batch tensors are moved to. The model itself
            is not moved here; the caller must place it on the same device.

    Returns:
        A flat list with one predicted class index per input row, in the
        order the rows were yielded by ``dataloader``.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    # No gradients are needed for inference; this saves memory and time.
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch to the target device and unpack it.
            b_input_ids, b_input_mask = (t.to(device) for t in batch)
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
            # Bring the logits back to the CPU as a NumPy array.
            logits = outputs[0].detach().cpu().numpy()
            # One row per example, one column per class: keep the index of the
            # highest-scoring class for each row.
            pred_labels_.extend(np.argmax(logits, axis=1))
    return pred_labels_
```
%% Cell type:code id: tags:
``` python
# Load the tokenizer that matches the selected pretrained model.
if model_name == 'bert-base-multilingual-cased':
    print('Loading Bert Tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base':
    print('Loading Camembert Tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
else:
    # Fail here rather than with a NameError on `tokenizer` in a later cell.
    raise ValueError(f"Unsupported model_name: {model_name!r}")
```
%% Output
Loading Bert Tokenizer...
%% Cell type:code id: tags:
``` python
# Tokenize every article using the function defaults (batch_size=8, max_len=512).
data_loader = generate_dataloader(tokenizer, data)
```
%% Output
Token indices sequence length is longer than the specified maximum sequence length for this model (75311 > 512). Running this sequence through the model will result in indexing errors
%% Cell type:markdown id: tags:
https://discuss.huggingface.co/t/an-efficient-way-of-loading-a-model-that-was-saved-with-torch-save/9814
https://github.com/huggingface/transformers/issues/2094
%% Cell type:code id: tags:
``` python
# Load the fine-tuned classifier and place it on the device selected earlier,
# so the model and the batches moved by predict() end up on the same device.
# NOTE(review): if the checkpoint was written with torch.save(model, ...) rather
# than save_pretrained, load it with torch.load(model_path, map_location=device)
# instead - confirm how the checkpoint was produced.
if model_name == 'camembert-base':
    model_class = CamembertForSequenceClassification
else:
    model_class = BertForSequenceClassification
model = model_class.from_pretrained(model_path).to(device)
```
%% Cell type:code id: tags:
``` python
# Run inference on all batches; `pred` is a flat list of predicted class indices.
pred = predict(model, data_loader, device)
```
%% Cell type:code id: tags:
``` python
# Display the predicted class indices.
pred
```
%% Cell type:code id: tags:
``` python
import pickle

# Label encoder saved alongside the model; it maps class indices to label names.
#encoder_filename = "models/label_encoder.pkl"
encoder_filename = "models/label_encoder_superdomains.pkl"
# SECURITY: pickle.load can execute arbitrary code - only load encoder files
# from a trusted source.
# NOTE(review): unpickling under a different scikit-learn version than the one
# used for saving emits a UserWarning and may give invalid results.
with open(drive_path + encoder_filename, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
```
%% Output
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator LabelEncoder from version 1.0.2 when using version 1.1.3. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
warnings.warn(
%% Cell type:code id: tags:
``` python
# Map the predicted class indices back to label names with the loaded encoder.
p2 = list(encoder.inverse_transform(pred))
```
%% Cell type:code id: tags:
``` python
# Store the predicted labels in a new column named 'superdomainBert' + corpus suffix.
df['superdomainBert'+corpus] = p2
```
%% Cell type:code id: tags:
``` python
# Spot check: show the text of the row(s) whose `numero` equals 2835.
df[df.numero == 2835]['content'+corpus].values
```
%% Cell type:code id: tags:
``` python
# Preview the first ten rows, including the new prediction column.
df.head(10)
```
%% Output
volume numero head author \
0 1 1 Title Page unsigned
1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert
2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert
3 1 5 A, a & a Dumarsais5
4 1 6 A Dumarsais5
5 1 7 A Dumarsais
6 1 8 A Mallet
7 1 9 A, lettre symbolique Mallet
8 1 10 A, numismatique ou monétaire Mallet
9 1 11 A, lapidaire Mallet
edda_class enccre_id enccre_class \
0 unclassified NaN NaN
1 unclassified NaN NaN
2 unclassified NaN NaN
3 Grammaire v1-1-0 Grammaire
4 unclassified v1-1-1 Grammaire
5 unclassified v1-1-2 Grammaire
6 unclassified v1-1-3 NaN
7 unclassified v1-1-4 NaN
8 unclassified v1-1-5 Médailles
9 unclassified v1-1-6 Histoire
content \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés...
5 \nA, préposition vient du latin à, à dextris, ...
6 \nA, étoit une lettre numérale parmi les Ancie...
7 \nA, lettre symbolique, étoit un hiéroglyphe c...
8 \nA, numismatique ou monétaire, sur le revers ...
9 \nA, lapidaire, dans les anciennes inscription...
content_without_designant \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés...
5 \nA, préposition vient du latin à, à dextris, ...
6 \nA, étoit une lettre numérale parmi les Ancie...
7 \nA, lettre symbolique, étoit un hiéroglyphe c...
8 \nA, numismatique ou monétaire, sur le revers ...
9 \nA, lapidaire, dans les anciennes inscription...
first_paragraph nb_words super_domain \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 151 Unclassified
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 208 Unclassified
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n 44669 Unclassified
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 711 Philosophie
4 \nA, mot, est 1. la troisieme personne du prés... 238 Unclassified
5 \nA, préposition vient du latin à, à dextris, ... 1980 Unclassified
6 \nA, étoit une lettre numérale parmi les Ancie... 200 Unclassified
7 \nA, lettre symbolique, étoit un hiéroglyphe c... 82 Unclassified
8 \nA, numismatique ou monétaire, sur le revers ... 112 Unclassified
9 \nA, lapidaire, dans les anciennes inscription... 80 Unclassified
superdomainBert
0 Philosophie
1 Philosophie
2 Belles-lettres
3 Philosophie
4 Philosophie
5 Philosophie
6 Histoire
7 Histoire
8 Histoire
9 Histoire
%% Cell type:code id: tags:
``` python
# Export the full dataframe (text columns included) with predictions as TSV.
# NOTE(review): unlike the CSV export further down, this call keeps pandas'
# default index=True, so the row index is written as an unnamed first column -
# confirm that this is intended.
df.to_csv(drive_path + "predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv", sep="\t")
```
%% Cell type:code id: tags:
``` python
# Remove the per-corpus text columns before the metadata-only export.
# These columns are presumably present only when the parallel dataset is
# loaded; errors='ignore' skips them when absent instead of raising KeyError
# (e.g. with a dataset whose text column is just `content`).
df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True, errors='ignore')
```
%% Cell type:code id: tags:
``` python
# Export the dataframe as comma-separated CSV, without the row index.
df.to_csv(drive_path + "predictions/metadata_parallel_predictions_superdomain.csv", sep=",", index=False)
```
%% Cell type:code id: tags:
``` python
# Show every row whose predicted super-domain is 'Géographie'.
df.loc[(df['superdomainBert'] == 'Géographie')]
```
%% Output
volume numero head author edda_class \
24 1 26 A Diderot unclassified
25 1 27 AA Diderot unclassified
27 1 29 AACH ou ACH Diderot unclassified
28 1 30 AAHUS Diderot unclassified
30 1 32 AAR Diderot unclassified
... ... ... ... ... ...
74051 17 3070 ZYGRIS Jaucourt Géographie ancienne
74054 17 3073 ZYRAS Jaucourt Géographie ancienne
74055 17 3074 ZZUÉNÉ ou ZZEUENE Jaucourt Géographie ancienne
74080 17 3099 CABOTAGE Jaucourt Navigation
74165 17 3184 GUAYAQUIL La Condamine Géographie
enccre_id enccre_class \
24 v1-9-0 Géographie
25 v1-10-0 Géographie
27 v1-12-0 Géographie
28 v1-13-0 Géographie
30 v1-15-0 Géographie
... ... ...
74051 v17-2068-0 Géographie
74054 v17-2071-0 Géographie
74055 v17-2072-0 Géographie
74080 v17-2097-0 Marine
74165 v17-2177-0 Géographie
content \
24 \n* A, s. petite riviere de France, qui a sa s...
25 \n* AA, s. f. riviere de France, qui prend sa ...
27 \n* AACH ou ACH, s. f. petite ville d'Allemagn...
28 \n* AAHUS, s. petite ville d'Allemagne dans le...
30 \n* AAR, s. grande riviere qui a sa source pro...
... ...
74051 \nZYGRIS, (Géog. anc.) ville du nôme de Lybie\...
74054 \nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...
74055 \nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...
74080 \nCABOTAGE, s. m. (Navigation.) le cabotage es...
74165 \nGUAYAQUIL, (Géograph.) nom d'une ville &\nd'...
content_without_designant \
24 \n* A, s. petite riviere de France, qui a sa s...
25 \n* AA, s. f. riviere de France, qui prend sa ...
27 \n* AACH ou ACH, s. f. petite ville d'Allemagn...
28 \n* AAHUS, s. petite ville d'Allemagne dans le...
30 \n* AAR, s. grande riviere qui a sa source pro...
... ...
74051 \nZYGRIS, ville du nôme de Lybie\nsur la côte...
74054 \nZYRAS, fleuve de Thrace. Pline,\nliv. IV. c...
74055 \nZZUÉNÉ ou ZZEUENE, ville située\nsur la riv...
74080 \nCABOTAGE, s. m. le cabotage est\nune naviga...
74165 \nGUAYAQUIL, nom d'une ville &\nd'une grande ...
first_paragraph nb_words \
24 \n* A, s. petite riviere de France, qui a sa s... 15
25 \n* AA, s. f. riviere de France, qui prend sa ... 46
27 \n* AACH ou ACH, s. f. petite ville d'Allemagn... 24
28 \n* AAHUS, s. petite ville d'Allemagne dans le... 21
30 \n* AAR, s. grande riviere qui a sa source pro... 30
... ... ...
74051 \nZYGRIS, ville du nôme de Lybie\nsur la côte... 38
74054 \nZYRAS, fleuve de Thrace. Pline,\nliv. IV. c... 28
74055 \nZZUÉNÉ ou ZZEUENE, ville située\nsur la riv... 149
74080 \nCABOTAGE, s. m. le cabotage est\nune naviga... 192
74165 \nGUAYAQUIL, nom d'une ville &\nd'une grande ... 446
super_domain superdomainBert
24 Unclassified Géographie
25 Unclassified Géographie
27 Unclassified Géographie
28 Unclassified Géographie
30 Unclassified Géographie
... ... ...
74051 Géographie Géographie
74054 Géographie Géographie
74055 Géographie Géographie
74080 Géographie Géographie
74165 Géographie Géographie
[15383 rows x 13 columns]
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment