%% Cell type:markdown id: tags:
# BERT Predict classification
## 1. Setup the environment
### 1.1 Setup colab environment
#### 1.1.1 Install packages
%% Cell type:code id: tags:
``` python
!pip install transformers==4.10.3
!pip install sentencepiece
```
%% Cell type:markdown id: tags:
#### 1.1.2 Use more RAM
%% Cell type:code id: tags:
``` python
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
    print('Not using a high-RAM runtime')
else:
    print('You are using a high-RAM runtime!')
```
%% Cell type:markdown id: tags:
#### 1.1.3 Mount Google Drive
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### 1.2 Setup GPU
%% Cell type:code id: tags:
``` python
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# For macOS on Apple Silicon (MPS backend)...
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
```
%% Output
We will use the GPU
%% Cell type:markdown id: tags:
### 1.3 Import libraries
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, TextClassificationPipeline
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import pickle
```
%% Cell type:markdown id: tags:
## 2. Load Data
%% Cell type:code id: tags:
``` python
#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv
#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv
#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv
```
%% Cell type:code id: tags:
``` python
#drive_path = "drive/MyDrive/Classification-EDdA/"
drive_path = "../"
#path = "/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/"
path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/"
#filepath = "Parallel_datatset_articles_230215.tsv"
#filepath = "EDdA_dataset_articles.tsv"
filepath = 'LGE_dataset_articles_230314.tsv'
corpus = 'lge'
#corpus = ''
```
%% Cell type:code id: tags:
``` python
df = pd.read_csv(path + filepath, sep="\t")
df.head()
```
%% Output
uid lge-volume lge-numero lge-head lge-page lge-id \
0 lge_1_a-0 1 1 A 0 a-0
1 lge_1_a-1 1 2 A 1 a-1
2 lge_1_a-2 1 3 A 4 a-2
3 lge_1_a-3 1 4 A 4 a-3
4 lge_1_a-4 1 5 A 4 a-4
lge-content lge-nbWords
0 A(Ling.). Son vocal et première lettre de notr... 1761.0
1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0
2 A(Log.). Cette voyelle désigne les proposition... 56.0
3 A(Mus.). La lettre a est employée par les musi... 267.0
4 A(Numis.). Dans la numismatique grecque, la le... 67.0
%% Cell type:code id: tags:
``` python
dataset = df[corpus+'-content'].values
```
%% Cell type:markdown id: tags:
## 3. Load model and predict
### 3.1 BERT / CamemBERT
%% Cell type:code id: tags:
``` python
model_name = "bert-base-multilingual-cased"
#model_name = "camembert-base"
#model_path = path + "models/model_" + model_name + "_s10000.pt"
model_path = drive_path + "models/model_" + model_name + "_s10000_superdomains.pt"
```
%% Cell type:code id: tags:
``` python
def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                       # Sentence to encode.
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
            # This function also supports truncation and conversion
            # to pytorch tensors, but we need to do padding, so we
            # can't use these features.
            #max_length = max_len,      # Truncate all sentences.
            #return_tensors = 'pt',     # Return pytorch tensors.
        )
        input_ids_test.append(encoded_sent)

    # Pad (or truncate) all sequences to max_len
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1 for each real token, 0 for each padding token
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader.
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)
    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)


def predict(model, dataloader, device):
    # Put model in evaluation mode
    model.eval()
    # Tracking variables
    predictions_test = []
    pred_labels_ = []
    # Predict
    for batch in dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits to CPU
        logits = logits.detach().cpu().numpy()
        # Store predictions
        predictions_test.append(logits)

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for each batch are a 2-D ndarray (one column per
        # class). Pick the label with the highest logit and flatten this
        # into a list of class indices.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)
    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_

# https://discuss.huggingface.co/t/i-have-trained-my-classifier-now-how-do-i-do-predictions/3625/3
```
%% Cell type:code id: tags:
``` python
if model_name == 'bert-base-multilingual-cased':
    print('Loading Bert Tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base':
    print('Loading Camembert Tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
```
%% Output
Loading Bert Tokenizer...
%% Cell type:markdown id: tags:
https://discuss.huggingface.co/t/an-efficient-way-of-loading-a-model-that-was-saved-with-torch-save/9814
https://github.com/huggingface/transformers/issues/2094
%% Cell type:code id: tags:
``` python
model = BertForSequenceClassification.from_pretrained(model_path).to(device.type)
```
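%% Cell type:markdown id: tags:
Note that `from_pretrained` loads from a Hub id or a `save_pretrained` directory, so the `.pt` path above is presumably such a directory despite its suffix. If a checkpoint was instead saved as a whole model object with `torch.save(model, ...)` (the case discussed in the threads linked above), a sketch of loading it would be:
%% Cell type:code id: tags:
``` python
#model = torch.load(model_path, map_location=device)  # only if saved via torch.save(model, ...)
#model.eval()
```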
%% Cell type:code id: tags:
``` python
#data_loader = generate_dataloader(tokenizer, dataset)
```
%% Cell type:code id: tags:
``` python
#pred = predict(model, data_loader, device)
```
%% Cell type:code id: tags:
``` python
# https://huggingface.co/docs/transformers/main_classes/pipelines
def data():
    for d in dataset:
        yield f"{d}"

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=device)

# https://stackoverflow.com/questions/67849833/how-to-truncate-input-in-the-huggingface-pipeline
tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
```
%% Output
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/pipelines/text_classification.py:89: UserWarning: `return_all_scores` is now deprecated, if want a similar funcionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.
warnings.warn(
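%% Cell type:markdown id: tags:
As the warning notes, `return_all_scores=True` is deprecated in newer transformers releases; an equivalent sketch with the replacement argument:
%% Cell type:code id: tags:
``` python
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None, device=device)  # same behaviour as return_all_scores=True
```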
%% Cell type:code id: tags:
``` python
pred = []

for out in tqdm(pipe(data(), **tokenizer_kwargs)):
    # Sort the per-class scores in decreasing order and keep the top 3:
    # the class index is parsed from the 'LABEL_##' string ([6:] strips the prefix).
    out = sorted(out, key=lambda d: d['score'], reverse=True)
    #print(int(out[0]['label'][6:]), out[0]['score'], int(out[1]['label'][6:]), out[1]['score'], int(out[2]['label'][6:]), out[2]['score'])
    pred.append([int(out[0]['label'][6:]), out[0]['score'],
                 int(out[1]['label'][6:]), out[1]['score'],
                 int(out[2]['label'][6:]), out[2]['score']])

pred = np.array(pred)
```
%% Output
13 0.9375858902931213 2 0.021192006766796112 7 0.012656938284635544
6 0.9926056861877441 7 0.0029343003407120705 8 0.0010190330212935805
13 0.9823671579360962 2 0.00412388751283288 1 0.0022031590342521667
10 0.9058945775032043 2 0.029459038749337196 7 0.014979560859501362
7 0.9861114025115967 2 0.003949115984141827 6 0.0015271392185240984
4 0.9868664741516113 5 0.002140316180884838 15 0.0018120049498975277
6 0.9541037678718567 7 0.025117166340351105 8 0.00887206755578518
6 0.9981995820999146 7 0.00028012823895551264 8 0.00019026087829843163
6 0.9958584904670715 8 0.0010782132158055902 7 0.000548136536963284
6 0.9979164004325867 7 0.0005610007210634649 9 0.00018632493447512388
6 0.997787356376648 7 0.0003991609555669129 8 0.0002408416330581531
6 0.9979755282402039 7 0.0005005390848964453 8 0.0002189433143939823
11 0.9915592074394226 3 0.00250804889947176 14 0.0010435001458972692
7 0.958525538444519 2 0.011816944926977158 5 0.009215029887855053
8 0.2876076102256775 7 0.2462710738182068 2 0.17002692818641663
8 0.9409826397895813 7 0.03510138392448425 6 0.007040794938802719
8 0.3623795211315155 1 0.3142264485359192 7 0.13734686374664307
7 0.7184596061706543 6 0.11600398272275925 8 0.09759759902954102
7 0.8406069278717041 6 0.12032385170459747 2 0.009349718689918518
7 0.978775143623352 2 0.005065936129540205 4 0.0037283776327967644
6 0.4818583130836487 9 0.22724471986293793 5 0.07886118441820145
6 0.9740952253341675 8 0.015889622271060944 1 0.001933401683345437
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
/var/folders/qm/v_b1md29221_cnpcxf5qc43c0000gn/T/ipykernel_15176/3568789409.py in <cell line: 3>()
1 pred = []
2
----> 3 for out in pipe(data(), **tokenizer_kwargs):
4 out = sorted(out, key=lambda d: d['score'], reverse=True)
5 print(int(out[0]['label'][6:]), out[0]['score'], int(out[1]['label'][6:]), out[1]['score'], int(out[2]['label'][6:]), out[2]['score']) # label ### TODO modifier ici
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/pipelines/pt_utils.py in __next__(self)
112
113 # We're out of items within a batch
--> 114 item = next(self.iterator)
115 processed = self.infer(item, **self.params)
116 # We now have a batch of "inferred things".
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/pipelines/pt_utils.py in __next__(self)
113 # We're out of items within a batch
114 item = next(self.iterator)
--> 115 processed = self.infer(item, **self.params)
116 # We now have a batch of "inferred things".
117 if self.loader_batch_size is not None:
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/pipelines/base.py in forward(self, model_inputs, **forward_params)
988 with inference_context():
989 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
--> 990 model_outputs = self._forward(model_inputs, **forward_params)
991 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
992 else:
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/pipelines/text_classification.py in _forward(self, model_inputs)
165
166 def _forward(self, model_inputs):
--> 167 return self.model(**model_inputs)
168
169 def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1550 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1551
-> 1552 outputs = self.bert(
1553 input_ids,
1554 attention_mask=attention_mask,
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
985 # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
986 # ourselves in which case we just need to make it broadcastable to all heads.
--> 987 extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
988
989 # If a 2D or 3D attention mask is provided for the cross-attention
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/transformers/modeling_utils.py in get_extended_attention_mask(self, attention_mask, input_shape, device, dtype)
789 # effectively the same as removing these entirely.
790 extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility
--> 791 extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
792 return extended_attention_mask
793
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/torch/_tensor.py in wrapped(*args, **kwargs)
37 if has_torch_function(args):
38 return handle_torch_function(wrapped, args, *args, **kwargs)
---> 39 return f(*args, **kwargs)
40 except TypeError:
41 return NotImplemented
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/torch/_tensor.py in __rsub__(self, other)
831 @_handle_torch_function_and_wrap_type_error_to_not_implemented
832 def __rsub__(self, other):
--> 833 return _C._VariableFunctions.rsub(self, other)
834
835 @_handle_torch_function_and_wrap_type_error_to_not_implemented
KeyboardInterrupt:
134820it [1:07:31, 33.27it/s]
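%% Cell type:markdown id: tags:
The full pass over the corpus takes about 1h07 at ~33 articles/s here. When the pipeline is fed a generator, inference can also be batched with the `batch_size` call argument, which may help on GPU; a sketch, assuming the same `pipe`, `data()` and `tokenizer_kwargs`:
%% Cell type:code id: tags:
``` python
#for out in tqdm(pipe(data(), batch_size=16, **tokenizer_kwargs), total=len(dataset)):
#    ...  # same top-3 extraction as above
```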
%% Cell type:code id: tags:
``` python
# Load label encoder
#encoder_filename = "models/label_encoder.pkl"
encoder_filename = "models/label_encoder_superdomains.pkl"

with open(drive_path + encoder_filename, 'rb') as file:
    encoder = pickle.load(file)
```
%% Output
/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator LabelEncoder from version 1.0.2 when using version 1.1.3. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
warnings.warn(
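%% Cell type:markdown id: tags:
To sanity-check the mapping between the pipeline's `LABEL_##` indices and the superdomain names, the fitted encoder's classes can be inspected (a quick sketch):
%% Cell type:code id: tags:
``` python
#dict(enumerate(encoder.classes_))  # class index -> superdomain name
```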
%% Cell type:code id: tags:
``` python
pred1 = list(encoder.inverse_transform(pred[:,0].astype(int)))
pred2 = list(encoder.inverse_transform(pred[:,2].astype(int)))
pred3 = list(encoder.inverse_transform(pred[:,4].astype(int)))
```
%% Cell type:code id: tags:
``` python
#print(pred1)
#print(pred[:,1])
```
%% Cell type:code id: tags:
``` python
df[corpus+'-superdomainPred1'] = pred1
df[corpus+'-superdomainProba1'] = pred[:,1]
df[corpus+'-superdomainPred2'] = pred2
df[corpus+'-superdomainProba2'] = pred[:,3]
df[corpus+'-superdomainPred3'] = pred3
df[corpus+'-superdomainProba3'] = pred[:,5]
```
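%% Cell type:markdown id: tags:
A quick way to eyeball the resulting distribution of top-1 superdomains (a sketch):
%% Cell type:code id: tags:
``` python
#df[corpus+'-superdomainPred1'].value_counts()
```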
%% Cell type:code id: tags:
``` python
df.head(10)
```
%% Output
uid lge-volume lge-numero lge-head lge-page lge-id \
0 lge_1_a-0 1 1 A 0 a-0
1 lge_1_a-1 1 2 A 1 a-1
2 lge_1_a-2 1 3 A 4 a-2
3 lge_1_a-3 1 4 A 4 a-3
4 lge_1_a-4 1 5 A 4 a-4
5 lge_1_aa-0 1 6 AA 4 aa-0
6 lge_1_aa-1 1 7 AA 4 aa-1
7 lge_1_aa-2 1 8 AA 5 aa-2
8 lge_1_aa-3 1 9 AA 5 aa-3
9 lge_1_aa-4 1 10 AA 5 aa-4
lge-content lge-nbWords \
0 A(Ling.). Son vocal et première lettre de notr... 1761.0
1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0
2 A(Log.). Cette voyelle désigne les proposition... 56.0
3 A(Mus.). La lettre a est employée par les musi... 267.0
4 A(Numis.). Dans la numismatique grecque, la le... 67.0
5 AA. Ces deux lettres désignent l’atelier monét... 14.0
6 AA. Nom de plusieurs cours d’eau de l’Europe o... 75.0
7 AA. Rivière de France, prend sa source aux Tro... 165.0
8 AA. Rivière de Hollande, affluent de la Dommel... 17.0
9 AA. Nom de deux fleuves de la Russie. Le premi... 71.0
lge-superdomainPred1 lge-superdomainProba1 lge-superdomainPred2 \
0 Philosophie 0.937586 Belles-lettres
1 Géographie 0.992606 Histoire
2 Philosophie 0.982367 Belles-lettres
3 Musique 0.905895 Belles-lettres
4 Histoire 0.986111 Belles-lettres
5 Commerce 0.986866 Droit Jurisprudence
6 Géographie 0.954104 Histoire
7 Géographie 0.998200 Histoire
8 Géographie 0.995858 Histoire naturelle
9 Géographie 0.997916 Histoire
lge-superdomainProba2 lge-superdomainPred3 lge-superdomainProba3
0 0.021192 Histoire 0.012657
1 0.002934 Histoire naturelle 0.001019
2 0.004124 Beaux-arts 0.002203
3 0.029459 Histoire 0.014980
4 0.003949 Géographie 0.001527
5 0.002140 Politique 0.001812
6 0.025117 Histoire naturelle 0.008872
7 0.000280 Histoire naturelle 0.000190
8 0.001078 Histoire 0.000548
9 0.000561 Militaire 0.000186
%% Cell type:code id: tags:
``` python
#df.to_csv(drive_path + "predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv", sep="\t")
df.to_csv(drive_path + "predictions/LGE_dataset_articles_superdomainBERT_230321.tsv", sep="\t", index=False)
```
%% Cell type:code id: tags:
``` python
#df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)
df.loc[df[corpus+'-superdomainPred1'] == 'Géographie']
```
%% Cell type:code id: tags:
``` python
df.shape
```
%% Output
(134820, 14)
%% Cell type:code id: tags:
``` python
```