Projet GEODE / EDdA Classification / Commits

Commit a7d08239, authored 2 years ago by Ludovic Moncla
Update Classification_BertFineTuning.ipynb
parent 9717e4c0
Showing 1 changed file: notebooks/Classification_BertFineTuning.ipynb (13 additions, 5 deletions)
@@ -67,7 +67,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": null,
+    "execution_count": 1,
     "metadata": {
      "colab": {
       "base_uri": "https://localhost:8080/"
@@ -75,7 +75,15 @@
      "id": "dPOU-Efhf4ui",
      "outputId": "121dd21e-f98c-483d-d6d1-2838f732a4e2"
     },
-    "outputs": [],
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "We will use the GPU\n"
+      ]
+     }
+    ],
     "source": [
      "import torch\n",
      "\n",
@@ -131,7 +139,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": null,
+    "execution_count": 2,
     "metadata": {
      "id": "SkErnwgMMbRj"
     },
@@ -173,7 +181,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": null,
+    "execution_count": 3,
     "metadata": {
      "id": "WkIVcabUgxIl"
     },
@@ -1568,7 +1576,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:01:00) \n[Clang 13.0.1 ]"
+   "version": "3.9.13"
   },
   "vscode": {
    "interpreter": {
%% Cell type:markdown id: tags:
# BERT fine-tuning for EDdA classification
%% Cell type:markdown id: tags:
## Setup colab environment
%% Cell type:code id: tags:
``` python
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
    print('Not using a high-RAM runtime')
else:
    print('You are using a high-RAM runtime!')
```
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
## Setup GPU
%% Cell type:code id: tags:
``` python
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# for MacOS
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
```
%% Output
We will use the GPU
%% Cell type:markdown id: tags:
## Install packages
%% Cell type:code id: tags:
``` python
!pip install transformers==4.10.3
!pip install sentencepiece
```
%% Cell type:markdown id: tags:
## Import libraries
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import csv
import os
import pickle

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

import time
import datetime
import random

import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
```
%% Cell type:markdown id: tags:
## Utility functions
%% Cell type:code id: tags:
``` python
def resample_classes(df, classColumnName, numberOfInstances):
    # random numberOfInstances elements
    replace = False  # with replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
```
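These helpers are used further down: `resample_classes` in the preprocessing step and `format_time` in the training loop; `flat_accuracy` is defined but not called in the cells shown here. A minimal, self-contained sketch of what they compute, using dummy values rather than project data:

``` python
# Sketch only (dummy values): illustrate the helpers defined above.
dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])  # 2 examples, 2 classes
dummy_labels = np.array([1, 1])
print(flat_accuracy(dummy_logits, dummy_labels))   # 0.5: one of the two argmax predictions matches
print(format_time(3725))                           # '1:02:05'
```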
%% Cell type:markdown id: tags:
## Load Data
%% Cell type:code id: tags:
``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set.tsv
```
%% Cell type:code id: tags:
``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/training_set_superdomains.tsv
!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/test_set_superdomains.tsv
```
%% Cell type:markdown id: tags:
### Loading dataset
%% Cell type:code id: tags:
``` python
#train_path = '../data/training_set.tsv'
#test_path = '../data/test_set.tsv'

train_path = '../data/training_set_superdomains.tsv'
test_path = '../data/test_set_superdomains.tsv'
```
%% Cell type:code id: tags:
``` python
df_train = pd.read_csv(train_path, sep="\t")
df_train.head()
```
%% Cell type:code id: tags:
``` python
print(df_train.shape)
```
%% Cell type:markdown id: tags:
## Configuration
%% Cell type:code id: tags:
``` python
columnText = 'contentWithoutClass'
#columnClass = 'ensemble_domaine_enccre'
columnClass = 'super_domain'

maxOfInstancePerClass = 10000

model_chosen = "bert"
#model_chosen = "camembert"

batch_size = 16  # 16 or 32 recommended
max_len = 512

#path = "drive/MyDrive/Classification-EDdA/"
path = "../models/new/"
encoder_filename = "label_encoder.pkl"
```
%% Cell type:markdown id: tags:
## Preprocessing
%% Cell type:code id: tags:
``` python
if maxOfInstancePerClass != 10000:
    df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)
```
%% Cell type:code id: tags:
``` python
labels = df_train[columnClass]
numberOfClasses = labels.nunique()

if os.path.isfile(path + encoder_filename):
    # load existing encoder
    with open(path + encoder_filename, 'rb') as file:
        encoder = pickle.load(file)
else:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(labels)
    with open(path + encoder_filename, 'wb') as file:
        pickle.dump(encoder, file)

labels = encoder.transform(labels)
```
%% Cell type:code id: tags:
``` python
sentences_train = df_train[columnText].values
labels_train = labels.tolist()
```
%% Cell type:code id: tags:
``` python
sentences_train
```
%% Cell type:markdown id: tags:
# Model
## Tokenisation & Input Formatting
%% Cell type:code id: tags:
``` python
if model_chosen == "bert":
    tokeniser_bert = 'bert-base-multilingual-cased'
    model_bert = "bert-base-multilingual-cased"
elif model_chosen == "camembert":
    tokeniser_bert = 'camembert-base'
    model_bert = 'camembert-base'
```
%% Cell type:code id: tags:
``` python
# Load the BERT tokenizer.
if model_chosen == "bert":
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)
elif model_chosen == "camembert":
    print('Loading CamemBERT tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)
```
%% Cell type:code id: tags:
``` python
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids_train = []

# For every sentence...
for sent in sentences_train:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent_train = tokenizer.encode(
        str(sent),                 # Sentence to encode.
        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
        # This function also supports truncation and conversion
        # to pytorch tensors, but I need to do padding, so I
        # can't use these features.
        #max_length = 128,         # Truncate all sentences.
        #return_tensors = 'pt',    # Return pytorch tensors.
    )
    # Add the encoded sentence to the list.
    input_ids_train.append(encoded_sent_train)
```
%% Cell type:code id: tags:
``` python
print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))
```
%% Cell type:code id: tags:
``` python
padded_train = []
for i in input_ids_train:
    if len(i) > max_len:
        padded_train.extend([i[:max_len]])
    else:
        padded_train.extend([i + [0] * (max_len - len(i))])

padded_train = input_ids_train = np.array(padded_train)
```
%% Cell type:code id: tags:
``` python
# Create attention masks
attention_masks_train = []

# For each sentence...
for sent in padded_train:
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    # Store the attention mask for this sentence.
    attention_masks_train.append(att_mask)
```
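For reference only: the tokenisation cell above notes that `encode` also supports truncation and tensor conversion. The notebook keeps its manual padding and masks, but a hedged sketch of the equivalent one-call form offered by the transformers tokenizer (same `tokenizer`, `sentences_train` and `max_len` as above) would look like this:

``` python
# Not used in this notebook: pad/truncate to max_len in a single tokenizer call.
encoded = tokenizer(
    [str(s) for s in sentences_train],
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_tensors='pt',
)
# encoded['input_ids'] and encoded['attention_mask'] would then replace the
# manual padded_train / attention_masks_train construction.
```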
%% Cell type:code id: tags:
``` python
# Use 70% for training and 30% for validation.
#train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
# random_state=2018, test_size=0.3, stratify = labels)
# Do the same for the masks.
#train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
# random_state=2018, test_size=0.3, stratify = labels)
```
%% Cell type:code id: tags:
```
python
# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs
=
torch
.
tensor
(
padded_train
)
train_labels
=
torch
.
tensor
(
labels_train
)
train_masks
=
torch
.
tensor
(
attention_masks_train
)
```
%% Cell type:code id: tags:
``` python
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
```
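A quick sanity check, not part of the original notebook, to confirm that the DataLoader yields (input_ids, attention_mask, labels) batches of the expected shape:

``` python
# Sketch only: inspect one batch from the training DataLoader.
batch = next(iter(train_dataloader))
print(batch[0].shape, batch[1].shape, batch[2].shape)
# expected: (batch_size, max_len) for inputs and masks, (batch_size,) for labels
```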
%% Cell type:markdown id: tags:
## Training
%% Cell type:code id: tags:
``` python
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
#model = CamembertForSequenceClassification.from_pretrained(
if model_chosen == "bert":
    model = BertForSequenceClassification.from_pretrained(
        model_bert,                   # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=numberOfClasses,   # The number of output labels--2 for binary classification.
                                      # You can increase this for multi-class tasks.
        output_attentions=False,      # Whether the model returns attentions weights.
        output_hidden_states=False,   # Whether the model returns all hidden-states.
    )
elif model_chosen == "camembert":
    model = CamembertForSequenceClassification.from_pretrained(
        model_bert,                   # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=numberOfClasses,   # The number of output labels--2 for binary classification.
                                      # You can increase this for multi-class tasks.
        output_attentions=False,      # Whether the model returns attentions weights.
        output_hidden_states=False,   # Whether the model returns all hidden-states.
    )

# Tell pytorch to run this model on the GPU.
#model.cuda()
model.to("mps")
```
%% Cell type:code id: tags:
``` python
#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr=2e-5,    # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8    # args.adam_epsilon - default is 1e-8.
                  )
```
%% Cell type:code id: tags:
``` python
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)
```
%% Cell type:code id: tags:
``` python
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so I can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 5 batches.
        if step % 5 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader.
        #
        # As I unpack the batch, I'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because I
        # have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # The call to `model` always returns a tuple, so I need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that I can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
```
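The loop above stores the average loss per epoch in `loss_values` so the learning curve can be plotted; matplotlib is already imported. A minimal sketch (not a cell of the original notebook):

``` python
# Sketch only: plot the learning curve from the loss_values list filled above.
plt.plot(range(1, len(loss_values) + 1), loss_values, 'b-o')
plt.xlabel('Epoch')
plt.ylabel('Average training loss')
plt.title('Training loss')
plt.show()
```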
%% Cell type:markdown id: tags:
## Saving model
%% Cell type:code id: tags:
``` python
name = model_bert + "_s" + str(maxOfInstancePerClass)
model_path = path + "model_" + name + ".pt"
```
%% Cell type:code id: tags:
``` python
#torch.save(model, model_path)
```
%% Cell type:code id: tags:
``` python
model.save_pretrained(model_path)
# ludo: changed the way the model is saved
```
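The cell above only saves the model weights and configuration. If the tokenizer should travel with the model, one option (not in the original notebook, shown here as a sketch) is to save it into the same directory:

``` python
# Hypothetical addition: persist the tokenizer alongside the saved model
# so a later session does not need to re-download it from the hub.
tokenizer.save_pretrained(model_path)
```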
%% Cell type:markdown id: tags:
## Loading model
%% Cell type:code id: tags:
``` python
#model = torch.load(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to("mps")
#.to("cuda")
```
%% Cell type:markdown id: tags:
## Evaluation
%% Cell type:code id: tags:
``` python
def evaluate_bert(data, labels, model, batch_size):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    # For every sentence...
    for sent in data:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            str(sent),                # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids.append(encoded_sent)

    # Pad our input tokens
    padded = []
    for i in input_ids:
        if len(i) > max_len:
            padded.extend([i[:max_len]])
        else:
            padded.extend([i + [0] * (max_len - len(i))])
    input_ids = np.array(padded)

    # Create attention masks
    attention_masks = []
    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        #print(logits)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #print(logits)

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print('    DONE.')

    pred_labels = []
    # Evaluate each test batch using several metrics
    print('Calculating the metrics for each batch...')

    for i in range(len(true_labels)):
        # The predictions for this batch are an ndarray with one column per class.
        # Pick the label with the highest value for each row.
        pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_
```
%% Cell type:code id: tags:
``` python
dataset = "test"

df_eval = pd.read_csv(test_path, sep="\t")
data_eval = df_eval[columnText].values

y = df_eval[columnClass]
y = encoder.transform(y)
labels = y.tolist()

model_path = path + "/model_" + model_bert + "_s" + str(maxOfInstancePerClass) + ".pt"
model = torch.load(model_path)

if model_bert == "bert-base-multilingual-cased":
    tokenizer = BertTokenizer.from_pretrained(model_bert)
elif model_bert == "camembert-base":
    tokenizer = CamembertTokenizer.from_pretrained(model_bert)

pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)

report = classification_report(true_labels_, pred_labels_, output_dict=True)

classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_

precision = []
recall = []
f1 = []
support = []

dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])

for c in classes:
    precision.append(report[c]['precision'])
    recall.append(report[c]['recall'])
    f1.append(report[c]['f1-score'])
    support.append(report[c]['support'])

accuracy = report['accuracy']
weighted_avg = report['weighted avg']

cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN

print(name)

name = "test_" + name
content = name + "\n"
print(name)
content += str(weighted_avg) + "\n"

print(weighted_avg)
print(accuracy)
print(dff)

dff.to_csv(path + "/report_" + name + ".csv", index=False)

# save the predictions
pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(path + "/predictions/predictions_" + name + ".csv")

with open(path + "reports/report_" + name + ".txt", 'w') as f:
    f.write(content)
```
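Seaborn is imported above but not used in the visible cells. A minimal sketch (not in the original notebook) to visualise the `cnf_matrix` computed in the previous cell, with class names taken from the encoder:

``` python
# Sketch only: heatmap of the confusion matrix computed above.
plt.figure(figsize=(10, 8))
sns.heatmap(cnf_matrix, annot=True, fmt='d',
            xticklabels=classesName, yticklabels=classesName, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()
```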
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
model_path = "drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt"
```
%% Cell type:code id: tags:
``` python
model = torch.load(model_path)
```
%% Cell type:code id: tags:
``` python
!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
```
%% Cell type:code id: tags:
``` python
df_LGE = pd.read_csv("LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Cell type:code id: tags:
``` python
df_LGE.shape
```
%% Cell type:code id: tags:
``` python
def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size=8, max_len=512):
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # Pad our input tokens
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []
    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)

    # set batch size
    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
    return prediction_dataloader


def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []
    pred_labels_ = []

    # Predict
    for batch in sentences_to_predict_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        #print(logits)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        #print(logits)

        # Store predictions and true labels
        predictions_test.append(logits)

    #print('    DONE.')

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for this batch are an ndarray with one column per class.
        # Pick the label with the highest value for each row.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_
```
%% Cell type:code id: tags:
``` python
data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)
#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)
```
%% Cell type:code id: tags:
``` python
p = predict_class_bertFineTuning( model, data_loader )
```
%% Cell type:code id: tags:
``` python
len(p)
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# The encoder should be saved,
# otherwise it has to be rebuilt from the training set to recover the class names.
encoder
```
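As the comment notes, the class names come from the encoder. The preprocessing section above already pickles it to `path + encoder_filename`, so a minimal sketch of reloading it in a fresh session (assuming the same `path` and `encoder_filename` as in the Configuration cell) would be:

``` python
# Sketch only: reload the LabelEncoder saved during preprocessing.
with open(path + encoder_filename, 'rb') as file:
    encoder = pickle.load(file)
print(encoder.classes_)
```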
%% Cell type:code id: tags:
``` python
p2 = list(encoder.inverse_transform(p))
```
%% Cell type:code id: tags:
``` python
p2
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
df_LGE['class_bert'] = p2
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Cell type:code id: tags:
``` python
df_LGE.to_csv("drive/MyDrive/Classification-EDdA/classification_LGE.tsv", sep="\t")
```