Projet GEODE / EDdA Classification / Commits / 01434873

Commit 01434873, authored 2 years ago by Ludovic Moncla
Create Predict_LGE.py
parent cafa9021

Showing 1 changed file: scripts/Predict_LGE.py (new file 100644, +206 −0)
import os
import pandas as pd
import numpy as np
import pickle
import torch
import tqdm

from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                       # Sentence to encode.
            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
            # This function also supports truncation and conversion
            # to pytorch tensors, but I need to do padding, so I
            # can't use these features.
            #max_length = max_len,      # Truncate all sentences.
            #return_tensors = 'pt',     # Return pytorch tensors.
        )
        input_ids_test.append(encoded_sent)

    # Pad (or truncate) our input tokens to exactly max_len.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1s for real tokens followed by 0s for padding.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader (sequential sampling keeps predictions aligned
    # with the input order).
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)
    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)
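
# A minimal usage sketch of generate_dataloader (the tokenizer name and the toy
# sentences below are illustrative, not values used by this script):
#
#   tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
#   loader = generate_dataloader(tokenizer, ["Premier article.", "Second article."],
#                                batch_size=2, max_len=32)
#   for input_ids, attention_mask in loader:
#       print(input_ids.shape, attention_mask.shape)  # torch.Size([2, 32]) twice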

def predict(model, dataloader, device):
    # Put model in evaluation mode.
    model.eval()

    # Tracking variables.
    predictions_test, true_labels = [], []
    pred_labels_ = []

    # Predict.
    for batch in dataloader:
        # Add batch to GPU.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask = batch

        # Telling the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits to CPU.
        logits = logits.detach().cpu().numpy()

        # Store the predictions.
        predictions_test.append(logits)

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for each batch are a 2-D ndarray (one column per
        # class). Pick the label with the highest value and turn this into a
        # flat list of label indices.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_
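
# The logits -> labels step above, in isolation: each stored batch is assumed
# to be a (batch_size, num_classes) ndarray, and argmax over axis=1 picks the
# class with the highest logit (equivalent to taking a softmax first, since
# softmax is monotonic). Toy values for illustration:
#
#   batch_logits = np.array([[0.2, 1.7, -0.3],
#                            [2.1, 0.0,  0.4]])
#   np.argmax(batch_logits, axis=1)  # -> array([1, 0])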

def text_folder_to_dataframe(path):
    data = []
    # Columns: id, tome, filename, content, nb_words
    for tome in sorted(os.listdir(path)):
        for article in sorted(os.listdir(path + "/" + tome)):
            filename = article[:-4]
            id = tome + filename

            if article[-4:] == ".txt":
                with open(path + "/" + tome + "/" + article) as f:
                    content = f.read()

                data.append([id, tome, filename, content, len(content.split(' '))])

    return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
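
# Directory layout assumed by text_folder_to_dataframe, inferred from the
# listdir/extension logic above (folder and file names are hypothetical):
#
#   <path>/
#       T1/
#           abbaye.txt
#           abeille.txt
#       T2/
#           ...
#
# Each .txt file becomes one row with columns
# ['id', 'tome', 'filename', 'content', 'nb_words'].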

if __name__ == '__main__':

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # For macOS (Apple Silicon / Metal).
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print('We will use the GPU')
    else:
        device = torch.device("cpu")
        print('No GPU available, using the CPU instead.')

    #############
    ## Load data
    print("* Load data")
    path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/"
    df_LGE = text_folder_to_dataframe(path)
    #df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
    data_LGE = df_LGE["content"].values

    #############
    ## Load model
    print("* Load model")
    model_name = "bert-base-multilingual-cased"
    #model_name = "camembert-base"
    model_path = path + "models/model_" + model_name + "_s10000.pt"

    if model_name == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(model_name)
    elif model_name == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(model_name)

    data_loader = generate_dataloader(tokenizer, data_LGE)

    model = BertForSequenceClassification.from_pretrained(model_path).to(device)

    #############
    ## Predict
    print("* Predict")
    pred = predict(model, data_loader, device)

    # Map the predicted label indices back to domain names.
    encoder_filename = "models/label_encoder.pkl"
    with open(path + encoder_filename, 'rb') as file:
        encoder = pickle.load(file)

    p2 = list(encoder.inverse_transform(pred))
    df_LGE['domain'] = p2
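
    # The pickled encoder is assumed to be a fitted sklearn LabelEncoder
    # (inferred from the inverse_transform call above). A toy sketch of the
    # round trip, with made-up domain names:
    #
    #   from sklearn.preprocessing import LabelEncoder
    #   enc = LabelEncoder().fit(["Droit", "Géographie", "Histoire"])
    #   enc.transform(["Histoire"])    # -> array([2]); classes sort alphabetically
    #   enc.inverse_transform([2, 0])  # -> array(['Histoire', 'Droit'], ...)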

    #############
    ## Save results
    filepath = path + "results_LGE/metadata-withContent.csv"
    print("* Save results:", filepath)
    df_LGE.to_csv(filepath, sep=",")

    # Save a lighter version without the full article text.
    df_LGE.drop(columns=['content'], inplace=True)
    filepath = path + "results_LGE/metadata.csv"
    print("* Save results:", filepath)
    df_LGE.to_csv(filepath, sep=",")
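
    # To inspect the saved predictions afterwards (a sketch; read_csv's
    # defaults match the comma-separated files written above, with the
    # DataFrame index in the first column):
    #
    #   df = pd.read_csv(path + "results_LGE/metadata.csv", index_col=0)
    #   print(df["domain"].value_counts())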