Projet GEODE / EDdA Classification

Commit 85b9486f, authored 2 years ago by Ludovic Moncla

    Create utils.py

Parent: b8b339ab
Showing 1 changed file: scripts/utils.py (new file, mode 100644), 123 additions and 0 deletions
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from tqdm import tqdm
import os
import pandas as pd
def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []

    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
            # `encode` also supports truncation and conversion to PyTorch
            # tensors, but padding is done manually below, so those
            # features are not used here.
            #max_length = max_len,   # Truncate all sentences.
            #return_tensors = 'pt',  # Return pytorch tensors.
        )
        input_ids_test.append(encoded_sent)

    # Pad short sequences with 0s and truncate long ones to `max_len`.
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1 for each real token, 0 for each padding token.
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader (sequential sampling, fixed batch size).
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)
    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)
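
A minimal usage sketch for generate_dataloader (not part of the committed file): it assumes a Hugging Face tokenizer from the transformers library; the checkpoint name and sentences below are illustrative only.

# Hypothetical usage -- checkpoint name and sentences are illustrative,
# not taken from this repository.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
sentences = ["Premier article de test.", "Second article de test."]

dataloader = generate_dataloader(tokenizer, sentences, batch_size=2, max_len=32)
for input_ids, attention_mask in dataloader:
    print(input_ids.shape, attention_mask.shape)  # torch.Size([2, 32]) for both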
def predict(model, dataloader, device):
    # Put model in evaluation mode.
    model.eval()

    # Tracking variables.
    predictions_test, true_labels = [], []
    pred_labels_ = []

    # Predict.
    for batch in dataloader:
        # Add batch to GPU.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask = batch

        # Tell the model not to compute or store gradients, saving memory
        # and speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits to CPU as a NumPy array.
        logits = logits.detach().cpu().numpy()

        # Store predictions.
        predictions_test.append(logits)

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for each batch are a 2D ndarray with one column per
        # class. Pick the label with the highest value to turn each row into
        # a class index.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_
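
A matching sketch for predict, under the same assumptions: it needs a model whose forward pass exposes the logits as outputs[0], as a fine-tuned BertForSequenceClassification does; the checkpoint name and label count are illustrative.

# Hypothetical usage -- checkpoint and num_labels are illustrative.
from transformers import BertForSequenceClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased', num_labels=2).to(device)

pred = predict(model, dataloader, device)
print(pred[:10])  # flat list of predicted class indices, one per sentence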
def text_folder_to_dataframe(path):
    data = []
    # Columns: id, tome, filename, nb_words, content, domain.
    for tome in sorted(os.listdir(path)):
        try:
            for article in tqdm(sorted(os.listdir(path + "/" + tome))):
                filename = article[:-4]
                id = tome + filename
                if article[-4:] == ".txt":
                    with open(path + "/" + tome + "/" + article) as f:
                        content = f.read()
                    data.append([id, tome, filename, content,
                                 len(content.split(' '))])
        except NotADirectoryError:
            pass
    return pd.DataFrame(data,
                        columns=['id', 'tome', 'filename', 'content', 'nb_words'])
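
A usage sketch under the directory layout the function implies, i.e. <path>/<tome>/<article>.txt; the corpus root below is illustrative.

# Hypothetical usage -- "data/EDdA" is an assumed corpus root.
df = text_folder_to_dataframe("data/EDdA")
print(df.shape)
print(df[['id', 'tome', 'filename', 'nb_words']].head())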