Projet GEODE / EDdA Classification

Commit a5158391, authored Sep 16, 2021 by Khalleud
[ADD] train bert finetuning & predict & evaluate
parent e7f6f159
1 merge request: !5 Branch dev bert exp
No related tags found

Showing 4 changed files
evaluate_bertFineTuning.py    +54  −0
main.py                       +120 −0
predict_bertFineTuning.py     +168 −0
training_bertFineTuning.py    +268 −333

with 610 additions and 333 deletions
evaluate_bertFineTuning.py  (new file, 0 → 100644)  +54 −0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
    # NOTE: scikit-learn's classification_report expects (y_true, y_pred); here the
    # predictions are passed first, the opposite order of the confusion_matrix call below.
    report = classification_report(pred_labels_, true_labels_, output_dict=True)

    classes = [str(e) for e in encoder.transform(encoder.classes_)]
    classesName = encoder.classes_

    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    precision = []
    recall = []
    f1 = []
    support = []
    dff = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score',
                                'support', 'FP', 'FN', 'TP', 'TN'])
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])

    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']

    # Per-class error counts derived from the confusion matrix.
    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    dff['className'] = classesName
    dff['precision'] = precision
    dff['recall'] = recall
    dff['f1-score'] = f1
    dff['support'] = support
    dff['FP'] = FP
    dff['FN'] = FN
    dff['TP'] = TP
    dff['TN'] = TN

    return dff, accuracy, weighted_avg
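
The FP/FN/TP/TN columns above are the standard per-class counts read off the multiclass confusion matrix: diagonal (TP), column sum minus diagonal (FP), row sum minus diagonal (FN), and the remainder (TN). A small self-contained illustration of that arithmetic, with made-up labels used purely for the example:

import numpy as np
from sklearn.metrics import confusion_matrix

# Toy three-class labels, chosen only to make the counts easy to check by hand.
y_true = [0, 0, 1, 1, 1, 2, 2, 2, 2]
y_pred = [0, 1, 1, 1, 2, 2, 2, 0, 2]

cnf_matrix = confusion_matrix(y_true, y_pred)
# cnf_matrix (rows = true class, columns = predicted class):
# [[1 1 0]
#  [0 2 1]
#  [1 0 3]]

TP = np.diag(cnf_matrix)                 # [1 2 3]  correct predictions per class
FP = cnf_matrix.sum(axis=0) - TP         # [1 1 1]  predicted as the class but actually another
FN = cnf_matrix.sum(axis=1) - TP         # [1 1 1]  belonging to the class but predicted as another
TN = cnf_matrix.sum() - (FP + FN + TP)   # [6 5 4]  everything else

print(TP, FP, FN, TN)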
main.py  (new file, 0 → 100644)  +120 −0
import os

import pandas as pd
import numpy as np
import configparser
import torch

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df_tmp = df[~df[classColumnName].isin(keys)]
    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
    return df


def resample_classes(df, classColumnName, numberOfInstances):
    # keep at most numberOfInstances random elements per class
    replace = False  # with replacement
    fn = lambda obj: obj.loc[np.random.choice(obj.index,
                                              numberOfInstances if len(obj) > numberOfInstances else len(obj),
                                              replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def main():
    config = configparser.ConfigParser()
    config.read('bert_settings.conf')

    dataPath = config.get('general', 'dataPath')
    columnText = config.get('general', 'columnText')
    columnClass = config.get('general', 'columnClass')
    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model', 'tokeniser')
    chosen_model = config.get('model', 'model')

    max_len = int(config.get('model', 'max_len_sequences'))
    batch_size = int(config.get('model', 'batch_size'))
    epochs = int(config.get('model', 'epochs'))

    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()
    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33,
                                                        random_state=42, stratify=y)

    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # call train method
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # save the model
    model_save_name = config.get('model', 'modelName')
    path = config.get('model', 'path')
    torch.save(model, os.path.join(path, model_save_name))

    # print the model parameters
    params = list(model.named_parameters())
    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # call predict method
    # NOTE: sentences_to_predict is not defined in this script; presumably the
    # held-out test sentences (e.g. test_x[columnText].values) are meant here, and
    # the argument lists below do not match the signatures in predict_bertFineTuning.py.
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict,
                                                           labels, max_len, batch_size=32)
    predicted_class, true_labels = predict_class_bertFineTuning(chosen_model, model, prediction_dataloader)

    # call Evaluate
    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)

    print(result_df)
    print(accuracy)
    print(weighted_avg)


if __name__ == "__main__":
    main()
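
main() pulls every setting from bert_settings.conf through configparser. The file itself is not part of this commit; judging only from the keys read above, it would be laid out roughly as sketched below. All values are placeholders, not the project's actual settings; the only model names the code branches on are bert-base-multilingual-cased and camembert-base.

[general]
dataPath = path/to/corpus.csv
columnText = name_of_text_column
columnClass = name_of_class_column
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500

[model]
tokeniser = bert-base-multilingual-cased
model = bert-base-multilingual-cased
max_len_sequences = 256
batch_size = 32
epochs = 4
modelName = bert_fine_tuned.pt
path = path/to/models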
predict_bertFineTuning.py  (new file, 0 → 100644)  +168 −0
import torch
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer


def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, batch_size=32):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # Pad our input tokens
    # NOTE: max_len is used here but is not a parameter of this function
    # (main.py passes it as a fourth positional argument).
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []
    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    # Set the batch size.
    batch_size = 32

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader


def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions_test, true_labels = [], []

    # Predict
    # NOTE: the loop reads prediction_dataloader, while the parameter above is
    # named sentences_to_predict_dataloader.
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]
        # print(logits)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # print(logits)

        # Store predictions and true labels
        predictions_test.append(logits)
        true_labels.append(label_ids)

    print('DONE.')

    pred_labels = []
    for i in range(len(true_labels)):
        # The predictions for this batch are a 2-column ndarray (one column for "0"
        # and one column for "1"). Pick the label with the highest value and turn this
        # in to a list of 0s and 1s.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)

    pred_labels_ = [item for sublist in pred_labels for item in sublist]
    true_labels_ = [item for sublist in true_labels for item in sublist]
    # NOTE: predictions_test_ is never defined; pred_labels_ is presumably what is
    # meant to be returned here.
    return predictions_test_, true_labels_


def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):

    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences_to_predict:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                     # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        )
        input_ids_test.append(encoded_sent)

    # NOTE: this function stops short in this commit: b_input_ids and b_input_mask
    # are never built from input_ids_test before the forward pass below.
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs[0]
training_bertFineTuning.py  +268 −333
@@ -2,48 +2,38 @@ import torch
 import pandas as pd
 import numpy as np
 from sklearn import preprocessing
-from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from transformers import BertTokenizer, CamembertTokenizer
 from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
 from transformers import get_linear_schedule_with_warmup
 import time
 import datetime
 import random
+import os
+
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-
-def remove_weak_classes(df, classColumnName, threshold):
-    dictOfClassInstances = create_dict(df, classColumnName)
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
-    keys = [*dictionary]
-    df_tmp = df[~df[classColumnName].isin(keys)]
-    df = pd.concat([df, df_tmp]).drop_duplicates(keep=False)
-    return df
-
-def resample_classes(df, classColumnName, numberOfInstances):
+def format_time(elapsed):
+    '''Takes a time in seconds and returns a string hh:mm:ss'''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
-    #random numberOfInstances elements
+    # Format as hh:mm:ss
-    replace = False  # with replacement
+    return str(datetime.timedelta(seconds=elapsed_rounded))
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
-    return df.groupby(classColumnName, as_index=False).apply(fn)

 ##############################################################################################################
+def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs=4):
 ########################## Setup GPU #########################################################################
 ##############################################################################################################
 # If there's a GPU available...
 if torch.cuda.is_available():

@@ -63,62 +53,18 @@ else:
-#############################################################################################################
-########################## parameters ###################################################################
-###########################################################################################################
-config = configparser.ConfigParser()
-config.read('settings.conf')
-
-dataPath = config.get('general', 'dataPath')
-columnText = config.get('general', 'columnText')
-columnClass = config.get('general', 'columnClass')
-minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
-maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))
-
-chosen_tokeniser = config.get('model', 'tokeniser')
-chosen_model = config.get('model', 'model')
-
-max_len = int(config.get('model', 'max_len_sequences'))
-
-#############################################################################################################
-########################## Load Data ###################################################################
-###########################################################################################################
-df = pd.read_csv(dataPath)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-df = resample_classes(df, columnClass, maxOfInstancePerClass)
-df = df[df[columnClass] != 'unclassified']
-
-y = df[columnClass]
-numberOfClasses = y.nunique()
-encoder = preprocessing.LabelEncoder()
-y = encoder.fit_transform(y)
-
-sentences = train_x[columnText].values
-labels = train_y.tolist()

 ############################################################################################################
 ########################## Model: Tokenization & Input Formatting ###################################################################
 ###########################################################################################################
-# Load the BERT tokenizer.
+    if chosen_model == 'bert-base-multilingual-cased':
-print('Loading BERT tokenizer...')
+        print('Loading Bert Tokenizer...')
-tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)

 # Tokenize all of the sentences and map the tokens to their word IDs.

@@ -157,7 +103,7 @@ for i in input_ids:
         padded.extend([i + [0] * (max_len - len(i))])
-input_ids = np.array(padded)
+    padded = np.array(padded)

@@ -177,11 +123,9 @@ for sent in padded:
 # Use 90% for training and 10% for validation.
 train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
                                                                                     random_state=2018, test_size=0.1, stratify=labels)
 # Do the same for the masks.
 train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                                        random_state=2018, test_size=0.1, stratify=labels)

 # Convert all inputs and labels into torch tensors, the required datatype

@@ -197,13 +141,11 @@ validation_masks = torch.tensor(validation_masks)
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

 # The DataLoader needs to know the batch size for training, so I specify it here.
 # For fine-tuning BERT on a specific task, the authors recommend a batch size of
 # 16 or 32.
-batch_size = int(config.get('model', 'batch_size'))

 # Create the DataLoader for training set.
 train_data = TensorDataset(train_inputs, train_masks, train_labels)

@@ -218,18 +160,15 @@ validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
 ############################################################################################################
 ########################## Model: Training ###################################################################
 ###########################################################################################################
 print('Selecting a model .....')
+    numberOfClasses = len(set(labels))
 # Load BertForSequenceClassification, the pretrained BERT model with a single
 # linear classification layer on top.
+    if chosen_model == 'bert-base-multilingual-cased':
 model = BertForSequenceClassification.from_pretrained(
     chosen_model,                # Use the 12-layer BERT model, with an uncased vocab.
     num_labels=numberOfClasses,  # The number of output labels--2 for binary classification.

@@ -237,6 +176,16 @@ model = BertForSequenceClassification.from_pretrained(
     output_attentions=False,     # Whether the model returns attentions weights.
     output_hidden_states=False,  # Whether the model returns all hidden-states.
 )
+    elif chosen_model == 'camembert-base':
+        model = CamembertForSequenceClassification.from_pretrained(
+            chosen_model,                # Use the 12-layer BERT model, with an uncased vocab.
+            num_labels=numberOfClasses,  # The number of output labels--2 for binary classification.
+                                         # You can increase this for multi-class tasks.
+            output_attentions=False,     # Whether the model returns attentions weights.
+            output_hidden_states=False,  # Whether the model returns all hidden-states.
+        )

 # Tell pytorch to run this model on the GPU.
 model.cuda()

@@ -251,8 +200,6 @@ optimizer = AdamW(model.parameters(),
-# Number of training epochs (authors recommend between 2 and 4)
-epochs = int(config.get('model', 'epochs'))

 # Total number of training steps is number of batches * number of epochs.
 total_steps = len(train_dataloader) * epochs

@@ -263,26 +210,6 @@ scheduler = get_linear_schedule_with_warmup(optimizer,
                                             num_training_steps=total_steps)

-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
-
-def format_time(elapsed):
-    '''Takes a time in seconds and returns a string hh:mm:ss'''
-    # Round to the nearest second.
-    elapsed_rounded = int(round((elapsed)))
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))

 # This training code is based on the `run_glue.py` script here:

@@ -463,3 +390,11 @@ for epoch_i in range(0, epochs):
     print("")
     print("Training complete!")
+    return model
+
+    '''
+    print('Saving Model....')
+    model_save_name = config.get('model', 'modelName')
+    path = config.get('model', 'path')
+    #torch.save(model.state_dict(), os.path.join(path,model_save_name))
+    torch.save(model, os.path.join(path,model_save_name))
+    '''
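
The commit saves the whole fine-tuned model object with torch.save(model, os.path.join(path, model_save_name)) in main.py, and keeps the same call in the commented-out block above. A minimal sketch of loading it back for inference follows; the directory and file name are assumptions mirroring the path and modelName config keys, not values taken from this commit.

import os
import torch

# Assumed values mirroring the 'path' and 'modelName' config keys; adjust to the real settings.
path = 'path/to/models'
model_save_name = 'bert_fine_tuned.pt'

# torch.save(model, ...) pickles the full module, so torch.load restores it directly;
# the transformers classes used at save time must be importable in this environment.
model = torch.load(os.path.join(path, model_save_name), map_location=torch.device('cpu'))
model.eval()  # switch to evaluation mode before prediction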