Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
outillage
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Alice Brenon
outillage
Commits
ef245f29
Commit
ef245f29
authored
1 year ago
by
Alice Brenon
Browse files
Options
Downloads
Patches
Plain Diff
Keep reworking things, factorize source directory handling
parent
38de3c27
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
scripts/ML/Source.py
+29
-0
29 additions, 0 deletions
scripts/ML/Source.py
scripts/ML/gpu.py
+0
-10
0 additions, 10 deletions
scripts/ML/gpu.py
scripts/ML/loaders.py
+31
-0
31 additions, 0 deletions
scripts/ML/loaders.py
scripts/ML/predict.py
+8
-45
8 additions, 45 deletions
scripts/ML/predict.py
with
68 additions
and
55 deletions
scripts/ML/Source.py
0 → 100644
+
29
−
0
View file @
ef245f29
class Source:
    """
    Resolve the normalised paths used in the project and load the text of the
    corpus lazily, one record at a time.
    """

    def __init__(self, root_path):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes
        """
        self.root_path = root_path

    def path_to(self, record):
        """
        Build the path of the text file described by a record.

        :param record: a mapping providing the keys 'work', 'volume' and
        'article', and optionally 'paragraph' (a plain dict, or a row such as
        the ones yielded by `iterrows()`)
        :return: `<root_path>/<work>/T<volume>/<article>.txt`, or
        `<root_path>/<work>/T<volume>/<article>/<paragraph>.txt` when the
        record has a 'paragraph' key
        """
        article_relative_path = "{work}/T{volume}/{article}".format(**record)
        prefix = f"{self.root_path}/{article_relative_path}"
        if 'paragraph' in record:
            # Key access (not attribute access) so that the lookup agrees with
            # the membership test above and plain dicts are accepted too.
            return f"{prefix}/{record['paragraph']}.txt"
        return f"{prefix}.txt"

    def load_text(self, record):
        """
        Read the whole text file a record points to.

        :param record: a record accepted by `path_to`
        :return: the content of the file as a single string
        """
        with open(self.path_to(record), 'r') as file:
            return file.read()

    def iterate(self, records):
        """
        Lazily yield the text of each record.

        :param records: an object exposing `iterrows()` yielding
        `(index, record)` pairs; the index is ignored
        :return: a generator of file contents, in the order of the records
        """
        for _, record in records.iterrows():
            yield self.load_text(record)
This diff is collapsed.
Click to expand it.
scripts/ML/gpu.py
deleted
100644 → 0
+
0
−
10
View file @
38de3c27
import
torch
class WithGPU:
    """
    Base class that picks the torch device to compute on and stores it in
    `self.device`.
    """

    def __init__(self):
        """
        Select the "cuda" device when CUDA is available and the "cpu" device
        otherwise, announcing the choice on standard output.
        """
        if not torch.cuda.is_available():
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")
            return
        # Device 0 is only queried for its name, for the log message
        gpu_name = torch.cuda.get_device_name(0)
        print('We will use the GPU:', gpu_name)
        self.device = torch.device("cuda")
This diff is collapsed.
Click to expand it.
scripts/ML/loaders.py
0 → 100644
+
31
−
0
View file @
ef245f29
import
os
import
pickle
from
sklearn
import
preprocessing
import
torch
def get_device():
    """
    Pick the torch device to compute on.

    Prints which kind of device was selected, then returns the "cuda" device
    when CUDA is available and the "cpu" device otherwise.
    """
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")
    # Device 0 is only queried for its name, for the log message
    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)
    return torch.device("cuda")
def get_encoder(root_path, create_from=None):
    """
    Load the label encoder stored under a folder, creating it if requested.

    :param root_path: the folder expected to contain `label_encoder.pkl`
    :param create_from: optional labels; when the pickle file does not exist
    a new `LabelEncoder` is fitted on them and saved to that file
    :return: the unpickled object when the file exists, otherwise the newly
    fitted encoder
    :raises FileNotFoundError: when the file does not exist and `create_from`
    is None
    """
    encoder_path = f"{root_path}/label_encoder.pkl"

    # An existing file always wins, even when `create_from` is given
    if os.path.isfile(encoder_path):
        with open(encoder_path, 'rb') as stream:
            return pickle.load(stream)

    if create_from is None:
        raise FileNotFoundError(encoder_path)

    fitted = preprocessing.LabelEncoder()
    fitted.fit(create_from)
    # Persist the encoder so that later calls find it on disk
    with open(encoder_path, 'wb') as stream:
        pickle.dump(fitted, stream)
    return fitted
def get_tokenizer():
    """
    Load the pretrained BERT tokenizer used by the project.

    :return: the result of `BertTokenizer.from_pretrained` for the
    'bert-base-multilingual-cased' model
    """
    # Imported locally: the import block of this module does not bring
    # `BertTokenizer` into scope, so the call below raised a NameError.
    from transformers import BertTokenizer

    model_name = 'bert-base-multilingual-cased'
    print('Loading BERT tokenizer...')
    return BertTokenizer.from_pretrained(model_name)
This diff is collapsed.
Click to expand it.
scripts/ML/predict.py
+
8
−
45
View file @
ef245f29
#!/usr/bin/env python3
from
gpu
import
WithGPU
import
loaders
import
get_device
,
get_encoder
,
get_tokenizer
import
numpy
import
pandas
import
pickle
import
sklearn
from
Source
import
Source
from
sys
import
argv
from
tqdm
import
tqdm
from
transformers
import
BertForSequenceClassification
,
BertTokenizer
,
TextClassificationPipeline
class
Classifier
(
WithGPU
)
:
class
Classifier
:
"""
A class wrapping all the different models and classes used throughout a
classification task:
...
...
@@ -22,20 +22,16 @@ class Classifier(WithGPU):
containing the texts to classify
"""
def
__init__
(
self
,
root_path
):
WithGPU
.
__init__
(
self
)
self
.
_ini
t_tokenizer
()
self
.
device
=
get_device
(
)
self
.
tokenizer
=
ge
t_tokenizer
()
self
.
_init_model
(
root_path
)
self
.
_init_pipe
()
self
.
_ini
t_encoder
(
f
"
{
root_path
}
/label_encoder.pkl
"
)
self
.
encoder
=
ge
t_encoder
(
root_path
)
def
_init_model
(
self
,
path
):
bert
=
BertForSequenceClassification
.
from_pretrained
(
path
)
self
.
model
=
bert
.
to
(
self
.
device
.
type
)
def
_init_tokenizer
(
self
):
model_name
=
'
bert-base-multilingual-cased
'
self
.
tokenizer
=
BertTokenizer
.
from_pretrained
(
model_name
)
def
_init_pipe
(
self
):
self
.
pipe
=
TextClassificationPipeline
(
model
=
self
.
model
,
...
...
@@ -43,10 +39,6 @@ class Classifier(WithGPU):
return_all_scores
=
True
,
device
=
self
.
device
)
def
_init_encoder
(
self
,
path
):
with
open
(
path
,
'
rb
'
)
as
pickled
:
self
.
encoder
=
pickle
.
load
(
pickled
)
def
__call__
(
self
,
text_generator
):
tokenizer_kwargs
=
{
'
padding
'
:
True
,
'
truncation
'
:
True
,
'
max_length
'
:
512
}
predictions
=
[]
...
...
@@ -55,37 +47,8 @@ class Classifier(WithGPU):
predictions
.
append
([
int
(
byScoreDesc
[
0
][
'
label
'
][
6
:]),
byScoreDesc
[
0
][
'
score
'
],
int
(
byScoreDesc
[
1
][
'
label
'
][
6
:])])
predictions
=
numpy
.
array
(
predictions
)
return
list
(
self
.
encoder
.
inverse_transform
(
predictions
[:,
0
].
astype
(
int
)))
class Source:
    """
    Resolve the normalised paths used in the project and load the text of the
    corpus lazily, one record at a time.
    """

    def __init__(self, root_path):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes
        """
        self.root_path = root_path

    def path_to(self, record):
        """
        Build the path of the text file described by a record.

        :param record: a mapping providing the keys 'work', 'volume' and
        'article', and optionally 'paragraph' (a plain dict, or a row such as
        the ones yielded by `iterrows()`)
        :return: `<root_path>/<work>/T<volume>/<article>.txt`, or
        `<root_path>/<work>/T<volume>/<article>/<paragraph>.txt` when the
        record has a 'paragraph' key
        """
        article_relative_path = "{work}/T{volume}/{article}".format(**record)
        prefix = f"{self.root_path}/{article_relative_path}"
        if 'paragraph' in record:
            # Key access (not attribute access) so that the lookup agrees with
            # the membership test above and plain dicts are accepted too.
            return f"{prefix}/{record['paragraph']}.txt"
        return f"{prefix}.txt"

    def load_text(self, record):
        """
        Read the whole text file a record points to.

        :param record: a record accepted by `path_to`
        :return: the content of the file as a single string
        """
        with open(self.path_to(record), 'r') as file:
            return file.read()

    def iterate(self, records):
        """
        Lazily yield the text of each record.

        :param records: an object exposing `iterrows()` yielding
        `(index, record)` pairs; the index is ignored
        :return: a generator of file contents, in the order of the records
        """
        for _, record in records.iterrows():
            yield self.load_text(record)
return
self
.
encoder
.
inverse_transform
(
numpy
.
array
(
predictions
)[:,
0
].
astype
(
int
))
def
label
(
classify
,
source
,
tsv_path
,
name
=
'
label
'
):
"""
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment