Commit ef245f29 authored by Alice Brenon

Keep reworking things, factorize source directory handling

parent 38de3c27
class Source:
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.root_path = root_path
def path_to(self, record):
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.root_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record.paragraph}.txt"
else:
return f"{prefix}.txt"
def load_text(self, record):
with open(self.path_to(record), 'r') as file:
return file.read()
def iterate(self, records):
for _, record in records.iterrows():
yield self.load_text(record)
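A minimal usage sketch for Source (hypothetical: the corpus path, work name and record values below are invented for illustration; only the work/volume/article/paragraph columns come from the code above):

import pandas

from Source import Source

# Invented records; each row exposes the columns used by path_to()
records = pandas.DataFrame([
    {'work': 'EDdA', 'volume': 1, 'article': 124, 'paragraph': 2},
    {'work': 'EDdA', 'volume': 1, 'article': 124, 'paragraph': 3},
])

source = Source('/path/to/corpus')
print(source.path_to(records.iloc[0]))  # -> /path/to/corpus/EDdA/T1/124/2.txt

# iterate() lazily opens one file per record and yields its contents
for text in source.iterate(records):
    print(text[:80])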
import torch
class WithGPU:
def __init__(self):
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
self.device = torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
self.device = torch.device("cpu")
import os
import pickle
from sklearn import preprocessing
import torch
from transformers import BertTokenizer
def get_device():
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
return torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
return torch.device("cpu")
def get_encoder(root_path, create_from=None):
path = f"{root_path}/label_encoder.pkl"
if os.path.isfile(path):
with open(path, 'rb') as pickled:
return pickle.load(pickled)
elif create_from is not None:
encoder = preprocessing.LabelEncoder()
encoder.fit(create_from)
with open(path, 'wb') as file:
pickle.dump(encoder, file)
return encoder
else:
raise FileNotFoundError(path)
def get_tokenizer():
model_name = 'bert-base-multilingual-cased'
print('Loading BERT tokenizer...')
return BertTokenizer.from_pretrained(model_name)
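A small usage sketch for these helpers (the model path and label values are hypothetical; get_encoder only fits and pickles a new LabelEncoder when create_from is given and no label_encoder.pkl exists yet):

device = get_device()        # cuda if available, else cpu
tokenizer = get_tokenizer()  # bert-base-multilingual-cased

# First run: fit and pickle the encoder from (invented) training labels
encoder = get_encoder('/path/to/model', create_from=['Géographie', 'Histoire'])
# Later runs: simply reload /path/to/model/label_encoder.pkl
encoder = get_encoder('/path/to/model')
print(list(encoder.classes_))  # ['Géographie', 'Histoire']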
#!/usr/bin/env python3
from gpu import WithGPU
from loaders import get_device, get_encoder, get_tokenizer
import numpy
import pandas
import pickle
import sklearn
from Source import Source
from sys import argv
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, TextClassificationPipeline
class Classifier(WithGPU):
class Classifier:
"""
A class wrapping all the different models and classes used throughout a
classification task:
@@ -22,20 +22,16 @@ class Classifier(WithGPU):
containing the texts to classify
"""
def __init__(self, root_path):
WithGPU.__init__(self)
self._init_tokenizer()
self.device = get_device()
self.tokenizer = get_tokenizer()
self._init_model(root_path)
self._init_pipe()
self._init_encoder(f"{root_path}/label_encoder.pkl")
self.encoder = get_encoder(root_path)
def _init_model(self, path):
bert = BertForSequenceClassification.from_pretrained(path)
self.model = bert.to(self.device.type)
def _init_tokenizer(self):
model_name = 'bert-base-multilingual-cased'
self.tokenizer = BertTokenizer.from_pretrained(model_name)
def _init_pipe(self):
self.pipe = TextClassificationPipeline(
model=self.model,
@@ -43,10 +39,6 @@ class Classifier(WithGPU):
return_all_scores=True,
device=self.device)
def _init_encoder(self, path):
with open(path, 'rb') as pickled:
self.encoder = pickle.load(pickled)
def __call__(self, text_generator):
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
predictions = []
@@ -55,37 +47,8 @@ class Classifier(WithGPU):
predictions.append([int(byScoreDesc[0]['label'][6:]),
byScoreDesc[0]['score'],
int(byScoreDesc[1]['label'][6:])])
predictions = numpy.array(predictions)
return list(self.encoder.inverse_transform(predictions[:,0].astype(int)))
class Source:
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.root_path = root_path
def path_to(self, record):
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.root_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record.paragraph}.txt"
else:
return f"{prefix}.txt"
def load_text(self, record):
with open(self.path_to(record), 'r') as file:
return file.read()
def iterate(self, records):
for _, record in records.iterrows():
yield self.load_text(record)
return self.encoder.inverse_transform(
numpy.array(predictions)[:,0].astype(int))
def label(classify, source, tsv_path, name='label'):
"""
......
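Putting the pieces together, a hypothetical end-to-end call (the paths and the TSV layout are assumptions; the records TSV just needs the work/volume/article columns expected by Source, and the actual script may instead go through the label() helper shown above):

import pandas

classifier = Classifier('/path/to/fine-tuned-model')  # directory also holding label_encoder.pkl
source = Source('/path/to/corpus')

records = pandas.read_csv('/path/to/records.tsv', sep='\t')
# __call__ consumes the text generator and returns one decoded label per record
records['label'] = classifier(source.iterate(records))
records.to_csv('/path/to/records-labelled.tsv', sep='\t', index=False)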