Commit 75f61841 authored by Alice Brenon

[WIP] Put classifier and trainer into a subdirectory to emphasise their being based on BERT

parent 4fce5770
 from loaders import get_device
 from transformers import BertForSequenceClassification, BertTokenizer
+
+def loader(f):
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')
+        print(f' - {name}', end='')
+        f(*args, **kwargs)
+        print(f'\r✔️ {name}')
+    return wrapped
+
 class BERT:
     model_name = 'bert-base-multilingual-cased'
-    def __init__(self, path):
+    def __init__(self, root_path, training=False):
         self.device = get_device()
         print('Loading BERT tools')
-        print(' - tokenizer', end='')
-        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
-        print('\r✔️ tokenizer')
-        print(' - classifier', end='')
-        bert = BertForSequenceClassification.from_pretrained(path)
-        self.model = bert.to(self.device.type)
-        print('\r✔️ classifier')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training)
+
+    @loader
+    def _init_tokenizer(self):
+        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
+
+    @loader
+    def _init_classifier(self, training, num_labels=2):
+        if training:
+            # training: start from the pretrained multilingual checkpoint
+            # with a fresh classification head
+            bert = BertForSequenceClassification.from_pretrained(
+                BERT.model_name,
+                num_labels=num_labels,      # 2 for binary classification,
+                                            # more for multi-class tasks
+                output_attentions=False,    # do not return attention weights
+                output_hidden_states=False) # do not return all hidden states
+        else:
+            # inference: reload the fine-tuned weights saved under root_path
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        self.model = bert.to(self.device.type)
+
+    def import_data(self, data):
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        self.model.save_pretrained(self.root_path)
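For orientation, a minimal sketch of how the reworked class is meant to be driven; the model directory below is hypothetical, and with training=False it must already contain weights written by a previous save():

    from BERT.Base import BERT

    bert = BERT('models/demo', training=False)             # hypothetical fine-tuned model directory
    inputs = bert.tokenizer('Some text', return_tensors='pt')
    batch = list(bert.import_data(inputs.values()))        # tensors moved to the selected device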
BERT/Base.py:

from transformers import BertForSequenceClassification, BertTokenizer
import os
import pickle
from sklearn import preprocessing
import torch

def get_device():
    if torch.cuda.is_available():
        print('We will use the GPU:', torch.cuda.get_device_name(0))
        return torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

def get_encoder(root_path, create_from=None):
    path = f"{root_path}/label_encoder.pkl"
    if os.path.isfile(path):
        with open(path, 'rb') as pickled:
            return pickle.load(pickled)
    elif create_from is not None:
        encoder = preprocessing.LabelEncoder()
        encoder.fit(create_from)
        with open(path, 'wb') as file:
            pickle.dump(encoder, file)
        return encoder
    else:
        raise FileNotFoundError(path)

def loader(f):
    def wrapped(*args, **kwargs):
        name = f.__name__.replace('_init_', '')
        print(f' - {name}', end='')
        f(*args, **kwargs)
        print(f'\r✔️ {name}')
    return wrapped

class BERT:
    model_name = 'bert-base-multilingual-cased'

    def __init__(self, root_path, training=False):
        self.device = get_device()
        print('Loading BERT tools')
        self._init_tokenizer()
        self.root_path = root_path
        self._init_classifier(training)

    @loader
    def _init_tokenizer(self):
        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)

    @loader
    def _init_classifier(self, training, num_labels=2):
        if training:
            # training: start from the pretrained multilingual checkpoint
            # with a fresh classification head
            bert = BertForSequenceClassification.from_pretrained(
                BERT.model_name,
                num_labels=num_labels,      # 2 for binary classification,
                                            # more for multi-class tasks
                output_attentions=False,    # do not return attention weights
                output_hidden_states=False) # do not return all hidden states
        else:
            # inference: reload the fine-tuned weights saved under root_path
            bert = BertForSequenceClassification.from_pretrained(self.root_path)
        self.model = bert.to(self.device.type)

    def import_data(self, data):
        return map(lambda d: d.to(self.device), data)

    def save(self):
        self.model.save_pretrained(self.root_path)
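A quick sketch of the get_encoder round-trip; the directory and label names here are made up:

    from BERT.Base import get_encoder

    # first call: no pickle on disk yet, so the encoder is fitted and cached
    encoder = get_encoder('models/demo', create_from=['geography', 'history', 'science'])
    print(encoder.transform(['history']))   # [1]: LabelEncoder sorts classes alphabetically
    # later calls reload the same mapping from models/demo/label_encoder.pkl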
BERT/Classifier.py:

from BERT.Base import BERT, get_encoder
import numpy
from tqdm import tqdm
from transformers import TextClassificationPipeline

class Classifier(BERT):
    """
    A class wrapping all the different models and classes used throughout a
    classification task based on BERT:

        - tokenizer
        - classifier
        - pipeline
        - label encoder

    Once created, it behaves as a function which you apply to a generator
    containing the texts to classify.
    """
    def __init__(self, root_path):
        BERT.__init__(self, root_path)
        self._init_pipe()
        self.encoder = get_encoder(root_path)

    def _init_pipe(self):
        self.pipe = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            return_all_scores=True,
            device=self.device)

    def __call__(self, text_generator):
        tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
        predictions = []
        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
            # the pipeline names classes 'LABEL_<n>'; keep the two best
            # candidates and recover their integer indices
            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
            predictions.append([int(byScoreDesc[0]['label'][6:]),
                                byScoreDesc[0]['score'],
                                int(byScoreDesc[1]['label'][6:])])
        # map the winning indices back to the original string labels
        return self.encoder.inverse_transform(
            numpy.array(predictions)[:, 0].astype(int))
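As the docstring says, an instance is applied directly to the texts to classify; a hedged usage sketch, assuming a model directory that contains both the fine-tuned weights and label_encoder.pkl:

    from BERT.Classifier import Classifier

    classify = Classifier('models/demo')                             # hypothetical directory
    print(classify(['first paragraph', 'second paragraph']))         # original string labels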
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.Trainer import Trainer
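These three re-exports (presumably the new package's __init__.py) turn the BERT/ subdirectory into a package facade, so call sites do not need to know the internal module layout:

    from BERT import Classifier, Trainer   # instead of BERT.Classifier, BERT.Trainer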
 import pandas
 from os import makedirs
-from os.path import dirname, isdir, isfile
+from os.path import dirname, isdir
 def abstract(f):
     def wrapped(*args, **kwargs):
@@ -43,15 +43,25 @@ class TSVIndexed(Corpus):
             self.keys.append('paragraph')

     @abstract
-    def get_content(self, key, row):
+    def content(self, key, row):
         pass

-    def get_all(self):
+    def keys(self, _, row):
+        return row[self.keys].to_dict()
+
+    def full(self, key, row):
+        d = self.keys(key, row)
+        d[self.column_name] = self.content(key, row).strip() + '\n'
+        return d
+
+    def get_all(self, projector=None):
+        if projector is None:
+            projector = self.full
+        elif type(projector) == str:
+            projector = self.__getattribute__(projector)
         self.load()
-        for key, row in self.data.iterrows():
-            keys = self.keys + [self.column_name]
-            values = key + (self.get_content(key, row).strip() + '\n',)
-            yield dict(zip(keys, values))
+        for row in self.data.iterrows():
+            yield projector(*row)

 class SelfContained(TSVIndexed):
     """
@@ -78,7 +88,7 @@ class SelfContained(TSVIndexed):
         primary_key = tuple(primary_key)
         return self.data.xs(primary_key)[self.column_name]

-    def get_content(self, _, row):
+    def content(self, _, row):
         return row[self.column_name]

     def save(self, iterator):
@@ -124,7 +134,7 @@ class Directory(TSVIndexed):
         with open(self.path_to(primary_key), 'r') as file:
             return file.read()

-    def get_content(self, key, _):
+    def content(self, key, _):
         return self.get_text(key)

     def write_text(self, primary_key, content):
@@ -140,10 +150,10 @@ class Directory(TSVIndexed):
             self.write_text(row, row[self.column_name])
         self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)

-def corpus(path):
-    if path[-1:] == '/':
-        return Directory(path)
+def corpus(path, **kwargs):
+    if path[-1:] == '/' or isdir(path):
+        return Directory(path, **kwargs)
     elif path[-4:] == '.tsv':
-        return SelfContained(path)
+        return SelfContained(path, **kwargs)
     else:
         raise FileNotFoundError(path)
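A sketch of the new projector mechanism; the corpus file name is hypothetical. The projector passed to get_all decides what each row is projected to: 'content' yields only the text, 'keys' only the metadata columns, and the default full projector a dict of both:

    from Corpus import corpus

    source = corpus('articles.tsv')           # hypothetical TSV corpus
    for text in source.get_all('content'):    # text-only generator, e.g. to feed a classifier
        print(text)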
-import os
-import pickle
-from sklearn import preprocessing
+import numpy
+import random
 import torch

-def get_device():
-    if torch.cuda.is_available():
-        print('We will use the GPU:', torch.cuda.get_device_name(0))
-        return torch.device("cuda")
-    else:
-        print('No GPU available, using the CPU instead.')
-        return torch.device("cpu")
-
-def get_encoder(root_path, create_from=None):
-    path = f"{root_path}/label_encoder.pkl"
-    if os.path.isfile(path):
-        with open(path, 'rb') as pickled:
-            return pickle.load(pickled)
-    elif create_from is not None:
-        encoder = preprocessing.LabelEncoder()
-        encoder.fit(create_from)
-        with open(path, 'wb') as file:
-            pickle.dump(encoder, file)
-        return encoder
-    else:
-        raise FileNotFoundError(path)
-
 def set_random():
     seed_value = 42
-    random.seed(seed_val)
-    np.random.seed(seed_val)
-    torch.manual_seed(seed_val)
-    torch.cuda.manual_seed_all(seed_val)
+    random.seed(seed_value)
+    numpy.random.seed(seed_value)
+    torch.manual_seed(seed_value)
+    torch.cuda.manual_seed_all(seed_value)
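set_random is presumably meant to be called once, before any data shuffling or weight initialisation, so that Python, NumPy and PyTorch all start from the same fixed seed:

    from loaders import set_random

    set_random()   # pin all three RNGs to seed 42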
 #!/usr/bin/env python3
-from BERT import BERT
-from loaders import get_encoder
-import numpy
+from BERT import Classifier
 import pandas
-import sklearn
-from Source import Source
+from Corpus import corpus
 from sys import argv
-from tqdm import tqdm
-from transformers import TextClassificationPipeline
-
-class Classifier(BERT):
-    """
-    A class wrapping all the different models and classes used throughout a
-    classification task based on BERT:
-        - tokenizer
-        - classifier
-        - pipeline
-        - label encoder
-    Once created, it behaves as a function which you apply to a generator
-    containing the texts to classify
-    """
-    def __init__(self, root_path):
-        BERT.__init__(self, root_path)
-        self._init_pipe()
-        self.encoder = get_encoder(root_path)
-
-    def _init_pipe(self):
-        self.pipe = TextClassificationPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            return_all_scores=True,
-            device=self.device)
-
-    def __call__(self, text_generator):
-        tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
-        predictions = []
-        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
-            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        return self.encoder.inverse_transform(
-            numpy.array(predictions)[:, 0].astype(int))
-
-def label(classify, source, tsv_path, name='label'):
+def label(classify, source, name='label'):
     """
     Make predictions on a set of documents

     Positional arguments
-    :param classify: an instance of the Classifier class above
-    :param source: an instance of the Source class above
-    :param tsv_path: the path to a TSV file containing (at least) article or
-        paragraph records (additional metadata will be ignored)
+    :param classify: an instance of the Classifier class
+    :param source: an instance of the Corpus class

     Keyword arguments
     :param name: defaults to 'label' — the name of the column to be created, that is
@@ -64,11 +21,11 @@ def label(classify, source, name='label'):
     :return: a pandas dataframe containing the records from the input TSV file plus
         an additional column
     """
-    records = pandas.read_csv(tsv_path, sep='\t')
-    records[name] = classify(source.iterate(records))
+    records = pandas.DataFrame(source.get_all('keys'))
+    records[name] = classify(source.get_all('content'))
     return records

 if __name__ == '__main__':
     classify = Classifier(argv[1])
-    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
+    source = corpus(argv[2])
+    label(classify, source).to_csv(argv[3], sep='\t', index=False)
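With the inlined Classifier gone and the Corpus abstraction in place, the entry point now takes three arguments instead of four; a hedged invocation, where the script name and all paths are made up:

    ./classify.py models/demo articles.tsv predictions.tsv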