Commit c37c1787 authored by Alice Brenon

Implement BERT trainer

parent 02253ba3
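
# BERT/Base.py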
from transformers import BertForSequenceClassification, BertTokenizer
import os
import pickle
from sklearn import preprocessing
import torch
def get_device():
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
return torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
return torch.device("cpu")
def loader(f):
def wrapped(*args, **kwargs):
name = f.__name__.replace('_init_', '')
print(f' - {name}', end='')
f(*args, **kwargs)
print(f'\r✔️ {name}')
return wrapped
class BERT:
model_name = 'bert-base-multilingual-cased'
def __init__(self, root_path, train_on=None):
self.device = get_device()
print('Loading BERT tools')
self._init_tokenizer()
self.root_path = root_path
self._init_classifier(train_on)
self._init_encoder(train_on)
@loader
def _init_tokenizer(self):
self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
@loader
def _init_classifier(self, train_on):
if train_on is not None:
bert = BertForSequenceClassification.from_pretrained(
                BERT.model_name, # the 12-layer multilingual BERT model, with a cased vocab
num_labels = len(train_on),
output_attentions = False,
output_hidden_states = False
)
else:
bert = BertForSequenceClassification.from_pretrained(self.root_path)
self.model = bert.to(self.device.type)
@loader
def _init_encoder(self, train_on):
path = f"{self.root_path}/label_encoder.pkl"
if os.path.isfile(path):
with open(path, 'rb') as pickled:
self.encoder = pickle.load(pickled)
elif train_on is not None:
self.encoder = preprocessing.LabelEncoder()
self.encoder.fit(train_on)
with open(path, 'wb') as file:
pickle.dump(self.encoder, file)
else:
raise FileNotFoundError(path)
def import_data(self, data):
return map(lambda d: d.to(self.device), data)
def save(self):
self.model.save_pretrained(self.root_path)
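
# BERT/Classifier.py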
from BERT.Base import BERT
import numpy
from tqdm import tqdm
from transformers import TextClassificationPipeline
class Classifier(BERT):
"""
    A class wrapping all the different models and classes used throughout a
    BERT-based classification task:
- tokenizer
- classifier
- pipeline
- label encoder
Once created, it behaves as a function which you apply to a generator
containing the texts to classify
"""
def __init__(self, root_path):
BERT.__init__(self, root_path)
self._init_pipe()
def _init_pipe(self):
self.pipe = TextClassificationPipeline(
model=self.model,
tokenizer=self.tokenizer,
return_all_scores=True,
device=self.device)
def __call__(self, text_generator):
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
predictions = []
for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
            # pipeline labels look like 'LABEL_<n>': strip the 6-character
            # prefix to recover the encoded class index
            predictions.append([int(byScoreDesc[0]['label'][6:]),
                                byScoreDesc[0]['score'],
                                int(byScoreDesc[1]['label'][6:])])
return self.encoder.inverse_transform(
numpy.array(predictions)[:,0].astype(int))
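
# Illustrative usage sketch (assumptions: 'models/geode' is a hypothetical
# directory produced by a previous training run; any iterable of strings
# works as input). Once built, the Classifier is applied directly to the
# texts and returns the decoded labels:
#
#     from BERT import Classifier
#     classify = Classifier('models/geode')
#     print(classify(iter(["Premier texte à classer", "Second texte"])))

# BERT/Trainer.py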
from BERT.Base import BERT
import datetime
from loaders import set_random
import time
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
def chrono(f):
    def wrapped(*args, **kwargs):
        t0 = time.time()
        # keep the wrapped function's result so that @chrono-decorated
        # methods such as epoch() still return their value
        result = f(*args, **kwargs)
        duration = datetime.timedelta(seconds=round(time.time() - t0))
        print(f"\n {f.__name__} took: {duration}")
        return result
    return wrapped
class Trainer(BERT):
def __init__(self, root_path, labeled_data, epochs=4):
self.epochs = epochs
BERT.__init__(self, root_path, train_on=labeled_data.unique)
self._init_utils(labeled_data.load(self))
def _init_utils(self, data_loader):
self.optimizer = AdamW(
self.model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5
)
self.data_loader = data_loader
self.scheduler = get_linear_schedule_with_warmup(
self.optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = self.epochs * len(data_loader))
def __call__(self):
set_random()
losses = [self.epoch(e) for e in range(self.epochs)]
self.save()
print("\nTraining complete!")
@chrono
def epoch(self, epoch):
self._start_epoch(epoch)
self.model.train()
total_loss = sum([self.learn_on(*self.import_data(batch))
for batch in self.data_loader])
avg_train_loss = total_loss / len(self.data_loader)
print("\n Average training loss: {0:.2f}".format(avg_train_loss))
return avg_train_loss
def learn_on(self, input_ids, input_mask, labels):
self.model.zero_grad()
outputs = self.model(input_ids,
token_type_ids=None,
attention_mask=input_mask,
labels=labels)
loss = outputs[0]
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optimizer.step()
self.scheduler.step()
return loss.item()
def _start_epoch(self, epoch):
print(f'\n======== Epoch {epoch+1} / {self.epochs} ========')
print('Training...')
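
# BERT/__init__.py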
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.Trainer import Trainer
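
# Corpus.py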
import pandas
from os import makedirs
from os.path import dirname, isdir
def abstract(f):
def wrapped(*args, **kwargs):
raise NotImplementedError(f.__name__)
return wrapped
class Corpus:
@abstract
    def __init__(self):
pass
@abstract
def get_text(self, primary_key):
pass
@abstract
def get_all(self):
pass
@abstract
def save(self, iterator):
pass
class TSVIndexed(Corpus):
default_keys = ['work', 'volume', 'article']
projectors = ['key', 'content', 'full']
def __init__(self, tsv_path, column_name):
self.tsv_path = tsv_path
self.column_name = column_name
self.data = None
def load(self):
if self.data is None:
self.data = pandas.read_csv(self.tsv_path, sep='\t')
self.detect_keys()
self.data = self.data.set_index(self.keys, drop=False)
def detect_keys(self):
self.keys = self.default_keys.copy()
if 'paragraph' in self.data:
self.keys.append('paragraph')
@abstract
def content(self, key, row):
pass
def key(self, _, row):
return row[self.keys].to_dict()
def full(self, key, row):
d = self.key(key, row)
d[self.column_name] = self.content(key, row).strip() + '\n'
return d
def get_all(self, projector=None):
if projector is None:
projector = self.full
        elif isinstance(projector, str) and projector in self.projectors:
            projector = getattr(self, projector)
self.load()
for row in self.data.iterrows():
yield projector(*row)
class SelfContained(TSVIndexed):
"""
A class to handle the dataset TSV normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, tsv_path, column_name='content'):
"""
Positional arguments
:param tsv_path: the path to a TSV dataset containing a primary key and
a text content on every line
Keyword arguments
:param column_name: the name of the column where the text content is
stored
"""
TSVIndexed.__init__(self, tsv_path, column_name)
def get_text(self, primary_key):
self.load()
if type(primary_key) == dict:
primary_key = [primary_key[k] for k in self.keys if k in primary_key]
if type(primary_key) != tuple:
primary_key = tuple(primary_key)
return self.data.xs(primary_key)[self.column_name]
def content(self, _, row):
return row[self.column_name]
def save(self, iterator):
self.data = pandas.DataFrame(iterator)
self.detect_keys()
self.data.to_csv(self.tsv_path, sep='\t', index=False)
class Directory(TSVIndexed):
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path, column_name='content'):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.text_path = f"{root_path}/Text"
TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)
def path_to(self, primary_key):
record = self.dict_primary_key(primary_key)
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.text_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record['paragraph']}.txt"
else:
return f"{prefix}.txt"
def dict_primary_key(self, primary_key):
if type(primary_key) == pandas.core.series.Series:
return dict(primary_key)
elif type(primary_key) != dict:
keys = self.default_keys.copy()
if len(primary_key) == 4:
keys.append('paragraph')
return dict(zip(keys, primary_key))
else:
return primary_key
def get_text(self, primary_key):
with open(self.path_to(primary_key), 'r') as file:
return file.read()
def content(self, key, _):
return self.get_text(key)
def write_text(self, primary_key, content):
path = self.path_to(primary_key)
makedirs(dirname(path), exist_ok=True)
with open(path, 'w') as file:
file.write(content)
def save(self, iterator):
self.data = pandas.DataFrame(iterator)
self.detect_keys()
for _, row in self.data.iterrows():
self.write_text(row, row[self.column_name])
self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
def corpus(path, **kwargs):
if path[-1:] == '/' or isdir(path):
return Directory(path, **kwargs)
elif path[-4:] == '.tsv':
return SelfContained(path, **kwargs)
else:
raise FileNotFoundError(path)
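
# Illustrative usage sketch (assumption: 'articles.tsv' is a hypothetical TSV
# dataset with the primary-key columns and a 'content' column). corpus()
# picks the implementation from the path, and get_all() accepts a projector
# name controlling what each record yields:
#
#     source = corpus('articles.tsv')        # a SelfContained corpus
#     keys = list(source.get_all('key'))     # dicts of primary keys only
#     texts = source.get_all('content')      # generator of raw text contents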
#!/usr/bin/env python3
from Corpus import corpus
import sys
if __name__ == '__main__':
source = corpus(sys.argv[1])
destination = corpus(sys.argv[2])
destination.save(source.get_all())
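
# loaders.py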
import numpy
import random
import torch
def set_random():
seed_value = 42
random.seed(seed_value)
numpy.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
#!/usr/bin/env python3
from BERT import Classifier
from Corpus import corpus
import pandas
from sys import argv
def label(classify, source, name='label'):
    """
    Make predictions on a set of documents

    Positional arguments
    :param classify: an instance of the Classifier class
    :param source: an instance of the Corpus class

    Keyword arguments
    :param name: defaults to 'label' — the name of the column to be created,
    that is, the column receiving the predicted class of each document

    :return: a pandas dataframe containing the records from the input corpus
    plus an additional column
    """
    records = pandas.DataFrame(source.get_all('key'))
    records[name] = classify(source.get_all('content'))
    return records
if __name__ == '__main__':
classify = Classifier(argv[1])
    source = corpus(argv[2])
    label(classify, source).to_csv(argv[3], sep='\t', index=False)
#!/usr/bin/env python3
from BERT import Trainer
from LabeledData import LabeledData
import sys
if __name__ == '__main__':
labeled_data = LabeledData(sys.argv[1])
trainer = Trainer(sys.argv[2], labeled_data)
trainer()