diff --git a/scripts/ML/BERT/Base.py b/scripts/ML/BERT/Base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d1b8e50cad8fe60ed1fbef0d3b9ee5029bebb2
--- /dev/null
+++ b/scripts/ML/BERT/Base.py
@@ -0,0 +1,69 @@
+from transformers import BertForSequenceClassification, BertTokenizer
+import os
+import pickle
+from sklearn import preprocessing
+import torch
+
+def get_device():
+    if torch.cuda.is_available():
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+        return torch.device("cuda")
+    else:
+        print('No GPU available, using the CPU instead.')
+        return torch.device("cpu")
+
+def loader(f):
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')
+        print(f' - {name}', end='')
+        f(*args, **kwargs)
+        print(f'\r✔️ {name}')
+    return wrapped
+
+class BERT:
+    model_name = 'bert-base-multilingual-cased'
+
+    def __init__(self, root_path, train_on=None):
+        self.device = get_device()
+        print('Loading BERT tools')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(train_on)
+        self._init_encoder(train_on)
+
+    @loader
+    def _init_tokenizer(self):
+        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
+
+    @loader
+    def _init_classifier(self, train_on):
+        if train_on is not None:
+            bert = BertForSequenceClassification.from_pretrained(
+                BERT.model_name, # the 12-layer multilingual BERT model, with a cased vocab
+                num_labels = len(train_on),
+                output_attentions = False,
+                output_hidden_states = False
+            )
+        else:
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        self.model = bert.to(self.device.type)
+
+    @loader
+    def _init_encoder(self, train_on):
+        path = f"{self.root_path}/label_encoder.pkl"
+        if os.path.isfile(path):
+            with open(path, 'rb') as pickled:
+                self.encoder = pickle.load(pickled)
+        elif train_on is not None:
+            self.encoder = preprocessing.LabelEncoder()
+            self.encoder.fit(train_on)
+            with open(path, 'wb') as file:
+                pickle.dump(self.encoder, file)
+        else:
+            raise FileNotFoundError(path)
+
+    def import_data(self, data):
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        self.model.save_pretrained(self.root_path)
diff --git a/scripts/ML/BERT/Classifier.py b/scripts/ML/BERT/Classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..04bcffa0aa8361b0748444479b1177f1c8bb152b
--- /dev/null
+++ b/scripts/ML/BERT/Classifier.py
@@ -0,0 +1,39 @@
+from BERT.Base import BERT
+import numpy
+from tqdm import tqdm
+from transformers import TextClassificationPipeline
+
+class Classifier(BERT):
+    """
+    A class wrapping all the different models and classes used throughout a
+    classification task and based on BERT:
+
+    - tokenizer
+    - classifier
+    - pipeline
+    - label encoder
+
+    Once created, it behaves as a function which you apply to a generator
+    containing the texts to classify
+    """
+    def __init__(self, root_path):
+        BERT.__init__(self, root_path)
+        self._init_pipe()
+
+    def _init_pipe(self):
+        self.pipe = TextClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            return_all_scores=True,
+            device=self.device)
+
+    def __call__(self, text_generator):
+        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
+        predictions = []
+        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
+            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
+            predictions.append([int(byScoreDesc[0]['label'][6:]),
+                                byScoreDesc[0]['score'],
+                                int(byScoreDesc[1]['label'][6:])])
+        return self.encoder.inverse_transform(
+            numpy.array(predictions)[:,0].astype(int))
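For context, a minimal usage sketch of the refactored Classifier; the model directory and the texts below are placeholders, not part of this changeset:

```python
from BERT import Classifier

# 'model_dir' stands for a directory produced by Trainer.save(): it must
# contain the fine-tuned weights and the pickled label encoder
classify = Classifier('model_dir')

# behaves as a function over a list (or generator) of texts to classify
texts = ["Premier texte à classer…", "Second texte à classer…"]
print(classify(texts))  # one decoded label per input text
```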
diff --git a/scripts/ML/BERT/Trainer.py b/scripts/ML/BERT/Trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..06851eef22cfce4f575beaf358abde39d713e53d
--- /dev/null
+++ b/scripts/ML/BERT/Trainer.py
@@ -0,0 +1,65 @@
+from BERT.Base import BERT
+import datetime
+from loaders import set_random
+import time
+import torch
+from transformers import AdamW, get_linear_schedule_with_warmup
+
+def chrono(f):
+    def wrapped(*args, **kwargs):
+        t0 = time.time()
+        result = f(*args, **kwargs)
+        duration = datetime.timedelta(seconds=round(time.time() - t0))
+        print(f"\n {f.__name__} took: {duration}")
+        return result
+    return wrapped
+
+class Trainer(BERT):
+    def __init__(self, root_path, labeled_data, epochs=4):
+        self.epochs = epochs
+        BERT.__init__(self, root_path, train_on=labeled_data.unique)
+        self._init_utils(labeled_data.load(self))
+
+    def _init_utils(self, data_loader):
+        self.optimizer = AdamW(
+            self.model.parameters(),
+            lr = 2e-5, # args.learning_rate - default is 5e-5
+        )
+        self.data_loader = data_loader
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer,
+            num_warmup_steps = 0, # Default value in run_glue.py
+            num_training_steps = self.epochs * len(data_loader))
+
+    def __call__(self):
+        set_random()
+        losses = [self.epoch(e) for e in range(self.epochs)]
+        self.save()
+        print("\nTraining complete!")
+
+    @chrono
+    def epoch(self, epoch):
+        self._start_epoch(epoch)
+        self.model.train()
+        total_loss = sum([self.learn_on(*self.import_data(batch))
+                          for batch in self.data_loader])
+        avg_train_loss = total_loss / len(self.data_loader)
+        print("\n Average training loss: {0:.2f}".format(avg_train_loss))
+        return avg_train_loss
+
+    def learn_on(self, input_ids, input_mask, labels):
+        self.model.zero_grad()
+        outputs = self.model(input_ids,
+                             token_type_ids=None,
+                             attention_mask=input_mask,
+                             labels=labels)
+        loss = outputs[0]
+        loss.backward()
+        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+        self.optimizer.step()
+        self.scheduler.step()
+        return loss.item()
+
+    def _start_epoch(self, epoch):
+        print(f'\n======== Epoch {epoch+1} / {self.epochs} ========')
+        print('Training...')
diff --git a/scripts/ML/BERT/__init__.py b/scripts/ML/BERT/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..50cbcc17fd40517cb519e00e8e1c03a857e2d5b4
--- /dev/null
+++ b/scripts/ML/BERT/__init__.py
@@ -0,0 +1,3 @@
+from BERT.Base import BERT
+from BERT.Classifier import Classifier
+from BERT.Trainer import Trainer
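Trainer depends on a LabeledData class that is not part of this diff: it reads labeled_data.unique to size the classifier head and calls labeled_data.load(self) to obtain batches. As an illustration only, a stand-in matching that contract could look roughly like this; the class name, constructor arguments and batch size are all assumptions:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

class LabeledDataSketch:
    """Hypothetical stand-in for the LabeledData class imported by train.py."""
    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels
        self.unique = sorted(set(labels))  # Trainer sizes the classifier head with this

    def load(self, bert):
        # Trainer.learn_on expects batches of (input_ids, attention_mask, labels);
        # reuse the shared tokenizer and the fitted label encoder from BERT.Base
        tokens = bert.tokenizer(self.texts, padding=True, truncation=True,
                                max_length=512, return_tensors='pt')
        labels = torch.tensor(bert.encoder.transform(self.labels))
        dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)
        return DataLoader(dataset, batch_size=16, shuffle=True)
```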
diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py
new file mode 100644
index 0000000000000000000000000000000000000000..159092b53763e18f1b4b225790e1c6d42de05337
--- /dev/null
+++ b/scripts/ML/Corpus.py
@@ -0,0 +1,161 @@
+import pandas
+from os import makedirs
+from os.path import dirname, isdir
+
+def abstract(f):
+    def wrapped(*args, **kwargs):
+        raise NotImplementedError(f.__name__)
+    return wrapped
+
+class Corpus:
+    @abstract
+    def __init__(self):
+        pass
+
+    @abstract
+    def get_text(self, primary_key):
+        pass
+
+    @abstract
+    def get_all(self):
+        pass
+
+    @abstract
+    def save(self, iterator):
+        pass
+
+class TSVIndexed(Corpus):
+    default_keys = ['work', 'volume', 'article']
+    projectors = ['key', 'content', 'full']
+
+    def __init__(self, tsv_path, column_name):
+        self.tsv_path = tsv_path
+        self.column_name = column_name
+        self.data = None
+
+    def load(self):
+        if self.data is None:
+            self.data = pandas.read_csv(self.tsv_path, sep='\t')
+            self.detect_keys()
+            self.data = self.data.set_index(self.keys, drop=False)
+
+    def detect_keys(self):
+        self.keys = self.default_keys.copy()
+        if 'paragraph' in self.data:
+            self.keys.append('paragraph')
+
+    @abstract
+    def content(self, key, row):
+        pass
+
+    def key(self, _, row):
+        return row[self.keys].to_dict()
+
+    def full(self, key, row):
+        d = self.key(key, row)
+        d[self.column_name] = self.content(key, row).strip() + '\n'
+        return d
+
+    def get_all(self, projector=None):
+        if projector is None:
+            projector = self.full
+        elif type(projector) == str and projector in self.projectors:
+            projector = self.__getattribute__(projector)
+        self.load()
+        for row in self.data.iterrows():
+            yield projector(*row)
+
+class SelfContained(TSVIndexed):
+    """
+    A class to handle a self-contained TSV dataset, loading the actual text
+    input as a generator from its records when it is needed
+    """
+    def __init__(self, tsv_path, column_name='content'):
+        """
+        Positional arguments
+        :param tsv_path: the path to a TSV dataset containing a primary key and
+        a text content on every line
+
+        Keyword arguments
+        :param column_name: the name of the column where the text content is
+        stored
+        """
+        TSVIndexed.__init__(self, tsv_path, column_name)
+
+    def get_text(self, primary_key):
+        self.load()
+        if type(primary_key) == dict:
+            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
+        if type(primary_key) != tuple:
+            primary_key = tuple(primary_key)
+        return self.data.xs(primary_key)[self.column_name]
+
+    def content(self, _, row):
+        return row[self.column_name]
+
+    def save(self, iterator):
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        self.data.to_csv(self.tsv_path, sep='\t', index=False)
+
+class Directory(TSVIndexed):
+    """
+    A class to handle the normalised path used in the project and loading the
+    actual text input as a generator from records when they are needed
+    """
+    def __init__(self, root_path, column_name='content'):
+        """
+        Positional arguments
+        :param root_path: the path to a GÉODE-style folder containing the text
+        version of the corpus on which to predict the classes
+        """
+        self.text_path = f"{root_path}/Text"
+        TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)
+
+    def path_to(self, primary_key):
+        record = self.dict_primary_key(primary_key)
+        article_relative_path = "{work}/T{volume}/{article}".format(**record)
+        prefix = f"{self.text_path}/{article_relative_path}"
+        if 'paragraph' in record:
+            return f"{prefix}/{record['paragraph']}.txt"
+        else:
+            return f"{prefix}.txt"
+
+    def dict_primary_key(self, primary_key):
+        if type(primary_key) == pandas.core.series.Series:
+            return dict(primary_key)
+        elif type(primary_key) != dict:
+            keys = self.default_keys.copy()
+            if len(primary_key) == 4:
+                keys.append('paragraph')
+            return dict(zip(keys, primary_key))
+        else:
+            return primary_key
+
+    def get_text(self, primary_key):
+        with open(self.path_to(primary_key), 'r') as file:
+            return file.read()
+
+    def content(self, key, _):
+        return self.get_text(key)
+
+    def write_text(self, primary_key, content):
+        path = self.path_to(primary_key)
+        makedirs(dirname(path), exist_ok=True)
+        with open(path, 'w') as file:
+            file.write(content)
+
+    def save(self, iterator):
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        for _, row in self.data.iterrows():
+            self.write_text(row, row[self.column_name])
+        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
+
+def corpus(path, **kwargs):
+    if path[-1:] == '/' or isdir(path):
+        return Directory(path, **kwargs)
+    elif path[-4:] == '.tsv':
+        return SelfContained(path, **kwargs)
+    else:
+        raise FileNotFoundError(path)
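A short sketch of how corpus() and the projectors compose; the paths and key values are invented for the example:

```python
from Corpus import corpus

source = corpus('corpus-directory/')  # trailing '/' or existing dir -> Directory
dataset = corpus('dataset.tsv')       # '.tsv' suffix -> SelfContained

# Each projector selects what get_all() yields per record:
keys = list(source.get_all('key'))    # the primary-key columns only, as dicts
texts = source.get_all('content')     # the raw text, lazily, as a generator
dataset.save(source.get_all())        # 'full' (default): keys plus text column

# Random access by primary key: (work, volume, article[, paragraph])
print(source.get_text(('EDdA', 1, 42)))
```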
diff --git a/scripts/ML/convert-corpus.py b/scripts/ML/convert-corpus.py
new file mode 100755
index 0000000000000000000000000000000000000000..a37fb2c0e623b07542f75a8127aed6cbe43352d9
--- /dev/null
+++ b/scripts/ML/convert-corpus.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+from Corpus import corpus
+import sys
+
+if __name__ == '__main__':
+    source = corpus(sys.argv[1])
+    destination = corpus(sys.argv[2])
+    destination.save(source.get_all())
diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..93986f4f4d260f7c92473a5cd5909547da690e4f
--- /dev/null
+++ b/scripts/ML/loaders.py
@@ -0,0 +1,10 @@
+import numpy
+import random
+import torch
+
+def set_random():
+    seed_value = 42
+    random.seed(seed_value)
+    numpy.random.seed(seed_value)
+    torch.manual_seed(seed_value)
+    torch.cuda.manual_seed_all(seed_value)
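set_random pins every RNG the training path touches (python, numpy and torch) to the same seed. A tiny, purely illustrative check of the guarantee it provides:

```python
import torch
from loaders import set_random

set_random()
a = torch.rand(3)
set_random()
b = torch.rand(3)
assert torch.equal(a, b)  # re-seeding reproduces the exact same draws
```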
diff --git a/scripts/ML/predict.py b/scripts/ML/predict.py
index 974fc69a27bcaa230b2ca5d046849daae89c8758..2dfa3e867e1b39a78d34c160433792de49f7a2e5 100644
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predict.py
@@ -1,108 +1,16 @@
 #!/usr/bin/env python3
-import numpy
+from BERT import Classifier
 import pandas
-import pickle
-import sklearn
+from Corpus import corpus
 from sys import argv
-import torch
-from tqdm import tqdm
-from transformers import BertForSequenceClassification, BertTokenizer, TextClassificationPipeline
 
-class Classifier:
-    """
-    A class wrapping all the different models and classes used throughout a
-    classification task:
-
-    - tokenizer
-    - classifier
-    - pipeline
-    - label encoder
-
-    Once created, it behaves as a function which you apply to a generator
-    containing the texts to classify
-    """
-    def __init__(self, root_path):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self._init_tokenizer()
-        self._init_model(root_path)
-        self._init_pipe()
-        self._init_encoder(f"{root_path}/label_encoder.pkl")
-        self.log()
-
-    def _init_model(self, path):
-        bert = BertForSequenceClassification.from_pretrained(path)
-        self.model = bert.to(self.device.type)
-
-    def _init_tokenizer(self):
-        model_name = 'bert-base-multilingual-cased'
-        self.tokenizer = BertTokenizer.from_pretrained(model_name)
-
-    def _init_pipe(self):
-        self.pipe = TextClassificationPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            return_all_scores=True,
-            device=self.device)
-
-    def _init_encoder(self, path):
-        with open(path, 'rb') as pickled:
-            self.encoder = pickle.load(pickled)
-
-    def log(self):
-        if self.device.type == 'cpu':
-            print('No GPU available, using the CPU instead.')
-        else:
-            print('We will use the GPU:', torch.cuda.get_device_name(0))
-
-    def __call__(self, text_generator):
-        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
-        predictions = []
-        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
-            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        predictions = numpy.array(predictions)
-        return list(self.encoder.inverse_transform(predictions[:,0].astype(int)))
-
-class Source:
-    """
-    A class to handle the normalised path used in the project and loading the
-    actual text input as a generator from records when they are needed
-    """
-    def __init__(self, root_path):
-        """
-        Positional arguments
-        :param root_path: the path to a GÉODE-style folder containing the text
-        version of the corpus on which to predict the classes
-        """
-        self.root_path = root_path
-
-    def path_to(self, record):
-        article_relative_path = "{work}/T{volume}/{article}".format(**record)
-        prefix = f"{self.root_path}/{article_relative_path}"
-        if 'paragraph' in record:
-            return f"{prefix}/{record.paragraph}.txt"
-        else:
-            return f"{prefix}.txt"
-
-    def load_text(self, record):
-        with open(self.path_to(record), 'r') as file:
-            return file.read()
-
-    def iterate(self, records):
-        for _, record in records.iterrows():
-            yield self.load_text(record)
-
-def label(classify, source, tsv_path, name='label'):
+def label(classify, source, name='label'):
     """
     Make predictions on a set of document
 
     Positional arguments
-    :param classify: an instance of the Classifier class above
-    :param source: an instance of the Source class above
-    :param tsv_path: the path to a TSV file containing (at least) article or
-    paragraph records (additional metadata will be ignored)
+    :param classify: an instance of the Classifier class
+    :param source: an instance of the Corpus class
 
     Keyword arguments
     :param name: defaults to 'label' — the name of the column to be created, that is
@@ -113,11 +21,11 @@ def label(classify, source, tsv_path, name='label'):
     :return: a panda dataframe containing the records from the input TSV file
     plus an additional column
     """
-    records = pandas.read_csv(tsv_path, sep='\t')
-    records[name] = classify(source.iterate(records))
+    records = pandas.DataFrame(source.get_all('key'))
+    records[name] = classify(source.get_all('content'))
     return records
 
 if __name__ == '__main__':
     classify = Classifier(argv[1])
-    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
+    source = corpus(argv[2])
+    label(classify, source).to_csv(argv[3], sep='\t', index=False)
diff --git a/scripts/ML/train.py b/scripts/ML/train.py
new file mode 100755
index 0000000000000000000000000000000000000000..95b812f79dde5dcaee522849126dd8188407a731
--- /dev/null
+++ b/scripts/ML/train.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+from BERT import Trainer
+from LabeledData import LabeledData
+import sys
+
+if __name__ == '__main__':
+    labeled_data = LabeledData(sys.argv[1])
+    trainer = Trainer(sys.argv[2], labeled_data)
+    trainer()
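Taken together, the two entry points chain as ./train.py dataset.tsv model_dir followed by ./predict.py model_dir corpus/ predictions.tsv. A programmatic sketch of the same end-to-end run, assuming a LabeledData implementation such as the stand-in sketched earlier; every path here is a placeholder:

```python
from BERT import Classifier, Trainer
from Corpus import corpus
from LabeledData import LabeledData
import pandas

# 1. fine-tune on a labeled TSV and save weights + label encoder under model_dir
Trainer('model_dir', LabeledData('dataset.tsv'))()

# 2. reload the saved model and label a GÉODE-style directory,
#    mirroring what predict.py's label() does
classify = Classifier('model_dir')
source = corpus('corpus/')
records = pandas.DataFrame(source.get_all('key'))
records['label'] = classify(source.get_all('content'))
records.to_csv('predictions.tsv', sep='\t', index=False)
```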