diff --git a/scripts/ML/BERT.py b/scripts/ML/BERT.py
index 126d4e7e12f5e878b176abe8a1c6190122057aaf..1dc0ed9d486c171324cc02c67c27bfa37750e54d 100644
--- a/scripts/ML/BERT.py
+++ b/scripts/ML/BERT.py
@@ -1,16 +1,45 @@
 from loaders import get_device
 from transformers import BertForSequenceClassification, BertTokenizer
 
+def loader(f):
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')
+        print(f' - {name}', end='')
+        f(*args, **kwargs)
+        print(f'\r✔️ {name}')
+    return wrapped
+
 class BERT:
     model_name = 'bert-base-multilingual-cased'
 
-    def __init__(self, path):
+    def __init__(self, root_path, training=False):
         self.device = get_device()
         print('Loading BERT tools')
-        print(' - tokenizer', end='')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training)
+
+    @loader
+    def _init_tokenizer(self):
         self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
-        print('\r✔️ tokenizer')
-        print(' - classifier', end='')
-        bert = BertForSequenceClassification.from_pretrained(path)
+
+    @loader
+    def _init_classifier(self, training):
+        if training:
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        else:
+            bert = BertForSequenceClassification.from_pretrained(
+                BERT.model_name, # Use the 12-layer BERT model, with an uncased vocab.
+                num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+                                              # You can increase this
+                                              # for multi-class tasks.
+                output_attentions = False, # Whether the model returns attentions weights.
+                output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
         self.model = bert.to(self.device.type)
-        print('\r✔️ classifier')
+
+    def import_data(self, data):
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        self.model.save_pretrained(self.root_path)
diff --git a/scripts/ML/BERT/Base.py b/scripts/ML/BERT/Base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b8d11eced28882082aea89a3eba1d071ca4ea1
--- /dev/null
+++ b/scripts/ML/BERT/Base.py
@@ -0,0 +1,70 @@
+from transformers import BertForSequenceClassification, BertTokenizer
+import os
+import pickle
+from sklearn import preprocessing
+import torch
+
+def get_device():
+    if torch.cuda.is_available():
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+        return torch.device("cuda")
+    else:
+        print('No GPU available, using the CPU instead.')
+        return torch.device("cpu")
+
+def get_encoder(root_path, create_from=None):
+    path = f"{root_path}/label_encoder.pkl"
+    if os.path.isfile(path):
+        with open(path, 'rb') as pickled:
+            return pickle.load(pickled)
+    elif create_from is not None:
+        encoder = preprocessing.LabelEncoder()
+        encoder.fit(create_from)
+        with open(path, 'wb') as file:
+            pickle.dump(encoder, file)
+        return encoder
+    else:
+        raise FileNotFoundError(path)
+
+def loader(f):
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')
+        print(f' - {name}', end='')
+        f(*args, **kwargs)
+        print(f'\r✔️ {name}')
+    return wrapped
+
+class BERT:
+    model_name = 'bert-base-multilingual-cased'
+
+    def __init__(self, root_path, training=False):
+        self.device = get_device()
+        print('Loading BERT tools')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training)
+
+    @loader
+    def _init_tokenizer(self):
+        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
+
+    @loader
+    def _init_classifier(self, training):
+        if training:
+            bert = BertForSequenceClassification.from_pretrained(
+                BERT.model_name, # Use the 12-layer BERT model, with an uncased vocab.
+                num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+                                              # You can increase this
+                                              # for multi-class tasks.
+                output_attentions = False, # Whether the model returns attentions weights.
+                output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
+        else:
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        self.model = bert.to(self.device.type)
+
+    def import_data(self, data):
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        self.model.save_pretrained(self.root_path)
diff --git a/scripts/ML/BERT/Classifier.py b/scripts/ML/BERT/Classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..2807e36b77124ca8e0f5c9006a70cd89d1612f10
--- /dev/null
+++ b/scripts/ML/BERT/Classifier.py
@@ -0,0 +1,40 @@
+from BERT.Base import BERT, get_encoder
+import numpy
+from tqdm import tqdm
+from transformers import TextClassificationPipeline
+
+class Classifier(BERT):
+    """
+    A class wrapping all the different models and classes used throughout a
+    classification task and based on BERT:
+
+        - tokenizer
+        - classifier
+        - pipeline
+        - label encoder
+
+    Once created, it behaves as a function which you apply to a generator
+    containing the texts to classify
+    """
+    def __init__(self, root_path):
+        BERT.__init__(self, root_path)
+        self._init_pipe()
+        self.encoder = get_encoder(root_path)
+
+    def _init_pipe(self):
+        self.pipe = TextClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            return_all_scores=True,
+            device=self.device)
+
+    def __call__(self, text_generator):
+        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
+        predictions = []
+        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
+            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
+            predictions.append([int(byScoreDesc[0]['label'][6:]),
+                                byScoreDesc[0]['score'],
+                                int(byScoreDesc[1]['label'][6:])])
+        return self.encoder.inverse_transform(
+                numpy.array(predictions)[:,0].astype(int))
diff --git a/scripts/ML/BERT/Trainer.py b/scripts/ML/BERT/Trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/ML/BERT/__init__.py b/scripts/ML/BERT/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..50cbcc17fd40517cb519e00e8e1c03a857e2d5b4
--- /dev/null
+++ b/scripts/ML/BERT/__init__.py
@@ -0,0 +1,3 @@
+from BERT.Base import BERT
+from BERT.Classifier import Classifier
+from BERT.Trainer import Trainer
diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py
index d2ea16d6d55da99723b053afc87cdf7dc12f8d7d..b81a56c79fd41dd5862a48480260e8ffd2f468c7 100644
--- a/scripts/ML/Corpus.py
+++ b/scripts/ML/Corpus.py
@@ -1,6 +1,6 @@
 import pandas
 from os import makedirs
-from os.path import dirname, isdir, isfile
+from os.path import dirname, isdir
 
 def abstract(f):
     def wrapped(*args, **kwargs):
@@ -43,15 +43,25 @@ class TSVIndexed(Corpus):
         self.keys.append('paragraph')
 
     @abstract
-    def get_content(self, key, row):
+    def content(self, key, row):
         pass
 
-    def get_all(self):
+    def keys(self, _, row):
+        return row[self.keys].to_dict()
+
+    def full(self, key, row):
+        d = self.keys(key, row)
+        d[self.column_name] = self.content(key, row).strip() + '\n'
+        return d
+
+    def get_all(self, projector=None):
+        if projector is None:
+            projector = self.full
+        elif type(projector) == str:
+            projector = self.__getattribute__(projector)
         self.load()
-        for key, row in self.data.iterrows():
-            keys = self.keys + [self.column_name]
-            values = key + (self.get_content(key, row).strip() + '\n',)
-            yield dict(zip(keys, values))
+        for row in self.data.iterrows():
+            yield projector(*row)
 
 class SelfContained(TSVIndexed):
     """
@@ -78,7 +88,7 @@ class SelfContained(TSVIndexed):
             primary_key = tuple(primary_key)
         return self.data.xs(primary_key)[self.column_name]
 
-    def get_content(self, _, row):
+    def content(self, _, row):
         return row[self.column_name]
 
     def save(self, iterator):
@@ -124,7 +134,7 @@ class Directory(TSVIndexed):
         with open(self.path_to(primary_key), 'r') as file:
             return file.read()
 
-    def get_content(self, key, _):
+    def content(self, key, _):
         return self.get_text(key)
 
     def write_text(self, primary_key, content):
@@ -140,10 +150,10 @@ class Directory(TSVIndexed):
             self.write_text(row, row[self.column_name])
         self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
 
-def corpus(path):
-    if path[-1:] == '/':
-        return Directory(path)
+def corpus(path, **kwargs):
+    if path[-1:] == '/' or isdir(path):
+        return Directory(path, **kwargs)
     elif path[-4:] == '.tsv':
-        return SelfContained(path)
+        return SelfContained(path, **kwargs)
     else:
         raise FileNotFoundError(path)
diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py
index 859669d42884c62c4b0f62f77e5bf852eb4829e7..5aa9dc7a0ae58cf19612072886eb88a8b8235de3 100644
--- a/scripts/ML/loaders.py
+++ b/scripts/ML/loaders.py
@@ -1,33 +1,10 @@
-import os
-import pickle
-from sklearn import preprocessing
+import numpy
+import random
 import torch
 
-def get_device():
-    if torch.cuda.is_available():
-        print('We will use the GPU:', torch.cuda.get_device_name(0))
-        return torch.device("cuda")
-    else:
-        print('No GPU available, using the CPU instead.')
-        return torch.device("cpu")
-
-def get_encoder(root_path, create_from=None):
-    path = f"{root_path}/label_encoder.pkl"
-    if os.path.isfile(path):
-        with open(path, 'rb') as pickled:
-            return pickle.load(pickled)
-    elif create_from is not None:
-        encoder = preprocessing.LabelEncoder()
-        encoder.fit(create_from)
-        with open(path, 'wb') as file:
-            pickle.dump(encoder, file)
-        return encoder
-    else:
-        raise FileNotFoundError(path)
-
 def set_random():
     seed_value = 42
     random.seed(seed_val)
-    np.random.seed(seed_val)
+    numpy.random.seed(seed_val)
     torch.manual_seed(seed_val)
     torch.cuda.manual_seed_all(seed_val)
diff --git a/scripts/ML/predict.py b/scripts/ML/predict.py
index a64100329cf6680e288092dcb32a25bc346586df..f1768db8f7d7ceb338749c0e58459ace47dbbd34 100644
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predict.py
@@ -1,59 +1,16 @@
 #!/usr/bin/env python3
-from BERT import BERT
-from loaders import get_encoder
-import numpy
+from BERT import Classifier
 import pandas
-import sklearn
-from Source import Source
+from Corpus import corpus
 from sys import argv
-from tqdm import tqdm
-from transformers import TextClassificationPipeline
 
-class Classifier(BERT):
-    """
-    A class wrapping all the different models and classes used throughout a
-    classification task and based on BERT:
-
-        - tokenizer
-        - classifier
-        - pipeline
-        - label encoder
-
-    Once created, it behaves as a function which you apply to a generator
-    containing the texts to classify
-    """
-    def __init__(self, root_path):
-        BERT.__init__(self, root_path)
-        self._init_pipe()
-        self.encoder = get_encoder(root_path)
-
-    def _init_pipe(self):
-        self.pipe = TextClassificationPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            return_all_scores=True,
-            device=self.device)
-
-    def __call__(self, text_generator):
-        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
-        predictions = []
-        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
-            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        return self.encoder.inverse_transform(
-                numpy.array(predictions)[:,0].astype(int))
-
-def label(classify, source, tsv_path, name='label'):
+def label(classify, source, name='label'):
     """
     Make predictions on a set of document
 
     Positional arguments
-    :param classify: an instance of the Classifier class above
-    :param source: an instance of the Source class above
-    :param tsv_path: the path to a TSV file containing (at least) article or
-    paragraph records (additional metadata will be ignored)
+    :param classify: an instance of the Classifier class
+    :param source: an instance of the Corpus class
 
     Keyword arguments
     :param name: defaults to 'label' — the name of the column to be created, that is
@@ -64,11 +21,11 @@ def label(classify, source, tsv_path, name='label'):
     :return: a panda dataframe containing the records from the input TSV file
     plus an additional column
     """
-    records = pandas.read_csv(tsv_path, sep='\t')
-    records[name] = classify(source.iterate(records))
+    records = pandas.DataFrame(source.get_all('keys'))
+    records[name] = classify(source.get_all('content'))
     return records
 
 if __name__ == '__main__':
     classify = Classifier(argv[1])
-    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
+    source = corpus(argv[2])
+    label(classify, source).to_csv(argv[3], sep='\t', index=False)
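
Note on usage: with this change, scripts/ML/predict.py takes three positional arguments (the directory of the fine-tuned model, a corpus path, and the output TSV) instead of four; the input records TSV and the old Source directory are replaced by a single corpus path. The lines below are only a sketch of the equivalent calls from Python, mirroring the new __main__ block; the paths are hypothetical placeholders, and the model directory is assumed to hold both the saved weights and label_encoder.pkl.

    # Sketch only; 'models/domains' and 'corpus/' are hypothetical paths.
    # Equivalent CLI: python scripts/ML/predict.py models/domains corpus/ predictions.tsv
    from BERT import Classifier
    from Corpus import corpus
    from predict import label

    classify = Classifier('models/domains')   # tokenizer + fine-tuned classifier + label encoder
    source = corpus('corpus/')                # a trailing '/' or an existing directory selects the Directory corpus
    label(classify, source).to_csv('predictions.tsv', sep='\t', index=False)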