Commit c37c1787 authored by Alice Brenon

Implement BERT trainer

parent 02253ba3
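
# BERT/Base.py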
from transformers import BertForSequenceClassification, BertTokenizer
import os
import pickle
from sklearn import preprocessing
import torch
def get_device():
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
return torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
return torch.device("cpu")
def loader(f):
def wrapped(*args, **kwargs):
name = f.__name__.replace('_init_', '')
print(f' - {name}', end='')
f(*args, **kwargs)
print(f'\r✔️ {name}')
return wrapped
class BERT:
model_name = 'bert-base-multilingual-cased'
def __init__(self, root_path, train_on=None):
self.device = get_device()
print('Loading BERT tools')
self._init_tokenizer()
self.root_path = root_path
self._init_classifier(train_on)
self._init_encoder(train_on)
@loader
def _init_tokenizer(self):
self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
@loader
def _init_classifier(self, train_on):
if train_on is not None:
bert = BertForSequenceClassification.from_pretrained(
                BERT.model_name, # the 12-layer multilingual BERT model, with a cased vocab
num_labels = len(train_on),
output_attentions = False,
output_hidden_states = False
)
else:
bert = BertForSequenceClassification.from_pretrained(self.root_path)
self.model = bert.to(self.device.type)
@loader
def _init_encoder(self, train_on):
path = f"{self.root_path}/label_encoder.pkl"
if os.path.isfile(path):
with open(path, 'rb') as pickled:
self.encoder = pickle.load(pickled)
elif train_on is not None:
self.encoder = preprocessing.LabelEncoder()
self.encoder.fit(train_on)
with open(path, 'wb') as file:
pickle.dump(self.encoder, file)
else:
raise FileNotFoundError(path)
def import_data(self, data):
return map(lambda d: d.to(self.device), data)
def save(self):
self.model.save_pretrained(self.root_path)
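
# BERT/Classifier.py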
from BERT.Base import BERT
import numpy
from tqdm import tqdm
from transformers import TextClassificationPipeline
class Classifier(BERT):
"""
    A class wrapping all the different models and classes used throughout a
    BERT-based classification task:
- tokenizer
- classifier
- pipeline
- label encoder
Once created, it behaves as a function which you apply to a generator
containing the texts to classify
"""
def __init__(self, root_path):
BERT.__init__(self, root_path)
self._init_pipe()
def _init_pipe(self):
self.pipe = TextClassificationPipeline(
model=self.model,
tokenizer=self.tokenizer,
return_all_scores=True,
device=self.device)
def __call__(self, text_generator):
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
predictions = []
for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
            # pipeline labels look like 'LABEL_<n>': strip the 6-character
            # prefix to recover the encoded class index
            predictions.append([int(byScoreDesc[0]['label'][6:]),
                                byScoreDesc[0]['score'],
                                int(byScoreDesc[1]['label'][6:])])
return self.encoder.inverse_transform(
numpy.array(predictions)[:,0].astype(int))
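
# Illustrative usage sketch (assumptions: 'models/geode' is a hypothetical
# directory produced by a previous training run; any iterable of strings
# works as input). Once built, the Classifier is applied directly to the
# texts and returns the decoded labels:
#
#     from BERT import Classifier
#     classify = Classifier('models/geode')
#     print(classify(iter(["Premier texte à classer", "Second texte"])))

# BERT/Trainer.py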
from BERT.Base import BERT
import datetime
from loaders import set_random
import time
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
def chrono(f):
    def wrapped(*args, **kwargs):
        t0 = time.time()
        # keep the wrapped function's result so that @chrono-decorated
        # methods such as epoch() still return their value
        result = f(*args, **kwargs)
        duration = datetime.timedelta(seconds=round(time.time() - t0))
        print(f"\n {f.__name__} took: {duration}")
        return result
    return wrapped
class Trainer(BERT):
def __init__(self, root_path, labeled_data, epochs=4):
self.epochs = epochs
BERT.__init__(self, root_path, train_on=labeled_data.unique)
self._init_utils(labeled_data.load(self))
def _init_utils(self, data_loader):
self.optimizer = AdamW(
self.model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5
)
self.data_loader = data_loader
self.scheduler = get_linear_schedule_with_warmup(
self.optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = self.epochs * len(data_loader))
def __call__(self):
set_random()
losses = [self.epoch(e) for e in range(self.epochs)]
self.save()
print("\nTraining complete!")
@chrono
def epoch(self, epoch):
self._start_epoch(epoch)
self.model.train()
total_loss = sum([self.learn_on(*self.import_data(batch))
for batch in self.data_loader])
avg_train_loss = total_loss / len(self.data_loader)
print("\n Average training loss: {0:.2f}".format(avg_train_loss))
return avg_train_loss
def learn_on(self, input_ids, input_mask, labels):
self.model.zero_grad()
outputs = self.model(input_ids,
token_type_ids=None,
attention_mask=input_mask,
labels=labels)
loss = outputs[0]
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optimizer.step()
self.scheduler.step()
return loss.item()
def _start_epoch(self, epoch):
print(f'\n======== Epoch {epoch+1} / {self.epochs} ========')
print('Training...')
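
# BERT/__init__.py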
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.Trainer import Trainer
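
# Corpus.py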
import pandas
from os import makedirs
from os.path import dirname, isdir
def abstract(f):
def wrapped(*args, **kwargs):
raise NotImplementedError(f.__name__)
return wrapped
class Corpus:
@abstract
    def __init__(self):
pass
@abstract
def get_text(self, primary_key):
pass
@abstract
def get_all(self):
pass
@abstract
def save(self, iterator):
pass
class TSVIndexed(Corpus):
default_keys = ['work', 'volume', 'article']
projectors = ['key', 'content', 'full']
def __init__(self, tsv_path, column_name):
self.tsv_path = tsv_path
self.column_name = column_name
self.data = None
def load(self):
if self.data is None:
self.data = pandas.read_csv(self.tsv_path, sep='\t')
self.detect_keys()
self.data = self.data.set_index(self.keys, drop=False)
def detect_keys(self):
self.keys = self.default_keys.copy()
if 'paragraph' in self.data:
self.keys.append('paragraph')
@abstract
def content(self, key, row):
pass
def key(self, _, row):
return row[self.keys].to_dict()
def full(self, key, row):
d = self.key(key, row)
d[self.column_name] = self.content(key, row).strip() + '\n'
return d
def get_all(self, projector=None):
if projector is None:
projector = self.full
        elif isinstance(projector, str) and projector in self.projectors:
            projector = getattr(self, projector)
self.load()
for row in self.data.iterrows():
yield projector(*row)
class SelfContained(TSVIndexed):
"""
A class to handle the dataset TSV normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, tsv_path, column_name='content'):
"""
Positional arguments
:param tsv_path: the path to a TSV dataset containing a primary key and
a text content on every line
Keyword arguments
:param column_name: the name of the column where the text content is
stored
"""
TSVIndexed.__init__(self, tsv_path, column_name)
def get_text(self, primary_key):
self.load()
if type(primary_key) == dict:
primary_key = [primary_key[k] for k in self.keys if k in primary_key]
if type(primary_key) != tuple:
primary_key = tuple(primary_key)
return self.data.xs(primary_key)[self.column_name]
def content(self, _, row):
return row[self.column_name]
def save(self, iterator):
self.data = pandas.DataFrame(iterator)
self.detect_keys()
self.data.to_csv(self.tsv_path, sep='\t', index=False)
class Directory(TSVIndexed):
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path, column_name='content'):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.text_path = f"{root_path}/Text"
TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)
def path_to(self, primary_key):
record = self.dict_primary_key(primary_key)
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.text_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record['paragraph']}.txt"
else:
return f"{prefix}.txt"
def dict_primary_key(self, primary_key):
if type(primary_key) == pandas.core.series.Series:
return dict(primary_key)
elif type(primary_key) != dict:
keys = self.default_keys.copy()
if len(primary_key) == 4:
keys.append('paragraph')
return dict(zip(keys, primary_key))
else:
return primary_key
def get_text(self, primary_key):
with open(self.path_to(primary_key), 'r') as file:
return file.read()
def content(self, key, _):
return self.get_text(key)
def write_text(self, primary_key, content):
path = self.path_to(primary_key)
makedirs(dirname(path), exist_ok=True)
with open(path, 'w') as file:
file.write(content)
def save(self, iterator):
self.data = pandas.DataFrame(iterator)
self.detect_keys()
for _, row in self.data.iterrows():
self.write_text(row, row[self.column_name])
self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
def corpus(path, **kwargs):
if path[-1:] == '/' or isdir(path):
return Directory(path, **kwargs)
elif path[-4:] == '.tsv':
return SelfContained(path, **kwargs)
else:
raise FileNotFoundError(path)
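
# Illustrative usage sketch (assumption: 'articles.tsv' is a hypothetical TSV
# dataset with the primary-key columns and a 'content' column). corpus()
# picks the implementation from the path, and get_all() accepts a projector
# name controlling what each record yields:
#
#     source = corpus('articles.tsv')        # a SelfContained corpus
#     keys = list(source.get_all('key'))     # dicts of primary keys only
#     texts = source.get_all('content')      # generator of raw text contents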
#!/usr/bin/env python3
from Corpus import corpus
import sys
if __name__ == '__main__':
source = corpus(sys.argv[1])
destination = corpus(sys.argv[2])
destination.save(source.get_all())
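
# loaders.py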
import numpy
import random
import torch
def set_random():
seed_value = 42
random.seed(seed_value)
numpy.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
#!/usr/bin/env python3
from BERT import Classifier
from Corpus import corpus
import pandas
from sys import argv
def label(classify, source, name='label'):
    """
    Make predictions on a set of documents

    Positional arguments
    :param classify: an instance of the Classifier class
    :param source: an instance of the Corpus class

    Keyword arguments
    :param name: defaults to 'label' — the name of the column to be created,
    that is, the column receiving the predicted class of each document

    :return: a pandas dataframe containing the records from the input corpus
    plus an additional column
    """
    records = pandas.DataFrame(source.get_all('key'))
    records[name] = classify(source.get_all('content'))
    return records
if __name__ == '__main__':
classify = Classifier(argv[1])
    source = corpus(argv[2])
    label(classify, source).to_csv(argv[3], sep='\t', index=False)
#!/usr/bin/env python3
from BERT import Trainer
from LabeledData import LabeledData
import sys
if __name__ == '__main__':
labeled_data = LabeledData(sys.argv[1])
trainer = Trainer(sys.argv[2], labeled_data)
trainer()