Commit 75f61841 authored by Alice Brenon

[WIP] Put classifier and trainer into a subdirectory to emphasise their being based on BERT

parent 4fce5770
 from loaders import get_device
 from transformers import BertForSequenceClassification, BertTokenizer
+
+def loader(f):
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')
+        print(f' - {name}', end='')
+        f(*args, **kwargs)
+        print(f'\r✔️ {name}')
+    return wrapped
+
 class BERT:
     model_name = 'bert-base-multilingual-cased'
-    def __init__(self, path):
+    def __init__(self, root_path, training=False):
         self.device = get_device()
         print('Loading BERT tools')
-        print(' - tokenizer', end='')
-        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
-        print('\r✔️ tokenizer')
-        print(' - classifier', end='')
-        bert = BertForSequenceClassification.from_pretrained(path)
-        self.model = bert.to(self.device.type)
-        print('\r✔️ classifier')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training)
+
+    @loader
+    def _init_tokenizer(self):
+        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
+
+    @loader
+    def _init_classifier(self, training, num_labels=2):
+        if training:
+            # training: start from the pretrained multilingual checkpoint
+            # with a fresh classification head
+            bert = BertForSequenceClassification.from_pretrained(
+                BERT.model_name,
+                num_labels=num_labels,      # 2 for binary classification,
+                                            # more for multi-class tasks
+                output_attentions=False,    # do not return attention weights
+                output_hidden_states=False) # do not return all hidden states
+        else:
+            # inference: reload the fine-tuned weights saved under root_path
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        self.model = bert.to(self.device.type)
+
+    def import_data(self, data):
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        self.model.save_pretrained(self.root_path)
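For orientation, a minimal sketch of how the reworked class is meant to be driven; the model directory below is hypothetical, and with training=False it must already contain weights written by a previous save():

    from BERT.Base import BERT

    bert = BERT('models/demo', training=False)             # hypothetical fine-tuned model directory
    inputs = bert.tokenizer('Some text', return_tensors='pt')
    batch = list(bert.import_data(inputs.values()))        # tensors moved to the selected device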
BERT/Base.py:

from transformers import BertForSequenceClassification, BertTokenizer
import os
import pickle
from sklearn import preprocessing
import torch

def get_device():
    if torch.cuda.is_available():
        print('We will use the GPU:', torch.cuda.get_device_name(0))
        return torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

def get_encoder(root_path, create_from=None):
    path = f"{root_path}/label_encoder.pkl"
    if os.path.isfile(path):
        with open(path, 'rb') as pickled:
            return pickle.load(pickled)
    elif create_from is not None:
        encoder = preprocessing.LabelEncoder()
        encoder.fit(create_from)
        with open(path, 'wb') as file:
            pickle.dump(encoder, file)
        return encoder
    else:
        raise FileNotFoundError(path)

def loader(f):
    def wrapped(*args, **kwargs):
        name = f.__name__.replace('_init_', '')
        print(f' - {name}', end='')
        f(*args, **kwargs)
        print(f'\r✔️ {name}')
    return wrapped

class BERT:
    model_name = 'bert-base-multilingual-cased'

    def __init__(self, root_path, training=False):
        self.device = get_device()
        print('Loading BERT tools')
        self._init_tokenizer()
        self.root_path = root_path
        self._init_classifier(training)

    @loader
    def _init_tokenizer(self):
        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)

    @loader
    def _init_classifier(self, training, num_labels=2):
        if training:
            # training: start from the pretrained multilingual checkpoint
            # with a fresh classification head
            bert = BertForSequenceClassification.from_pretrained(
                BERT.model_name,
                num_labels=num_labels,      # 2 for binary classification,
                                            # more for multi-class tasks
                output_attentions=False,    # do not return attention weights
                output_hidden_states=False) # do not return all hidden states
        else:
            # inference: reload the fine-tuned weights saved under root_path
            bert = BertForSequenceClassification.from_pretrained(self.root_path)
        self.model = bert.to(self.device.type)

    def import_data(self, data):
        return map(lambda d: d.to(self.device), data)

    def save(self):
        self.model.save_pretrained(self.root_path)
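A quick sketch of the get_encoder round-trip; the directory and label names here are made up:

    from BERT.Base import get_encoder

    # first call: no pickle on disk yet, so the encoder is fitted and cached
    encoder = get_encoder('models/demo', create_from=['geography', 'history', 'science'])
    print(encoder.transform(['history']))   # [1]: LabelEncoder sorts classes alphabetically
    # later calls reload the same mapping from models/demo/label_encoder.pkl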
BERT/Classifier.py:

from BERT.Base import BERT, get_encoder
import numpy
from tqdm import tqdm
from transformers import TextClassificationPipeline

class Classifier(BERT):
    """
    A class wrapping all the different models and classes used throughout a
    classification task based on BERT:

        - tokenizer
        - classifier
        - pipeline
        - label encoder

    Once created, it behaves as a function which you apply to a generator
    containing the texts to classify.
    """
    def __init__(self, root_path):
        BERT.__init__(self, root_path)
        self._init_pipe()
        self.encoder = get_encoder(root_path)

    def _init_pipe(self):
        self.pipe = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            return_all_scores=True,
            device=self.device)

    def __call__(self, text_generator):
        tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
        predictions = []
        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
            # the pipeline names classes 'LABEL_<n>'; keep the two best
            # candidates and recover their integer indices
            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
            predictions.append([int(byScoreDesc[0]['label'][6:]),
                                byScoreDesc[0]['score'],
                                int(byScoreDesc[1]['label'][6:])])
        # map the winning indices back to the original string labels
        return self.encoder.inverse_transform(
            numpy.array(predictions)[:, 0].astype(int))
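As the docstring says, an instance is applied directly to the texts to classify; a hedged usage sketch, assuming a model directory that contains both the fine-tuned weights and label_encoder.pkl:

    from BERT.Classifier import Classifier

    classify = Classifier('models/demo')                             # hypothetical directory
    print(classify(['first paragraph', 'second paragraph']))         # original string labels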
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.Trainer import Trainer
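These three re-exports (presumably the new package's __init__.py) turn the BERT/ subdirectory into a package facade, so call sites do not need to know the internal module layout:

    from BERT import Classifier, Trainer   # instead of BERT.Classifier, BERT.Trainer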
 import pandas
 from os import makedirs
-from os.path import dirname, isdir, isfile
+from os.path import dirname, isdir
 def abstract(f):
     def wrapped(*args, **kwargs):
@@ -43,15 +43,25 @@ class TSVIndexed(Corpus):
             self.keys.append('paragraph')

     @abstract
-    def get_content(self, key, row):
+    def content(self, key, row):
         pass

-    def get_all(self):
+    def keys(self, _, row):
+        return row[self.keys].to_dict()
+
+    def full(self, key, row):
+        d = self.keys(key, row)
+        d[self.column_name] = self.content(key, row).strip() + '\n'
+        return d
+
+    def get_all(self, projector=None):
+        if projector is None:
+            projector = self.full
+        elif type(projector) == str:
+            projector = self.__getattribute__(projector)
         self.load()
-        for key, row in self.data.iterrows():
-            keys = self.keys + [self.column_name]
-            values = key + (self.get_content(key, row).strip() + '\n',)
-            yield dict(zip(keys, values))
+        for row in self.data.iterrows():
+            yield projector(*row)

 class SelfContained(TSVIndexed):
     """
@@ -78,7 +88,7 @@ class SelfContained(TSVIndexed):
         primary_key = tuple(primary_key)
         return self.data.xs(primary_key)[self.column_name]

-    def get_content(self, _, row):
+    def content(self, _, row):
         return row[self.column_name]

     def save(self, iterator):
@@ -124,7 +134,7 @@ class Directory(TSVIndexed):
         with open(self.path_to(primary_key), 'r') as file:
             return file.read()

-    def get_content(self, key, _):
+    def content(self, key, _):
         return self.get_text(key)

     def write_text(self, primary_key, content):
@@ -140,10 +150,10 @@ class Directory(TSVIndexed):
             self.write_text(row, row[self.column_name])
         self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)

-def corpus(path):
-    if path[-1:] == '/':
-        return Directory(path)
+def corpus(path, **kwargs):
+    if path[-1:] == '/' or isdir(path):
+        return Directory(path, **kwargs)
     elif path[-4:] == '.tsv':
-        return SelfContained(path)
+        return SelfContained(path, **kwargs)
     else:
         raise FileNotFoundError(path)
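A sketch of the new projector mechanism; the corpus file name is hypothetical. The projector passed to get_all decides what each row is projected to: 'content' yields only the text, 'keys' only the metadata columns, and the default full projector a dict of both:

    from Corpus import corpus

    source = corpus('articles.tsv')           # hypothetical TSV corpus
    for text in source.get_all('content'):    # text-only generator, e.g. to feed a classifier
        print(text)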
-import os
-import pickle
-from sklearn import preprocessing
+import numpy
+import random
 import torch

-def get_device():
-    if torch.cuda.is_available():
-        print('We will use the GPU:', torch.cuda.get_device_name(0))
-        return torch.device("cuda")
-    else:
-        print('No GPU available, using the CPU instead.')
-        return torch.device("cpu")
-
-def get_encoder(root_path, create_from=None):
-    path = f"{root_path}/label_encoder.pkl"
-    if os.path.isfile(path):
-        with open(path, 'rb') as pickled:
-            return pickle.load(pickled)
-    elif create_from is not None:
-        encoder = preprocessing.LabelEncoder()
-        encoder.fit(create_from)
-        with open(path, 'wb') as file:
-            pickle.dump(encoder, file)
-        return encoder
-    else:
-        raise FileNotFoundError(path)
-
 def set_random():
     seed_value = 42
-    random.seed(seed_val)
-    np.random.seed(seed_val)
-    torch.manual_seed(seed_val)
-    torch.cuda.manual_seed_all(seed_val)
+    random.seed(seed_value)
+    numpy.random.seed(seed_value)
+    torch.manual_seed(seed_value)
+    torch.cuda.manual_seed_all(seed_value)
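set_random is presumably meant to be called once, before any data shuffling or weight initialisation, so that Python, NumPy and PyTorch all start from the same fixed seed:

    from loaders import set_random

    set_random()   # pin all three RNGs to seed 42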
 #!/usr/bin/env python3
-from BERT import BERT
-from loaders import get_encoder
-import numpy
+from BERT import Classifier
 import pandas
-import sklearn
-from Source import Source
+from Corpus import corpus
 from sys import argv
-from tqdm import tqdm
-from transformers import TextClassificationPipeline
-
-class Classifier(BERT):
-    """
-    A class wrapping all the different models and classes used throughout a
-    classification task based on BERT:
-        - tokenizer
-        - classifier
-        - pipeline
-        - label encoder
-    Once created, it behaves as a function which you apply to a generator
-    containing the texts to classify
-    """
-    def __init__(self, root_path):
-        BERT.__init__(self, root_path)
-        self._init_pipe()
-        self.encoder = get_encoder(root_path)
-
-    def _init_pipe(self):
-        self.pipe = TextClassificationPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            return_all_scores=True,
-            device=self.device)
-
-    def __call__(self, text_generator):
-        tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
-        predictions = []
-        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
-            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        return self.encoder.inverse_transform(
-            numpy.array(predictions)[:, 0].astype(int))
-
-def label(classify, source, tsv_path, name='label'):
+def label(classify, source, name='label'):
     """
     Make predictions on a set of documents

     Positional arguments
-    :param classify: an instance of the Classifier class above
-    :param source: an instance of the Source class above
-    :param tsv_path: the path to a TSV file containing (at least) article or
-        paragraph records (additional metadata will be ignored)
+    :param classify: an instance of the Classifier class
+    :param source: an instance of the Corpus class

     Keyword arguments
     :param name: defaults to 'label' — the name of the column to be created, that is
@@ -64,11 +21,11 @@ def label(classify, source, name='label'):
     :return: a pandas dataframe containing the records from the input TSV file plus
         an additional column
     """
-    records = pandas.read_csv(tsv_path, sep='\t')
-    records[name] = classify(source.iterate(records))
+    records = pandas.DataFrame(source.get_all('keys'))
+    records[name] = classify(source.get_all('content'))
     return records

 if __name__ == '__main__':
     classify = Classifier(argv[1])
-    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
+    source = corpus(argv[2])
+    label(classify, source).to_csv(argv[3], sep='\t', index=False)
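With the inlined Classifier gone and the Corpus abstraction in place, the entry point now takes three arguments instead of four; a hedged invocation, where the script name and all paths are made up:

    ./classify.py models/demo articles.tsv predictions.tsv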