diff --git a/scripts/ML/BERT.py b/scripts/ML/BERT.py
index 126d4e7e12f5e878b176abe8a1c6190122057aaf..1dc0ed9d486c171324cc02c67c27bfa37750e54d 100644
--- a/scripts/ML/BERT.py
+++ b/scripts/ML/BERT.py
@@ -1,16 +1,45 @@
 from loaders import get_device
 from transformers import BertForSequenceClassification, BertTokenizer
 
+def loader(f):  # Decorator: print ' - <name>' while f runs, then overwrite it with '✔️  <name>'.
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')  # e.g. '_init_tokenizer' -> 'tokenizer'
+        print(f' - {name}', end='', flush=True)  # flush: no newline yet, buffering would hide it while f runs
+        f(*args, **kwargs)  # the return value of f is discarded
+        print(f'\r✔️  {name}')
+    return wrapped
+
 class BERT:
+    """Device, tokenizer and sequence-classification model built on model_name."""
     model_name = 'bert-base-multilingual-cased'
 
-    def __init__(self, path):
+    def __init__(self, root_path, training=False, num_labels=2):
+        """Load the tokenizer, then the classifier (num_labels is only used when training)."""
         self.device = get_device()
         print('Loading BERT tools')
-        print(' - tokenizer', end='')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training, num_labels)
+
+    @loader
+    def _init_tokenizer(self):
         self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
-        print('\r✔️  tokenizer')
-        print(' - classifier', end='')
-        bert = BertForSequenceClassification.from_pretrained(path)
+
+    @loader
+    def _init_classifier(self, training, num_labels):
+        """Start from the base model_name when training, else reload the model stored in root_path."""
+        if training:
+            bert = BertForSequenceClassification.from_pretrained(
+                    BERT.model_name, num_labels=num_labels,  # cased multilingual base, fresh head
+                    output_attentions=False, output_hidden_states=False)
+        else:
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
         self.model = bert.to(self.device.type)
-        print('\r✔️  classifier')
+
+    def import_data(self, data):
+        """Return a lazy map moving each item of data to self.device."""
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        """Write the model to root_path with save_pretrained."""
+        self.model.save_pretrained(self.root_path)
diff --git a/scripts/ML/BERT/Base.py b/scripts/ML/BERT/Base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b8d11eced28882082aea89a3eba1d071ca4ea1
--- /dev/null
+++ b/scripts/ML/BERT/Base.py
@@ -0,0 +1,70 @@
+from transformers import BertForSequenceClassification, BertTokenizer
+import os
+import pickle
+from sklearn import preprocessing
+import torch
+
+def get_device():
+    """Return the torch device to compute on: CUDA when available, otherwise the CPU."""
+    if not torch.cuda.is_available():
+        print('No GPU available, using the CPU instead.')
+        return torch.device("cpu")
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+    return torch.device("cuda")
+
+def get_encoder(root_path, create_from=None):
+    """Load root_path/label_encoder.pkl, or fit an encoder on create_from and pickle it there."""
+    path = f"{root_path}/label_encoder.pkl"
+    if os.path.isfile(path):
+        with open(path, 'rb') as source:
+            return pickle.load(source)
+    if create_from is None:
+        # Nothing to load and no labels to fit a new encoder on
+        raise FileNotFoundError(path)
+    encoder = preprocessing.LabelEncoder().fit(create_from)
+    with open(path, 'wb') as target:
+        pickle.dump(encoder, target)
+    return encoder
+
+def loader(f):  # Decorator: print ' - <name>' while f runs, then overwrite it with '✔️  <name>'.
+    def wrapped(*args, **kwargs):
+        name = f.__name__.replace('_init_', '')  # e.g. '_init_tokenizer' -> 'tokenizer'
+        print(f' - {name}', end='', flush=True)  # flush: no newline yet, buffering would hide it while f runs
+        f(*args, **kwargs)  # the return value of f is discarded
+        print(f'\r✔️  {name}')
+    return wrapped
+
+class BERT:
+    """Device, tokenizer and sequence-classification model built on model_name."""
+    model_name = 'bert-base-multilingual-cased'
+
+    def __init__(self, root_path, training=False, num_labels=2):
+        """Load the tokenizer, then the classifier (num_labels is only used when training)."""
+        self.device = get_device()
+        print('Loading BERT tools')
+        self._init_tokenizer()
+        self.root_path = root_path
+        self._init_classifier(training, num_labels)
+
+    @loader
+    def _init_tokenizer(self):
+        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)
+
+    @loader
+    def _init_classifier(self, training, num_labels):
+        """Start from the base model_name when training, else reload the model stored in root_path."""
+        if training:
+            bert = BertForSequenceClassification.from_pretrained(
+                    BERT.model_name, num_labels=num_labels,  # cased multilingual base, fresh head
+                    output_attentions=False, output_hidden_states=False)
+        else:
+            bert = BertForSequenceClassification.from_pretrained(self.root_path)
+        self.model = bert.to(self.device.type)
+
+    def import_data(self, data):
+        """Return a lazy map moving each item of data to self.device."""
+        return map(lambda d: d.to(self.device), data)
+
+    def save(self):
+        """Write the model to root_path with save_pretrained."""
+        self.model.save_pretrained(self.root_path)
diff --git a/scripts/ML/BERT/Classifier.py b/scripts/ML/BERT/Classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..2807e36b77124ca8e0f5c9006a70cd89d1612f10
--- /dev/null
+++ b/scripts/ML/BERT/Classifier.py
@@ -0,0 +1,40 @@
+from BERT.Base import BERT, get_encoder
+import numpy
+from tqdm import tqdm
+from transformers import TextClassificationPipeline
+
+class Classifier(BERT):
+    """
+    A class wrapping all the different models and classes used throughout a
+    classification task and based on BERT:
+
+        - tokenizer
+        - classifier
+        - pipeline
+        - label encoder
+
+    Once created, it behaves as a function which you apply to a generator
+    containing the texts to classify
+    """
+    def __init__(self, root_path):
+        BERT.__init__(self, root_path)
+        self._init_pipe()
+        self.encoder = get_encoder(root_path)
+
+    def _init_pipe(self):
+        """Build the text-classification pipeline around the loaded model and tokenizer."""
+        self.pipe = TextClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            return_all_scores=True,
+            device=self.device)
+
+    def __call__(self, text_generator):
+        """Return the decoded top-scoring label for each text of text_generator."""
+        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
+        predictions = []
+        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
+            best = max(output, key=lambda d: d['score'])
+            # Labels are expected to look like 'LABEL_<n>': skip the 6-character prefix
+            predictions.append(int(best['label'][6:]))
+        return self.encoder.inverse_transform(numpy.array(predictions, dtype=int))
diff --git a/scripts/ML/BERT/Trainer.py b/scripts/ML/BERT/Trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/ML/BERT/__init__.py b/scripts/ML/BERT/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..50cbcc17fd40517cb519e00e8e1c03a857e2d5b4
--- /dev/null
+++ b/scripts/ML/BERT/__init__.py
@@ -0,0 +1,3 @@
+from BERT.Base import BERT
+from BERT.Classifier import Classifier
+# TODO: re-export Trainer once BERT/Trainer.py defines it; the file is still empty, so importing it here fails
diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py
index d2ea16d6d55da99723b053afc87cdf7dc12f8d7d..b81a56c79fd41dd5862a48480260e8ffd2f468c7 100644
--- a/scripts/ML/Corpus.py
+++ b/scripts/ML/Corpus.py
@@ -1,6 +1,6 @@
 import pandas
 from os import makedirs
-from os.path import dirname, isdir, isfile
+from os.path import dirname, isdir
 
 def abstract(f):
     def wrapped(*args, **kwargs):
@@ -43,15 +43,25 @@ class TSVIndexed(Corpus):
             self.keys.append('paragraph')
 
     @abstract
-    def get_content(self, key, row):
+    def content(self, key, row):  # projector to override: the content of the record (key, row)
         pass
 
-    def get_all(self):
+    def keys(self, _, row):  # projector: the key fields of a record, as a dict
+        return row[self.keys].to_dict()  # self.keys is the key-name list here; assumes row has those labels — TODO confirm
+
+    def full(self, key, row):  # projector: the key fields plus the stripped, newline-terminated content
+        d = type(self).keys(self, key, row)  # via the class: on instances self.keys is the list and hides keys()
+        d[self.column_name] = self.content(key, row).strip() + '\n'
+        return d
+
+    def get_all(self, projector=None):
+        """Yield projector(key, row) for every record; a str (default 'full') names a projector method."""
+        name = 'full' if projector is None else projector
+        if isinstance(name, str):
+            projector = getattr(type(self), name).__get__(self)  # on the class, so 'keys' finds the method
         self.load()
-        for key, row in self.data.iterrows():
-            keys = self.keys + [self.column_name]
-            values = key + (self.get_content(key, row).strip() + '\n',)
-            yield dict(zip(keys, values))
+        for row in self.data.iterrows():
+            yield projector(*row)
 
 class SelfContained(TSVIndexed):
     """
@@ -78,7 +88,7 @@ class SelfContained(TSVIndexed):
             primary_key = tuple(primary_key)
         return self.data.xs(primary_key)[self.column_name]
 
-    def get_content(self, _, row):
+    def content(self, _, row):  # the content is held by the row itself, under column_name
         return row[self.column_name]
 
     def save(self, iterator):
@@ -124,7 +134,7 @@ class Directory(TSVIndexed):
         with open(self.path_to(primary_key), 'r') as file:
             return file.read()
 
-    def get_content(self, key, _):
+    def content(self, key, _):  # the content is fetched through get_text(key)
         return self.get_text(key)
 
     def write_text(self, primary_key, content):
@@ -140,10 +150,10 @@ class Directory(TSVIndexed):
             self.write_text(row, row[self.column_name])
         self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
 
-def corpus(path):
-    if path[-1:] == '/':
-        return Directory(path)
+def corpus(path, **kwargs):  # factory: pick the corpus class from path, forwarding kwargs to its constructor
+    if path[-1:] == '/' or isdir(path):  # a trailing slash or an existing directory
+        return Directory(path, **kwargs)
     elif path[-4:] == '.tsv':
-        return SelfContained(path)
+        return SelfContained(path, **kwargs)
     else:
         raise FileNotFoundError(path)
diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py
index 859669d42884c62c4b0f62f77e5bf852eb4829e7..5aa9dc7a0ae58cf19612072886eb88a8b8235de3 100644
--- a/scripts/ML/loaders.py
+++ b/scripts/ML/loaders.py
@@ -1,33 +1,10 @@
-import os
-import pickle
-from sklearn import preprocessing
+import numpy
+import random
 import torch
 
-def get_device():
-    if torch.cuda.is_available():
-        print('We will use the GPU:', torch.cuda.get_device_name(0))
-        return torch.device("cuda")
-    else:
-        print('No GPU available, using the CPU instead.')
-        return torch.device("cpu")
-
-def get_encoder(root_path, create_from=None):
-    path = f"{root_path}/label_encoder.pkl"
-    if os.path.isfile(path):
-        with open(path, 'rb') as pickled:
-            return pickle.load(pickled)
-    elif create_from is not None:
-        encoder = preprocessing.LabelEncoder()
-        encoder.fit(create_from)
-        with open(path, 'wb') as file:
-            pickle.dump(encoder, file)
-        return encoder
-    else:
-        raise FileNotFoundError(path)
-
 def set_random():
-    seed_value = 42
+    seed_val = 42  # one fixed seed for the random, numpy and torch (CPU and CUDA) generators below
     random.seed(seed_val)
-    np.random.seed(seed_val)
+    numpy.random.seed(seed_val)
     torch.manual_seed(seed_val)
     torch.cuda.manual_seed_all(seed_val)
diff --git a/scripts/ML/predict.py b/scripts/ML/predict.py
index a64100329cf6680e288092dcb32a25bc346586df..f1768db8f7d7ceb338749c0e58459ace47dbbd34 100644
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predict.py
@@ -1,59 +1,16 @@
 #!/usr/bin/env python3
-from BERT import BERT
-from loaders import get_encoder
-import numpy
+from BERT import Classifier
 import pandas
-import sklearn
-from Source import Source
+from Corpus import corpus
 from sys import argv
-from tqdm import tqdm
-from transformers import TextClassificationPipeline
 
-class Classifier(BERT):
-    """
-    A class wrapping all the different models and classes used throughout a
-    classification task and based on BERT:
-
-        - tokenizer
-        - classifier
-        - pipeline
-        - label encoder
-
-    Once created, it behaves as a function which you apply to a generator
-    containing the texts to classify
-    """
-    def __init__(self, root_path):
-        BERT.__init__(self, root_path)
-        self._init_pipe()
-        self.encoder = get_encoder(root_path)
-
-    def _init_pipe(self):
-        self.pipe = TextClassificationPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            return_all_scores=True,
-            device=self.device)
-
-    def __call__(self, text_generator):
-        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
-        predictions = []
-        for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
-            byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        return self.encoder.inverse_transform(
-                numpy.array(predictions)[:,0].astype(int))
-
-def label(classify, source, tsv_path, name='label'):
+def label(classify, source, name='label'):
     """
     Make predictions on a set of document
 
     Positional arguments
-    :param classify: an instance of the Classifier class above
-    :param source: an instance of the Source class above
-    :param tsv_path: the path to a TSV file containing (at least) article or
-    paragraph records (additional metadata will be ignored)
+    :param classify: an instance of the Classifier class
+    :param source: an instance of the Corpus class
 
     Keyword arguments
     :param name: defaults to 'label' — the name of the column to be created, that is
@@ -64,11 +21,11 @@ def label(classify, source, tsv_path, name='label'):
     :return: a panda dataframe containing the records from the input TSV file plus
     an additional column
     """
-    records = pandas.read_csv(tsv_path, sep='\t')
-    records[name] = classify(source.iterate(records))
+    records = pandas.DataFrame(source.get_all('keys'))  # one record per text, from the 'keys' projector
+    records[name] = classify(source.get_all('content'))  # second pass over the corpus, yielding the texts
     return records
 
 if __name__ == '__main__':
     classify = Classifier(argv[1])
-    source = Source(argv[2])
-    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)
+    source = corpus(argv[2])  # argv[2]: path of the corpus (a directory or a .tsv file)
+    label(classify, source).to_csv(argv[3], sep='\t', index=False)  # argv[3]: output TSV path