Commit ef245f29 authored by Alice Brenon

Keep reworking things, factorize source directory handling

parent 38de3c27
class Source:
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.root_path = root_path
def path_to(self, record):
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.root_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record.paragraph}.txt"
else:
return f"{prefix}.txt"
def load_text(self, record):
with open(self.path_to(record), 'r') as file:
return file.read()
def iterate(self, records):
for _, record in records.iterrows():
yield self.load_text(record)
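A minimal usage sketch for Source (hypothetical: the corpus path, work name and record values below are invented for illustration; only the work/volume/article/paragraph columns come from the code above):

import pandas

from Source import Source

# Invented records; each row exposes the columns used by path_to()
records = pandas.DataFrame([
    {'work': 'EDdA', 'volume': 1, 'article': 124, 'paragraph': 2},
    {'work': 'EDdA', 'volume': 1, 'article': 124, 'paragraph': 3},
])

source = Source('/path/to/corpus')
print(source.path_to(records.iloc[0]))  # -> /path/to/corpus/EDdA/T1/124/2.txt

# iterate() lazily opens one file per record and yields its contents
for text in source.iterate(records):
    print(text[:80])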
import torch
class WithGPU:
def __init__(self):
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
self.device = torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
self.device = torch.device("cpu")
import os
import pickle
from sklearn import preprocessing
import torch
from transformers import BertTokenizer
def get_device():
if torch.cuda.is_available():
print('We will use the GPU:', torch.cuda.get_device_name(0))
return torch.device("cuda")
else:
print('No GPU available, using the CPU instead.')
return torch.device("cpu")
def get_encoder(root_path, create_from=None):
path = f"{root_path}/label_encoder.pkl"
if os.path.isfile(path):
with open(path, 'rb') as pickled:
return pickle.load(pickled)
elif create_from is not None:
encoder = preprocessing.LabelEncoder()
encoder.fit(create_from)
with open(path, 'wb') as file:
pickle.dump(encoder, file)
return encoder
else:
raise FileNotFoundError(path)
def get_tokenizer():
model_name = 'bert-base-multilingual-cased'
print('Loading BERT tokenizer...')
return BertTokenizer.from_pretrained(model_name)
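A small usage sketch for these helpers (the model path and label values are hypothetical; get_encoder only fits and pickles a new LabelEncoder when create_from is given and no label_encoder.pkl exists yet):

device = get_device()        # cuda if available, else cpu
tokenizer = get_tokenizer()  # bert-base-multilingual-cased

# First run: fit and pickle the encoder from (invented) training labels
encoder = get_encoder('/path/to/model', create_from=['Géographie', 'Histoire'])
# Later runs: simply reload /path/to/model/label_encoder.pkl
encoder = get_encoder('/path/to/model')
print(list(encoder.classes_))  # ['Géographie', 'Histoire']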
#!/usr/bin/env python3
from gpu import WithGPU
from loaders import get_device, get_encoder, get_tokenizer
import numpy
import pandas
import pickle
import sklearn
from Source import Source
from sys import argv
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, TextClassificationPipeline
class Classifier(WithGPU):
class Classifier:
"""
A class wrapping all the different models and classes used throughout a
classification task:
@@ -22,20 +22,16 @@ class Classifier(WithGPU):
containing the texts to classify
"""
def __init__(self, root_path):
WithGPU.__init__(self)
self._init_tokenizer()
self.device = get_device()
self.tokenizer = get_tokenizer()
self._init_model(root_path)
self._init_pipe()
self._init_encoder(f"{root_path}/label_encoder.pkl")
self.encoder = get_encoder(root_path)
def _init_model(self, path):
bert = BertForSequenceClassification.from_pretrained(path)
self.model = bert.to(self.device.type)
def _init_tokenizer(self):
model_name = 'bert-base-multilingual-cased'
self.tokenizer = BertTokenizer.from_pretrained(model_name)
def _init_pipe(self):
self.pipe = TextClassificationPipeline(
model=self.model,
@@ -43,10 +39,6 @@ class Classifier(WithGPU):
return_all_scores=True,
device=self.device)
def _init_encoder(self, path):
with open(path, 'rb') as pickled:
self.encoder = pickle.load(pickled)
def __call__(self, text_generator):
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
predictions = []
@@ -55,37 +47,8 @@ class Classifier(WithGPU):
predictions.append([int(byScoreDesc[0]['label'][6:]),
byScoreDesc[0]['score'],
int(byScoreDesc[1]['label'][6:])])
predictions = numpy.array(predictions)
return list(self.encoder.inverse_transform(predictions[:,0].astype(int)))
class Source:
"""
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path):
"""
Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes
"""
self.root_path = root_path
def path_to(self, record):
article_relative_path = "{work}/T{volume}/{article}".format(**record)
prefix = f"{self.root_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record.paragraph}.txt"
else:
return f"{prefix}.txt"
def load_text(self, record):
with open(self.path_to(record), 'r') as file:
return file.read()
def iterate(self, records):
for _, record in records.iterrows():
yield self.load_text(record)
return self.encoder.inverse_transform(
numpy.array(predictions)[:,0].astype(int))
def label(classify, source, tsv_path, name='label'):
"""
......
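Putting the pieces together, a hypothetical end-to-end call (the paths and the TSV layout are assumptions; the records TSV just needs the work/volume/article columns expected by Source, and the actual script may instead go through the label() helper shown above):

import pandas

classifier = Classifier('/path/to/fine-tuned-model')  # directory also holding label_encoder.pkl
source = Source('/path/to/corpus')

records = pandas.read_csv('/path/to/records.tsv', sep='\t')
# __call__ consumes the text generator and returns one decoded label per record
records['label'] = classifier(source.iterate(records))
records.to_csv('/path/to/records-labelled.tsv', sep='\t', index=False)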