From 7799b94cc843a88045840b6c797e6df99e43a46a Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Mon, 20 Nov 2023 18:38:52 +0100
Subject: [PATCH] Add predictor for MultiBERT model, improve the trainer for
 the same model and expose the predicted score in the model's output

---
 scripts/ML/BERT/Classifier.py               | 12 ++++------
 scripts/ML/Corpus.py                        | 21 ++++++-----------
 scripts/ML/predictMulti.py                  | 26 +++++++++++++++++++++
 scripts/ML/{predict.py => predictSimple.py} |  2 +-
 scripts/ML/prodigy-multi-jsonl-to-tsv.py    |  2 +-
 scripts/ML/trainMultiBERT.py                | 20 ++++++----------
 6 files changed, 47 insertions(+), 36 deletions(-)
 create mode 100755 scripts/ML/predictMulti.py
 rename scripts/ML/{predict.py => predictSimple.py} (92%)

diff --git a/scripts/ML/BERT/Classifier.py b/scripts/ML/BERT/Classifier.py
index 04bcffa..d5c1b50 100644
--- a/scripts/ML/BERT/Classifier.py
+++ b/scripts/ML/BERT/Classifier.py
@@ -24,16 +24,14 @@ class Classifier(BERT):
         self.pipe = TextClassificationPipeline(
             model=self.model,
             tokenizer=self.tokenizer,
-            return_all_scores=True,
+            top_k=1,
             device=self.device)
 
     def __call__(self, text_generator):
         tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
-        predictions = []
+        labels, scores = [], []
         for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
             byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
-            predictions.append([int(byScoreDesc[0]['label'][6:]),
-                                byScoreDesc[0]['score'],
-                                int(byScoreDesc[1]['label'][6:])])
-        return self.encoder.inverse_transform(
-                numpy.array(predictions)[:,0].astype(int))
+            labels.append(int(byScoreDesc[0]['label'][6:]))
+            scores.append(byScoreDesc[0]['score'])
+        return self.encoder.inverse_transform(labels), numpy.array(scores)
diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py
index 159092b..00d5d9f 100644
--- a/scripts/ML/Corpus.py
+++ b/scripts/ML/Corpus.py
@@ -1,3 +1,4 @@
+from GEODE import fromKey, relativePath
 import pandas
 from os import makedirs
 from os.path import dirname, isdir
@@ -103,34 +104,26 @@ class Directory(TSVIndexed):
     A class to handle the normalised path used in the project and loading the
     actual text input as a generator from records when they are needed
     """
-    def __init__(self, root_path, column_name='content'):
+    def __init__(self, root_path, tsv_filename="files", column_name='content'):
         """
         Positional arguments
         :param root_path: the path to a GÉODE-style folder containing the text
         version of the corpus on which to predict the classes
         """
         self.text_path = f"{root_path}/Text"
-        TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)
+        TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)
 
     def path_to(self, primary_key):
         record = self.dict_primary_key(primary_key)
-        article_relative_path = "{work}/T{volume}/{article}".format(**record)
-        prefix = f"{self.text_path}/{article_relative_path}"
-        if 'paragraph' in record:
-            return f"{prefix}/{record['paragraph']}.txt"
-        else:
-            return f"{prefix}.txt"
+        return f"{self.text_path}/{relativePath(record, 'txt')}"
 
     def dict_primary_key(self, primary_key):
         if type(primary_key) == pandas.core.series.Series:
             return dict(primary_key)
-        elif type(primary_key) != dict:
-            keys = self.default_keys.copy()
-            if len(primary_key) == 4:
-                keys.append('paragraph')
-            return dict(zip(keys, primary_key))
-        else:
+        elif type(primary_key) == dict:
             return primary_key
+        else:
+            return fromKey(primary_key)
 
     def get_text(self, primary_key):
         with open(self.path_to(primary_key), 'r') as file:
diff --git a/scripts/ML/predictMulti.py b/scripts/ML/predictMulti.py
new file mode 100755
index 0000000..80d7eaa
--- /dev/null
+++ b/scripts/ML/predictMulti.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+from BERT import Classifier
+from Corpus import corpus
+import GEODE.discursive as discursive
+import pandas
+from sys import argv
+
+def rateClass(name, answer, score):
+    return (1, score) if answer == 'accept' else (0, -score)
+
+def combine(row):
+    classes = [(name, row[name], row[name + 'Score'])
+               for name in discursive.functions]
+    return max(classes, key=lambda c: rateClass(*c))[0]
+
+def label(modelsRoot, source):
+    records = pandas.DataFrame(source.get_all('key'))
+    for name in discursive.functions:
+        classify = Classifier(f"{modelsRoot}/{name}")
+        content = source.get_all('content')
+        records[name], records[name + 'Score'] = classify(content)
+    records['label'] = records.apply(combine, axis=1)
+    return records
+
+if __name__ == '__main__':
+    label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False)
diff --git a/scripts/ML/predict.py b/scripts/ML/predictSimple.py
similarity index 92%
rename from scripts/ML/predict.py
rename to scripts/ML/predictSimple.py
index 2dfa3e8..5ba28ad 100755
--- a/scripts/ML/predict.py
+++ b/scripts/ML/predictSimple.py
@@ -22,7 +22,7 @@ def label(classify, source, name='label'):
     an additional column
     """
     records = pandas.DataFrame(source.get_all('key'))
-    records[name] = classify(source.get_all('content'))
+    records[name], records['score'] = classify(source.get_all('content'))
     return records
 
 if __name__ == '__main__':
diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
index c4b3c17..d555e74 100755
--- a/scripts/ML/prodigy-multi-jsonl-to-tsv.py
+++ b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
@@ -22,7 +22,7 @@ def initialiseTexts(texts, key, annotation):
                {'accept': None,
                 'reject': set(),
                 'row': dict(**annotation['meta'],
-                            content=annotation['text'])})
+                            content=annotation['text'].strip()+'\n')})
 
 def byLabel(annotations):
     labels = {}
diff --git a/scripts/ML/trainMultiBERT.py b/scripts/ML/trainMultiBERT.py
index 637ad29..43cc10b 100755
--- a/scripts/ML/trainMultiBERT.py
+++ b/scripts/ML/trainMultiBERT.py
@@ -6,24 +6,18 @@ from LabeledData import LabeledData
 import os
 import sys
 
-def split(columnName):
-    return {}
-
-def load(rootPath):
-    classes = {}
-    for f in os.listdir(rootPath):
-        if f[-4:] == '.tsv':
-            classes[f[:-4]] = f"{rootPath}/{f}"
-    return classes
-
 def trainSubClassifier(trainRoot, modelRoot, className):
     trainData = Directory(trainRoot, tsv_filename=className)
     labeled_data = LabeledData(trainData, "answer")
     subModelPath = f"{modelRoot}/{className}"
     os.makedirs(subModelPath, exist_ok=True)
-    os.symlink(f"../{BERT.encoder_file}", f"{subModelPath}/{BERT.encoder_file}")
-    trainer = Trainer(subModelPath, labeled_data)
-    trainer()
+    encoderLink = f"{subModelPath}/{BERT.encoder_file}"
+    if not os.path.islink(encoderLink):
+        os.symlink(f"../{BERT.encoder_file}", encoderLink)
+    if not os.path.exists(f"{subModelPath}/pytorch_model.bin"):
+        trainer = Trainer(subModelPath, labeled_data)
+        print(f"Training {className} to {subModelPath}")
+        trainer()
 
 if __name__ == '__main__':
     for className in discursive.functions:
-- 
GitLab