From 2fbe96cd6743c805fce55467d18f8152ff86886c Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 13 Dec 2023 10:13:15 +0100
Subject: [PATCH] Moving python lib out of scripts directory

---
 {scripts/ML => lib/python}/BERT/Base.py       |  0
 {scripts/ML => lib/python}/BERT/Classifier.py |  0
 .../ML => lib/python/BERT}/LabeledData.py     |  0
 {scripts/ML => lib/python}/BERT/Trainer.py    | 13 ++--
 {scripts/ML => lib/python}/BERT/__init__.py   |  1 +
 lib/python/GEODE/Classification/__init__.py   | 19 ++++++
 .../GEODE/Classification}/discursive.py       |  4 +-
 lib/python/GEODE/Classification/domains.py    |  0
 .../python/GEODE/Metadata}/__init__.py        |  0
 lib/python/GEODE/Visualisation.py             | 12 ++++
 lib/python/GEODE/__init__.py                  |  4 ++
 lib/python/GEODE/signal.py                    | 24 +++++++
 .../ML => lib/python/GEODE/store}/Corpus.py   |  9 +--
 lib/python/GEODE/store/TSV.py                 | 24 +++++++
 lib/python/GEODE/store/__init__.py            |  9 +++
 {scripts/ML => lib/python}/JSONL.py           | 18 ++++-
 lib/python/split/Error.py                     | 18 +++++
 .../util.py => lib/python/split/__init__.py   |  4 ++
 manifest.scm                                  |  6 +-
 scripts/LGE/extract-from-source.sh            |  2 +-
 scripts/ML/GEODE/Error.py                     |  5 --
 scripts/ML/convert-corpus.py                  |  4 +-
 scripts/ML/evaluate.py                        | 18 ++---
 scripts/ML/loaders.py                         | 10 ---
 scripts/ML/predictMulti.py                    | 12 ++--
 scripts/ML/predictSimple.py                   |  9 ++-
 scripts/ML/prodigy-jsonl-to-tsv.hs            | 66 +++++++++++++++++--
 scripts/ML/prodigy-tsv-to-jsonl.hs            | 59 +++++++++++++----
 scripts/ML/prodigyAcceptedJSONLToTSV.py       | 12 ++--
 scripts/ML/prodigyMultiJSONLToDirectory.py    | 56 +++++++---------
 scripts/ML/simpleTrainOfMulti.py              | 26 --------
 scripts/ML/splitMulti.py                      | 51 +++++++-------
 scripts/ML/train.py                           |  9 ---
 scripts/ML/trainMultiBERT.py                  | 11 ++--
 scripts/ML/trainSimpleBERT.py                 | 13 ++--
 scripts/extract-from-source.sh                |  8 +--
 36 files changed, 358 insertions(+), 178 deletions(-)
 rename {scripts/ML => lib/python}/BERT/Base.py (100%)
 rename {scripts/ML => lib/python}/BERT/Classifier.py (100%)
 rename {scripts/ML => lib/python/BERT}/LabeledData.py (100%)
 rename {scripts/ML => lib/python}/BERT/Trainer.py (88%)
 rename {scripts/ML => lib/python}/BERT/__init__.py (70%)
 create mode 100644 lib/python/GEODE/Classification/__init__.py
 rename {scripts/ML/GEODE => lib/python/GEODE/Classification}/discursive.py (73%)
 create mode 100644 lib/python/GEODE/Classification/domains.py
 rename {scripts/ML/GEODE => lib/python/GEODE/Metadata}/__init__.py (100%)
 create mode 100644 lib/python/GEODE/Visualisation.py
 create mode 100644 lib/python/GEODE/__init__.py
 create mode 100644 lib/python/GEODE/signal.py
 rename {scripts/ML => lib/python/GEODE/store}/Corpus.py (94%)
 create mode 100644 lib/python/GEODE/store/TSV.py
 create mode 100644 lib/python/GEODE/store/__init__.py
 rename {scripts/ML => lib/python}/JSONL.py (57%)
 create mode 100644 lib/python/split/Error.py
 rename scripts/ML/GEODE/util.py => lib/python/split/__init__.py (79%)
 delete mode 100644 scripts/ML/GEODE/Error.py
 delete mode 100644 scripts/ML/loaders.py
 delete mode 100755 scripts/ML/simpleTrainOfMulti.py
 delete mode 100755 scripts/ML/train.py

diff --git a/scripts/ML/BERT/Base.py b/lib/python/BERT/Base.py
similarity index 100%
rename from scripts/ML/BERT/Base.py
rename to lib/python/BERT/Base.py
diff --git a/scripts/ML/BERT/Classifier.py b/lib/python/BERT/Classifier.py
similarity index 100%
rename from scripts/ML/BERT/Classifier.py
rename to lib/python/BERT/Classifier.py
diff --git a/scripts/ML/LabeledData.py b/lib/python/BERT/LabeledData.py
similarity index 100%
rename from scripts/ML/LabeledData.py
rename to lib/python/BERT/LabeledData.py
diff --git a/scripts/ML/BERT/Trainer.py b/lib/python/BERT/Trainer.py
similarity index 88%
rename from scripts/ML/BERT/Trainer.py
rename to lib/python/BERT/Trainer.py
index 70c2a8f..b094d14 100644
--- a/scripts/ML/BERT/Trainer.py
+++ b/lib/python/BERT/Trainer.py
@@ -1,6 +1,7 @@
 from BERT.Base import BERT
 import datetime
-from loaders import set_random
+import numpy
+import random
 import time
 import torch
 from torch.optim import AdamW
@@ -31,9 +32,13 @@ class Trainer(BERT):
             num_warmup_steps = 0, # Default value in run_glue.py
             num_training_steps = self.epochs * len(data_loader))
 
-    def __call__(self):
-        set_random()
-        losses = [self.epoch(e) for e in range(self.epochs)]
+    def __call__(self, seed_value=42):
+        random.seed(seed_value)
+        numpy.random.seed(seed_value)
+        torch.manual_seed(seed_value)
+        torch.cuda.manual_seed_all(seed_value)
+        for e in range(self.epochs):
+            self.epoch(e)
         self.save()
         print("\nTraining complete!")
 
diff --git a/scripts/ML/BERT/__init__.py b/lib/python/BERT/__init__.py
similarity index 70%
rename from scripts/ML/BERT/__init__.py
rename to lib/python/BERT/__init__.py
index 50cbcc1..ce7c99a 100644
--- a/scripts/ML/BERT/__init__.py
+++ b/lib/python/BERT/__init__.py
@@ -1,3 +1,4 @@
 from BERT.Base import BERT
 from BERT.Classifier import Classifier
+from BERT.LabeledData import LabeledData
 from BERT.Trainer import Trainer
diff --git a/lib/python/GEODE/Classification/__init__.py b/lib/python/GEODE/Classification/__init__.py
new file mode 100644
index 0000000..695af95
--- /dev/null
+++ b/lib/python/GEODE/Classification/__init__.py
@@ -0,0 +1,19 @@
+from GEODE.Classification.discursive import functions as discursiveFunctions
+
+knowledgeDomains = [ 'Agriculture',
+                     'Beaux-arts',
+                     'Belles-lettres',
+                     'Chasse',
+                     'Commerce',
+                     'Droit Jurisprudence',
+                     'Géographie',
+                     'Histoire',
+                     'Histoire naturelle',
+                     'Médecine',
+                     'Métiers',
+                     'Militaire',
+                     'Musique',
+                     'Philosophie',
+                     'Physique',
+                     'Politique',
+                     'Religion' ]
diff --git a/scripts/ML/GEODE/discursive.py b/lib/python/GEODE/Classification/discursive.py
similarity index 73%
rename from scripts/ML/GEODE/discursive.py
rename to lib/python/GEODE/Classification/discursive.py
index 60a958e..3161984 100644
--- a/scripts/ML/GEODE/discursive.py
+++ b/lib/python/GEODE/Classification/discursive.py
@@ -1,4 +1,4 @@
-functions = {'Historical narrative',
+functions = ['Historical narrative',
              'People narrative',
              'Critical',
              'Description',
@@ -6,4 +6,4 @@ functions = {'Historical narrative',
              'Example',
              'Reasoning',
              'Quotation',
-             'Prescriptive'}
+             'Prescriptive']
diff --git a/lib/python/GEODE/Classification/domains.py b/lib/python/GEODE/Classification/domains.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/ML/GEODE/__init__.py b/lib/python/GEODE/Metadata/__init__.py
similarity index 100%
rename from scripts/ML/GEODE/__init__.py
rename to lib/python/GEODE/Metadata/__init__.py
diff --git a/lib/python/GEODE/Visualisation.py b/lib/python/GEODE/Visualisation.py
new file mode 100644
index 0000000..c60b3ab
--- /dev/null
+++ b/lib/python/GEODE/Visualisation.py
@@ -0,0 +1,12 @@
+from GEODE.store import prepare
+import matplotlib.pyplot as plot
+import seaborn
+
+def heatmap(matrix, filePath, labels, **kwargs):
+    plot.figure(figsize=(16,13))
+    if 'cmap' not in kwargs:
+        kwargs['cmap'] = 'Blues'
+    ax = seaborn.heatmap(
+        matrix, xticklabels=labels, yticklabels=labels, **kwargs
+    )
+    plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
diff --git a/lib/python/GEODE/__init__.py b/lib/python/GEODE/__init__.py
new file mode 100644
index 0000000..fd7e6c6
--- /dev/null
+++ b/lib/python/GEODE/__init__.py
@@ -0,0 +1,4 @@
+from GEODE.Classification import discursiveFunctions
+from GEODE.Metadata import article, paragraph, relativePath, toKey, uid
+from GEODE.store import corpus, Directory, SelfContained, toTSV
+from GEODE.Visualisation import heatmap
diff --git a/lib/python/GEODE/signal.py b/lib/python/GEODE/signal.py
new file mode 100644
index 0000000..1e2fa18
--- /dev/null
+++ b/lib/python/GEODE/signal.py
@@ -0,0 +1,24 @@
+import math
+
+def curry(f):
+    return lambda x: (lambda *args: f(x, *args))
+
+def gate(n, size, offset=0):
+    return [1 if i == n else 0 for i in range(offset, offset+size)]
+
+@curry
+def orientedIntersection(l, sNew, sOld):
+    left = max(sNew*l[0], sOld*l[1])
+    right = min((sNew+1)*l[0], (sOld+1)*l[1])
+    return max(right-left, 0)
+
+@curry
+def resample(newSize, distribution):
+    oldSize = len(distribution)
+    lcm = math.lcm(newSize, oldSize)
+    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
+    ratio = oldSize / newSize
+    for i in range(newSize):
+        yield oldSize/lcm*sum([distribution[j]*intersection(i, j)
+                               for j in range(math.floor(i*ratio),
+                                              math.ceil((i+1)*ratio))])
diff --git a/scripts/ML/Corpus.py b/lib/python/GEODE/store/Corpus.py
similarity index 94%
rename from scripts/ML/Corpus.py
rename to lib/python/GEODE/store/Corpus.py
index 5abf5b3..e72b8b6 100644
--- a/scripts/ML/Corpus.py
+++ b/lib/python/GEODE/store/Corpus.py
@@ -1,4 +1,5 @@
-from GEODE import fromKey, relativePath
+from GEODE.Metadata import fromKey, relativePath
+from GEODE.store.TSV import toTSV
 import pandas
 from os import makedirs
 from os.path import dirname, isdir
@@ -54,7 +55,7 @@ class TSVIndexed(Corpus):
 
     def full(self, key, row):
         d = self.key(key, row)
-        d[self.column_name] = self.content(key, row).strip() + '\n'
+        d[self.column_name] = self.content(key, row).strip()
         return d
 
     def get_all(self, projector=None, where=None):
@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed):
     def save(self, iterator):
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
-       self.data.to_csv(self.tsv_path, sep='\t', index=False)
+       toTSV(self.tsv_path, self.data)
 
 class Directory(TSVIndexed):
     """
@@ -144,7 +145,7 @@ class Directory(TSVIndexed):
         self.detect_keys()
         for _, row in self.data.iterrows():
             self.write_text(row, row[self.column_name])
-        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
+        toTSV(self.tsv_path, self.data[self.keys])
 
 def corpus(path, **kwargs):
     if path[-1:] == '/' or isdir(path):
diff --git a/lib/python/GEODE/store/TSV.py b/lib/python/GEODE/store/TSV.py
new file mode 100644
index 0000000..3967be3
--- /dev/null
+++ b/lib/python/GEODE/store/TSV.py
@@ -0,0 +1,24 @@
+from GEODE.signal import curry
+from numpy import vectorize
+import pandas
+
+@curry
+def toStrKey(areParagraphs, row):
+    key = "{work}_{volume:02d}_{article:04d}"
+    if areParagraphs:
+        key += "_{paragraph:04d}"
+    return key.format(**row)
+
+def forPanda(data, f):
+    return vectorize(lambda i: f(data.iloc[i]))
+
+def toTSV(filePath, data, sortBy='toStrKey'):
+    if type(data) != pandas.DataFrame:
+        data = pandas.DataFrame(data)
+    if sortBy == 'toStrKey':
+        sortBy = toStrKey('paragraph' in data)
+    if sortBy is None:
+        sortedData = data
+    else:
+        sortedData = data.sort_index(key=forPanda(data, sortBy))
+    sortedData.to_csv(filePath, sep='\t', index=False)
diff --git a/lib/python/GEODE/store/__init__.py b/lib/python/GEODE/store/__init__.py
new file mode 100644
index 0000000..ebbd38d
--- /dev/null
+++ b/lib/python/GEODE/store/__init__.py
@@ -0,0 +1,9 @@
+from GEODE.store.Corpus import corpus, Directory, SelfContained
+from GEODE.store.TSV import toTSV
+import os
+import os.path
+
+def prepare(path):
+    if '/' in path:
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+    return path
diff --git a/scripts/ML/JSONL.py b/lib/python/JSONL.py
similarity index 57%
rename from scripts/ML/JSONL.py
rename to lib/python/JSONL.py
index 07b2aaf..4e05d05 100644
--- a/scripts/ML/JSONL.py
+++ b/lib/python/JSONL.py
@@ -1,4 +1,5 @@
 import json
+import sys
 
 def load(file_path):
     if type(file_path) == str:
@@ -9,7 +10,22 @@ def load(file_path):
     for line in file_path.readlines():
         yield json.loads(line)
 
+"""
+def load(file_path):
+    if type(file_path) == str:
+        with open(file_path, 'r') as input_file:
+            return list(loadObjects(input_file))
+    else:
+        return loadObjects(file_path)
+
+def loadObjects(input_file):
+    for line in input_file.readlines():
+        yield json.loads(line)
+"""
+
 def save(file_path, objects):
+    if file_path == '-':
+        file_path = sys.stdout
     if type(file_path) == str:
         with open(file_path, 'w') as output_file:
             saveObjects(output_file, objects)
@@ -18,5 +34,5 @@
 
 def saveObjects(output_file, objects):
     for obj in objects:
-        json.dump(obj, output_file)
+        json.dump(obj, output_file, separators=(',', ':'))
         print(file=output_file)
diff --git a/lib/python/split/Error.py b/lib/python/split/Error.py
new file mode 100644
index 0000000..867e5b8
--- /dev/null
+++ b/lib/python/split/Error.py
@@ -0,0 +1,18 @@
+from GEODE import uid
+
+def getUID(annotation):
+    return uid(annotation['meta'])
+
+def UnknownAnswer(annotation, answer):
+    print(f"Unsupported answer '{answer}' for annotation {getUID(annotation)}")
+
+def TwoAnnotations(annotation, first, second):
+    print(f"Found two annotations for {getUID(annotation)}: " +
+          f"'{first}' and '{second}'")
+
+def Contradiction(annotation, label):
+    print(f"Contradiction found for {getUID(annotation)}: " +
+          f"function {label} should be both accepted and rejected")
+
+def NoLabelLeft(text):
+    print(f"No possible function left for {uid(text)}")
diff --git a/scripts/ML/GEODE/util.py b/lib/python/split/__init__.py
similarity index 79%
rename from scripts/ML/GEODE/util.py
rename to lib/python/split/__init__.py
index e9945b1..9643512 100644
--- a/scripts/ML/GEODE/util.py
+++ b/lib/python/split/__init__.py
@@ -12,3 +12,7 @@ def checkBound(f):
 
 def parseRatio(s):
     return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s))
+def toIterator(*args):
+    for arg in args:
+        for elem in arg:
+            yield elem
diff --git a/manifest.scm b/manifest.scm
index 1234356..4627992 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -9,9 +9,10 @@
                 ghc-hs-conllu
                 ghc-random
                 ghc-regex-tdfa))
+  ((gnu packages machine-learning) #:select (python-scikit-learn python-spacy))
   ((gnu packages python) #:select (python))
   ((gnu packages python-science) #:select (python-pandas))
-  ((gnu packages python-xyz) #:select (python-beautifulsoup4))
+  ((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn))
   ((gnu packages xml) #:select (python-lxml)))
 
 ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
@@ -40,6 +41,9 @@
         ;python-edda ; TODO
         python-lxml ; fusion articles into tomes for TXM
         python-pandas ; working with CSV in python
+        python-scikit-learn ; evaluating models
+        python-seaborn ; draw figures
+        python-spacy ; working with prodigy's custom formats
         python-stanza ; annotation
         sed ; select files from listing
         stanza-fr ; annotation
diff --git a/scripts/LGE/extract-from-source.sh b/scripts/LGE/extract-from-source.sh
index 75a8883..931e250 100755
--- a/scripts/LGE/extract-from-source.sh
+++ b/scripts/LGE/extract-from-source.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-source ${0%/*}/../lib.sh
+source ${0%%/*}/lib/bash.sh
 
 if [ "$#" != 2 ]
 then
diff --git a/scripts/ML/GEODE/Error.py b/scripts/ML/GEODE/Error.py
deleted file mode 100644
index 09cd467..0000000
--- a/scripts/ML/GEODE/Error.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from GEODE import uid
-
-def TwoAnnotations(text, first, second):
-    textUID = text if type(text) == str else uid(text)
-    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
diff --git a/scripts/ML/convert-corpus.py b/scripts/ML/convert-corpus.py
index a37fb2c..6b2b703 100755
--- a/scripts/ML/convert-corpus.py
+++ b/scripts/ML/convert-corpus.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python3
-from Corpus import corpus
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+from GEODE import corpus
 import sys
 
 if __name__ == '__main__':
diff --git a/scripts/ML/evaluate.py b/scripts/ML/evaluate.py
index 104cdad..27d05f0 100755
--- a/scripts/ML/evaluate.py
+++ b/scripts/ML/evaluate.py
@@ -1,7 +1,7 @@
-#!/usr/bin/env python3
-from EDdA.classification import heatmap
-from EDdA.store import preparePath
-import GEODE.discursive as discursive
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from GEODE.Visualisation import heatmap
+from GEODE.Classification import discursiveFunctions, knowledgeDomains
 import pandas
 from sklearn.metrics import classification_report, confusion_matrix
 from sys import argv
@@ -9,11 +9,13 @@ from sys import argv
 def evaluate(truth, predictions, outputDirectory):
     matrix = confusion_matrix(truth,
                               predictions,
-                              labels=list(discursive.functions),
+                              labels=knowledgeDomains,
+                              #labels=discursiveFunctions
                               normalize='true')
     heatmap(matrix,
-            preparePath(f"{outputDirectory}/confusion.png"),
-            labels=discursive.functions)
+            f"{outputDirectory}/confusion.png",
+            labels=knowledgeDomains)
+            #labels=discursiveFunctions)
     with open(f"{outputDirectory}/report.json", 'w') as json:
         print(classification_report(truth, predictions, output_dict=True),
               file=json)
@@ -24,4 +26,4 @@
 if __name__ == '__main__':
     truth = pandas.read_csv(argv[1], sep='\t')
     predictions = pandas.read_csv(argv[2], sep='\t')
-    evaluate(truth['paragraphFunction'], predictions['label'], argv[3])
+    evaluate(truth['super_domain'], predictions['label'], argv[3])
diff --git a/scripts/ML/loaders.py b/scripts/ML/loaders.py
deleted file mode 100644
index 93986f4..0000000
--- a/scripts/ML/loaders.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import numpy
-import random
-import torch
-
-def set_random():
-    seed_value = 42
-    random.seed(seed_value)
-    numpy.random.seed(seed_value)
-    torch.manual_seed(seed_value)
-    torch.cuda.manual_seed_all(seed_value)
diff --git a/scripts/ML/predictMulti.py b/scripts/ML/predictMulti.py
index 80d7eaa..93ab6ab 100755
--- a/scripts/ML/predictMulti.py
+++ b/scripts/ML/predictMulti.py
@@ -1,7 +1,7 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
 from BERT import Classifier
-from Corpus import corpus
-import GEODE.discursive as discursive
+from GEODE import corpus, discursiveFunctions, toTSV
 import pandas
 from sys import argv
 
@@ -10,12 +10,12 @@ def rateClass(name, answer, score):
 
 def combine(row):
     classes = [(name, row[name], row[name + 'Score'])
-               for name in discursive.functions]
+               for name in discursiveFunctions]
     return max(classes, key=lambda c: rateClass(*c))[0]
 
 def label(modelsRoot, source):
     records = pandas.DataFrame(source.get_all('key'))
-    for name in discursive.functions:
+    for name in discursiveFunctions:
         classify = Classifier(f"{modelsRoot}/{name}")
         content = source.get_all('content')
         records[name], records[name + 'Score'] = classify(content)
@@ -23,4 +23,4 @@
     return records
 
 if __name__ == '__main__':
-    label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(argv[1], corpus(argv[2])))
diff --git a/scripts/ML/predictSimple.py b/scripts/ML/predictSimple.py
index 5ba28ad..3f511c8 100755
--- a/scripts/ML/predictSimple.py
+++ b/scripts/ML/predictSimple.py
@@ -1,7 +1,8 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
 from BERT import Classifier
+from GEODE import corpus, toTSV
 import pandas
-from Corpus import corpus
 from sys import argv
 
 def label(classify, source, name='label'):
@@ -26,6 +27,4 @@
     return records
 
 if __name__ == '__main__':
-    classify = Classifier(argv[1])
-    source = corpus(argv[2])
-    label(classify, source).to_csv(argv[3], sep='\t', index=False)
+    toTSV(argv[3], label(Classifier(argv[1]), corpus(argv[2])))
diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs
index d26e850..c37c505 100755
--- a/scripts/ML/prodigy-jsonl-to-tsv.hs
+++ b/scripts/ML/prodigy-jsonl-to-tsv.hs
@@ -1,17 +1,38 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
 {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
-import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
-import Data.Aeson.Types (prependFailure, typeMismatch)
+import Control.Applicative ((<|>))
+import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT)
+import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode)
+import Data.Aeson.Types (Parser, prependFailure, typeMismatch)
 import Data.ByteString.Lazy as BS (null, readFile, split)
 import Data.ByteString.Lazy.Char8 as BS (unpack)
+import Data.String (IsString(..))
+import Data.Text as Text (Text)
 import Data.Vector as Vector (head)
-import GEODE.Metadata (type (@)(..), tsvFile)
+import GEODE.Metadata (type (@)(..), Record(..), tsvFile)
 import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
 import System.Environment (getArgs)
 import System.Script (try, syntax, warn)
+import Text.Printf (printf)
 
 data Row = Unclassified String | Full ClassifiedParagraph
 instance {-# OVERLAPS #-} FromJSON Row where
+{-
+  parseJSON o@(Object v) = do
+    paragraphMeta@(paragraphRecord :@: _) <- v .: "meta"
+    --classified <- v .: "accept" >>= parseClassification
+    classified <- v .: "label"
+    pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified
+    where
+      parseClassification = withArray "Classification" singleValue
+      singleValue a
+        | not $ Prelude.null a =
+            withText "domain" (pure . Right . Classification) (Vector.head a)
+      singleValue _ = pure $ Left
+        ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now")
+      debug record =
+        "Looks like " ++ uid record ++ " was not classified, ignoring for now"
+
   parseJSON o@(Object v) = do
     paragraphMeta <- v .: "meta" >>= parseJSON
     classified <- v .: "accept" >>= parseClassification
@@ -24,10 +45,47 @@ instance {-# OVERLAPS #-} FromJSON Row where
       singleValue _ = pure $ Left
        ("Looks like " ++ debug ++ " was not classified, ignoring for now")
      debug = BS.unpack $ encode o
+-}
+
+  parseJSON = withObject "Row" parseRow
+    where
+      parseRow o = do
+        paragraphMeta <- o .: "meta"
+        getRow paragraphMeta
+          <$> runExceptT (classification o)
+      getRow paragraphMeta@(paragraphRecord :@: _) = either
+        (Unclassified . debug paragraphRecord)
+        (Full . (paragraphMeta :@:) . Classification)
+      classification :: Object -> ExceptT String Parser Text
+      classification o = do
+        getTextField "answer" o >>= isAccept
+        getTextField "label" o
+      --o .: "label" >>= withText "label" pure
+      --checkAnswer o = ExceptT
+      --  ((o .: "answer" >>= withText "answer" (pure . isAccept))
+      --   <|> pure (Left "answer field is missing"))
+      isAccept "accept" = pure ()
+      isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s
+      --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s
+      debug record = printf "Ignoring %s: %s" (uid record)
+
+getTextField :: String -> Object -> ExceptT String Parser Text
+getTextField name o = getField >>= ensureIsText
+  where
+    getField :: ExceptT String Parser Value
+    getField = ExceptT $
+      (Right <$> (o .: fromString name)) <|> catch "is missing"
+    ensureIsText :: Value -> ExceptT String Parser Text
+    ensureIsText v = ExceptT $
+      withText name (pure . Right) v <|> catch "is not text"
+    catch :: String -> Parser (Either String a)
+    catch = pure . Left . printf "%s field %s" name
+{-
 
   parseJSON invalid =
-    prependFailure "parsing ClassifiedParagraph failed, "
+    prependFailure "parsing Row failed, "
       (typeMismatch "Object" invalid)
+-}
 
 logIgnored :: [Row] -> IO [ClassifiedParagraph]
 logIgnored = foldr keepFull (pure [])
diff --git a/scripts/ML/prodigy-tsv-to-jsonl.hs b/scripts/ML/prodigy-tsv-to-jsonl.hs
index 51cf7a6..8d6e421 100755
--- a/scripts/ML/prodigy-tsv-to-jsonl.hs
+++ b/scripts/ML/prodigy-tsv-to-jsonl.hs
@@ -1,12 +1,15 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
 import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
 import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
+import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
 import Data.Text (Text)
 import Data.Text.IO as Text (readFile)
-import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
-import GEODE.Metadata.ProdigyMeta
-  (Classification(..), ClassifiedParagraph, ParagraphMeta)
+import GEODE.Metadata
+  ( type (@)(..), DefaultFields(..), HasDefaultHeader(..), Record(..)
+  , readNamedTsv )
+import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
+-- (Classification(..), ClassifiedParagraph, ParagraphMeta)
 import GHC.Generics (Generic)
 import System.Environment (getArgs)
 import System.FilePath ((</>))
@@ -15,21 +18,51 @@ import System.Script (syntax, try)
 data Paragraph = Paragraph
   { text :: Text
   , meta :: ParagraphMeta
-  , accept :: [Text] } deriving Generic
+  --, label :: Text
+  } deriving Generic
+
+newtype DatasetContent = DatasetContent { content :: Text } deriving Generic
+
+instance ToNamedRecord DatasetContent
+instance FromNamedRecord DatasetContent
+instance HasDefaultHeader DatasetContent where
+  defaultFields = DefaultFields ["content"]
+
+type DatasetRow = ParagraphMeta @ DatasetContent
+
+{-
+data Paragraph = Paragraph
+  { text :: Text
+  , meta :: ParagraphMeta
+  , accept :: [Text]
+  } deriving Generic
+-}
 
 instance ToJSON Paragraph where
   toEncoding = genericToEncoding defaultOptions
 
-loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
-loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+--loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
+fromFile :: FilePath -> ParagraphMeta -> IO Paragraph
+--loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
+fromFile source meta@(paragraphRecord :@: _) = do
   text <- Text.readFile (source </> relativePath paragraphRecord "txt")
-  pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
+  --pure $ Paragraph {text, meta, accept = []}
+  --pure $ Paragraph {text, meta, accept = [paragraphFunction classification], answer = "accept"}
+  --pure $ Paragraph {text, meta, label = paragraphFunction classification}
+  pure $ Paragraph {text, meta}
+
+fromRow :: DatasetRow -> IO Paragraph
+fromRow (meta :@: (DatasetContent {content})) =
+  pure $ Paragraph {text = content, meta}
 
 main :: IO ()
 main = getArgs >>= run
   where
-    run [inputMeta, source] =
-      try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
-    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
-    printJSON source parMeta =
-      loadParagraph source parMeta >>= ByteString.putStrLn . encode
+    run [dataset] = f fromRow dataset
+    run [inputMeta, source] = f (fromFile source) inputMeta
+--      try (readNamedTsv inputMeta) >>= mapM_ (printJSON source)
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY | TSV_DATASET"
+    f loader input =
+      try (readNamedTsv input) >>= mapM_ (\row -> loader row >>= ByteString.putStrLn . encode)
+    --printJSON source parMeta =
+    --  loadParagraph source parMeta >>= ByteString.putStrLn . encode
diff --git a/scripts/ML/prodigyAcceptedJSONLToTSV.py b/scripts/ML/prodigyAcceptedJSONLToTSV.py
index 2395caa..d0fd491 100755
--- a/scripts/ML/prodigyAcceptedJSONLToTSV.py
+++ b/scripts/ML/prodigyAcceptedJSONLToTSV.py
@@ -1,11 +1,11 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
 
-from GEODE import toKey
+from GEODE import toKey, toTSV
 import pandas
 import JSONL
 import sys
 
-def tsv_row(annotation):
+def toRow(annotation):
     return {'work': annotation['meta']['work'],
             'volume': annotation['meta']['volume'],
             'article': annotation['meta']['article'],
@@ -15,9 +15,9 @@
             'paragraph': annotation['meta']['paragraph'],
             'text': annotation['text'],
             'paragraphFunction': annotation['label']
            }
 
 def acceptedToTSV(inputJSONL, outputTSV):
-    annotations = pandas.DataFrame(
-        sorted([tsv_row(a) for a in inputJSONL], key=toKey))
-    annotations.to_csv(outputTSV, sep='\t', index=False)
+    toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'])
+    #toTSV(outputTSV, [toRow(a) for a in inputJSONL if a['answer'] == 'accept'],
+    #      sortBy=None)
 
 if __name__ == '__main__':
     acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/prodigyMultiJSONLToDirectory.py b/scripts/ML/prodigyMultiJSONLToDirectory.py
index cf3ccae..0c1227e 100755
--- a/scripts/ML/prodigyMultiJSONLToDirectory.py
+++ b/scripts/ML/prodigyMultiJSONLToDirectory.py
@@ -1,16 +1,12 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
 
-from Corpus import Directory
-from GEODE import toKey, uid
-import GEODE.discursive as discursive
-from GEODE.util import initialise
-import pandas
+from GEODE import Directory, discursiveFunctions, toKey, toTSV, uid
+from split import initialise, toIterator
+from split.Error import Contradiction, NoLabelLeft, TwoAnnotations, UnknownAnswer
+from unbalanceLimiter import unbalanceLimiter
 import JSONL
 import sys
 
-def subDict(d, keys):
-    return {key: d[key] for key in keys}
-
 def initialiseTexts(texts, key, annotation):
     initialise(texts,
                key,
@@ -26,7 +22,7 @@
         answer = annotation['answer']
         initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
         if answer not in labels[label]:
-            print(f"Unsupported answer '{answer}' for annotation {annotation}")
+            UnknownAnswer(annotation, answer)
         else:
             labels[label][answer].append(annotation)
     return labels
@@ -34,7 +30,6 @@
-def erase(texts, error, key, reason):
+def erase(texts, error, key):
     error[key] = texts[key]['row']
     del texts[key]
-    print(reason)
 
 def accept(texts, errors, label, accepted):
     for annotation in accepted:
@@ -43,8 +38,8 @@
         initialiseTexts(texts, key, annotation)
         previous = texts[key]['accept']
         if previous is not None:
-            reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
-            erase(texts, errors, key, reason)
+            TwoAnnotations(annotation, previous, label)
+            erase(texts, errors, key)
         else:
             texts[key]['accept'] = label
 
@@ -55,19 +50,21 @@
         initialiseTexts(texts, key, annotation)
         previous = texts[key]['accept']
         if previous is not None and previous == label:
-            erase(texts, errors, key, f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected")
+            Contradiction(annotation, label)
+            erase(texts, errors, key)
        else:
            texts[key]['reject'].add(label)
 
 def checkRejects(texts, errors):
     for key, text in texts.items():
         countRejected = len(text['reject'])
-        countFunctions = len(discursive.functions)
+        countFunctions = len(discursiveFunctions)
         if countRejected == countFunctions:
-            reason = f"No possible function left for {uid(text['row'])}"
f"No possible function left for {uid(text['row'])}" - erase(texts, errors, key, reason) + NoLabelLeft(text['row']) + erase(texts, errors, key) elif text['accept'] is None and countRejected == countFunctions - 1: - text['accept'] = discursive.functions.difference(text['reject']).pop() + free = set(discursiveFunctions).difference(text['reject']) + text['accept'] = free.pop() print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected") def byText(byLabelAnnotations): @@ -79,19 +76,9 @@ def byText(byLabelAnnotations): checkRejects(texts, errors) return texts.values(), errors.values() -def toTsv(filePath, data): - rows = sorted(data, key=toKey) - pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False) - -def toIterator(*args): - for arg in args: - for elem in arg: - yield elem - def exportCorpus(rootDirectory, texts, errors): corpus = Directory(rootDirectory) - corpus.save(sorted(toIterator([t['row'] for t in texts], errors), - key=toKey)) + corpus.save(toIterator([t['row'] for t in texts], errors)) def indexByKey(annotations): return {toKey(annotation['meta']): annotation for annotation in annotations} @@ -110,16 +97,19 @@ def toRow(answer): def exportLabels(rootDirectory, labels): for label, answers in labels.items(): - toTsv(f"{rootDirectory}/{label}.tsv", - toIterator(map(toRow('accept'), answers['accept']), - map(toRow('reject'), allRejects(labels, label)))) + toTSV(f"{rootDirectory}/{label}.tsv", + unbalanceLimiter( + toIterator(map(toRow('accept'), answers['accept']), + map(toRow('reject'), allRejects(labels, label))), + maxRatio=4, + attribute='answer')) def multiJSONLToDirectory(jsonl, outputDirectory): byLabelAnnotations = byLabel(jsonl) texts, errors = byText(byLabelAnnotations) exportCorpus(outputDirectory, texts, errors) if len(errors) > 0: - toTsv(f"{outputDirectory}/errors.tsv", errors) + toTSV(f"{outputDirectory}/errors.tsv", errors) exportLabels(outputDirectory, byLabelAnnotations) if __name__ == '__main__': diff --git a/scripts/ML/simpleTrainOfMulti.py b/scripts/ML/simpleTrainOfMulti.py deleted file mode 100755 index 5f80001..0000000 --- a/scripts/ML/simpleTrainOfMulti.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -from Corpus import Directory, SelfContained -from GEODE import fromKey, toKey -import GEODE.discursive as discursive -from prodigyAcceptedJSONLToTSV import acceptedToTSV -from sys import argv - -def isAccepted(key, row): - return row['answer'] == 'accept' - -def withLabel(corpus, label): - return lambda key, row: dict(**corpus.full(key, row) - , paragraphFunction=label) - -def simpleTrainOfMulti(multiDirectory, outputTSV): - annotations = [] - for className in discursive.functions: - corpus = Directory(multiDirectory, tsv_filename=className) - p = withLabel(corpus, className) - annotations += list(corpus.get_all(projector=p, where=isAccepted)) - output = SelfContained(outputTSV) - output.save(sorted(annotations, key=toKey)) - -if __name__ == '__main__': - simpleTrainOfMulti(argv[1], argv[2]) diff --git a/scripts/ML/splitMulti.py b/scripts/ML/splitMulti.py index d5cd2a7..9de1423 100755 --- a/scripts/ML/splitMulti.py +++ b/scripts/ML/splitMulti.py @@ -1,13 +1,11 @@ -#!/usr/bin/env python3 -from Corpus import Directory +#!/usr/bin/env -S PYTHONPATH=lib/python python3 + from GEODE import toKey -from GEODE.Error import TwoAnnotations -from GEODE.util import initialise, parseRatio +from split import initialise, parseRatio +from split.Error import TwoAnnotations import JSONL from random import shuffle from sys 
-from prodigyAcceptedJSONLToTSV import acceptedToTSV
-from prodigyMultiJSONLToDirectory import multiJSONLToDirectory
 
 def getTexts(inputJSONL):
     texts = {}
@@ -16,19 +14,20 @@
         key = toKey(annotation['meta'])
         if key not in errors:
             initialise(texts, key, {'accept': None, 'reject': []})
-            if annotation['answer'] == 'accept':
-                previous = texts[key]['accept']
-                if previous is None:
-                    texts[key]['accept'] = annotation
-                else:
-                    print(TwoAnnotations(annotations['meta'],
-                                         previous['label'],
-                                         texts[key]['label']))
-                    errors.add(key)
-            else:
-                texts[key]['reject'].append(annotation)
+            sortByAnswer(texts, errors, key, annotation)
     return texts
 
+def sortByAnswer(texts, errors, key, annotation):
+    if annotation['answer'] == 'accept':
+        previous = texts[key]['accept']
+        if previous is None:
+            texts[key]['accept'] = annotation
+        else:
+            TwoAnnotations(annotation, previous['label'], annotation['label'])
+            errors.add(key)
+    else:
+        texts[key]['reject'].append(annotation)
+
 def getTest(texts, trainRatio):
     accepted = [key for key, t in texts.items() if t['accept'] is not None]
     shuffle(accepted)
@@ -42,16 +41,22 @@ def allAnnotations(text):
     return [text['accept']] + text['reject']
 
 def getTrain(texts, test):
-    return [annotation
-            for key in sorted(texts.keys()) if key not in test
-            for annotation in allAnnotations(texts[key])]
+    train = []
+    waste = []
+    for key in sorted(texts.keys()):
+        if key not in test:
+            train += allAnnotations(texts[key])
+        else:
+            waste += texts[key]['reject']
+    return train, waste
 
 def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
     texts = getTexts(jsonl)
     test = getTest(texts, trainRatio)
-    train = getTrain(texts, test)
-    multiJSONLToDirectory(train, trainOutput)
-    acceptedToTSV(test.values(), testOutput)
+    train, waste = getTrain(texts, test)
+    print(f"{len(waste)} negative annotations about texts in the test set have been discarded from the training one")
+    JSONL.save(trainOutput, train)
+    JSONL.save(testOutput, test.values())
 
 if __name__ == '__main__':
     splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
diff --git a/scripts/ML/train.py b/scripts/ML/train.py
deleted file mode 100755
index 95b812f..0000000
--- a/scripts/ML/train.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python3
-from BERT import Trainer
-from LabeledData import LabeledData
-import sys
-
-if __name__ == '__main__':
-    labeled_data = LabeledData(sys.argv[1])
-    trainer = Trainer(sys.argv[2], labeled_data)
-    trainer()
diff --git a/scripts/ML/trainMultiBERT.py b/scripts/ML/trainMultiBERT.py
index 43cc10b..f8eae39 100755
--- a/scripts/ML/trainMultiBERT.py
+++ b/scripts/ML/trainMultiBERT.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python3
-from BERT import BERT, Trainer
-from Corpus import Directory
-import GEODE.discursive as discursive
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from BERT import BERT, LabeledData, Trainer
+from GEODE import Directory, discursiveFunctions
 import os
 import sys
 
@@ -20,5 +19,5 @@ def trainSubClassifier(trainRoot, modelRoot, className):
     trainer()
 
 if __name__ == '__main__':
-    for className in discursive.functions:
+    for className in discursiveFunctions:
         trainSubClassifier(sys.argv[1], sys.argv[2], className)
diff --git a/scripts/ML/trainSimpleBERT.py b/scripts/ML/trainSimpleBERT.py
index d869b4a..28107d3 100755
--- a/scripts/ML/trainSimpleBERT.py
+++ b/scripts/ML/trainSimpleBERT.py
@@ -1,10 +1,13 @@
-#!/usr/bin/env python3
-from Corpus import corpus
-from BERT import Trainer
-from LabeledData import LabeledData
+#!/usr/bin/env -S PYTHONPATH=lib/python python3
+
+from BERT import Trainer, LabeledData
+from GEODE import corpus
+import os
 import sys
 
 if __name__ == '__main__':
     labeled_data = LabeledData(corpus(sys.argv[1]), "paragraphFunction")
-    trainer = Trainer(sys.argv[2], labeled_data)
+    modelPath = sys.argv[2]
+    os.makedirs(modelPath, exist_ok=True)
+    trainer = Trainer(modelPath, labeled_data)
     trainer()
diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh
index ba74ce0..f89f885 100755
--- a/scripts/extract-from-source.sh
+++ b/scripts/extract-from-source.sh
@@ -1,8 +1,8 @@
 #!/bin/sh
 
-BASE_DIR="${0%/*}"
+BASE_DIR="${0%%/*}"
 
-source ${BASE_DIR}/lib.sh
+source ${BASE_DIR}/lib/bash.sh
 
 if [ "$#" != 2 ]
 then
@@ -16,5 +16,5 @@ fi
 FILES_TSV="${TARGET}/files.tsv"
 
 printf "book tome rank headWord name page\n" > "${FILES_TSV}"
-${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
-${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/scripts/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
-- 
GitLab