diff --git a/scripts/ML/GEODE/Error.py b/scripts/ML/GEODE/Error.py
new file mode 100644
index 0000000000000000000000000000000000000000..09cd467395fb772ed3fecd9b73f4c65613af8a81
--- /dev/null
+++ b/scripts/ML/GEODE/Error.py
@@ -0,0 +1,12 @@
+from GEODE import uid
+
+def TwoAnnotations(text, first, second):
+    """Build the message reporting two conflicting annotations for one text.
+
+    `text` is used verbatim when it is a string; any other value is passed to
+    `uid` to obtain the identifier shown in the message. `first` and `second`
+    are the two annotation values, interpolated as they are.
+    """
+    # isinstance, unlike an exact type() comparison, also accepts str subclasses
+    textUID = text if isinstance(text, str) else uid(text)
+    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
diff --git a/scripts/ML/GEODE/util.py b/scripts/ML/GEODE/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9945b1f7f8bb76a860b8a96369d315b1b23db15
--- /dev/null
+++ b/scripts/ML/GEODE/util.py
@@ -0,0 +1,29 @@
+import sys
+
+def initialise(dictionary, label, value):
+    """Set dictionary[label] to value unless label is already a key."""
+    if label not in dictionary:
+        dictionary[label] = value
+
+def checkBound(f):
+    """Return f unchanged if it lies within [0, 1], abort the program otherwise.
+
+    On an out-of-range value, the error message is written to stderr and the
+    process exits with status 1.
+    """
+    if 0 <= f <= 1:
+        return f
+    # sys.exit is always available, whereas the bare `exit` builtin is only
+    # installed by the site module; a string argument is printed to stderr
+    # and yields exit status 1.
+    sys.exit("Expected a ratio between 0 and 1 (inclusive)")
+
+def parseRatio(s):
+    """Parse a ratio given as a percentage ("80%") or as a number ("0.8").
+
+    The parsed value goes through checkBound, so only ratios within [0, 1]
+    are returned.
+    """
+    # float instead of int so that fractional percentages ("12.5%") are accepted
+    return checkBound(float(s[:-1]) / 100 if s.endswith('%') else float(s))
+
diff --git a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py b/scripts/ML/prodigyAcceptedJSONLToTSV.py
similarity index 66%
rename from scripts/ML/prodigy-accepted-jsonl-to-tsv.py
rename to scripts/ML/prodigyAcceptedJSONLToTSV.py
index 50fe598c986f253d564848cfe8a7b8910185c75a..2395caa68b01a53e89d72b50c9b6bb1a9f96fca1 100755
--- a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
+++ b/scripts/ML/prodigyAcceptedJSONLToTSV.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+from GEODE import toKey
 import pandas
 import JSONL
 import sys
@@ -13,11 +14,17 @@ def tsv_row(annotation):
             'paragraphFunction': annotation['label']
             }
 
-if __name__ == '__main__':
-    input_jsonl = sys.argv[1]
-    output_tsv = sys.argv[2]
-    annotations = pandas.DataFrame(
-            [tsv_row(a) for a in JSONL.load(input_jsonl)
-                        if a['answer'] == 'accept']
-            )
-    annotations.to_csv(output_tsv, sep='\t', index=False)
+def acceptedToTSV(inputJSONL, outputTSV):
+    """Write the annotations of inputJSONL to the file outputTSV.
+
+    Every annotation is turned into a row by tsv_row; the rows are sorted
+    using toKey as the sort key, then written tab-separated, without the
+    index column.
+    """
+    # NOTE(review): the rows are not filtered on answer == 'accept'; callers
+    # are presumably expected to pass accepted annotations only — confirm.
+    rows = sorted(map(tsv_row, inputJSONL), key=toKey)
+    pandas.DataFrame(rows).to_csv(outputTSV, sep='\t', index=False)
+
+if __name__ == '__main__':
+    acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigyMultiJSONLToDirectory.py
similarity index 95%
rename from scripts/ML/prodigy-multi-jsonl-to-tsv.py
rename to scripts/ML/prodigyMultiJSONLToDirectory.py
index d555e7453a439df1168275ac006366fbec2f2ef8..cf3ccae84f1cf3471ee3978fd0dc54a59cdad64c 100755
--- a/scripts/ML/prodigy-multi-jsonl-to-tsv.py
+++ b/scripts/ML/prodigyMultiJSONLToDirectory.py
@@ -3,19 +3,14 @@
 from Corpus import Directory
 from GEODE import toKey, uid
 import GEODE.discursive as discursive
+from GEODE.util import initialise
 import pandas
 import JSONL
 import sys
 
-binary = ['accept', 'reject']
-
 def subDict(d, keys):
     return {key: d[key] for key in keys}
 
-def initialise(dictionary, label, value):
-    if label not in dictionary:
-        dictionary[label] = value
-
 def initialiseTexts(texts, key, annotation):
     initialise(texts,
                key,
@@ -119,11 +114,21 @@ def exportLabels(rootDirectory, labels):
               toIterator(map(toRow('accept'), answers['accept']),
                          map(toRow('reject'), allRejects(labels, label))))
 
-if __name__ == '__main__':
-    byLabelAnnotations = byLabel(JSONL.load(sys.argv[1]))
-    texts, errors = byText(byLabelAnnotations)
-    outputDirectory = sys.argv[2]
-    exportCorpus(outputDirectory, texts, errors)
-    if len(errors) > 0:
-        toTsv(f"{outputDirectory}/errors.tsv", errors)
-    exportLabels(outputDirectory, byLabelAnnotations)
+def multiJSONLToDirectory(jsonl, outputDirectory):
+    """Export the annotations in jsonl under outputDirectory.
+
+    The annotations are passed to byLabel and that result to byText; the
+    texts obtained are handed to exportCorpus and the byLabel result to
+    exportLabels. If byText returned any errors, they are also written to
+    errors.tsv in outputDirectory.
+    """
+    annotationsByLabel = byLabel(jsonl)
+    texts, errors = byText(annotationsByLabel)
+    exportCorpus(outputDirectory, texts, errors)
+    if len(errors) > 0:
+        errorsPath = f"{outputDirectory}/errors.tsv"
+        toTsv(errorsPath, errors)
+    exportLabels(outputDirectory, annotationsByLabel)
+
+if __name__ == '__main__':
+    multiJSONLToDirectory(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/splitMulti.py b/scripts/ML/splitMulti.py
new file mode 100755
index 0000000000000000000000000000000000000000..d5cd2a70d5ea95c1d58e624d054cdfb76df77c3e
--- /dev/null
+++ b/scripts/ML/splitMulti.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+from Corpus import Directory
+from GEODE import toKey
+from GEODE.Error import TwoAnnotations
+from GEODE.util import initialise, parseRatio
+import JSONL
+from random import shuffle
+from sys import argv, stdin
+from prodigyAcceptedJSONLToTSV import acceptedToTSV
+from prodigyMultiJSONLToDirectory import multiJSONLToDirectory
+
+def getTexts(inputJSONL):
+    """Group the annotations of inputJSONL by text.
+
+    Returns a dict mapping each text key (toKey of the annotation's 'meta')
+    to {'accept': annotation or None, 'reject': [annotations]}, where
+    'reject' collects every annotation whose answer is not 'accept'. When a
+    text gets a second accepted annotation, the conflict is printed and the
+    later annotations of that text are ignored.
+    """
+    texts = {}
+    errors = set()
+    for annotation in inputJSONL:
+        key = toKey(annotation['meta'])
+        if key not in errors:
+            initialise(texts, key, {'accept': None, 'reject': []})
+            if annotation['answer'] == 'accept':
+                previous = texts[key]['accept']
+                if previous is None:
+                    texts[key]['accept'] = annotation
+                else:
+                    # Report both labels: the one already recorded and the
+                    # one carried by the current annotation.
+                    print(TwoAnnotations(annotation['meta'],
+                                         previous['label'],
+                                         annotation['label']))
+                    # NOTE(review): the text stays in texts with its first
+                    # accepted annotation — confirm this is intended.
+                    errors.add(key)
+            else:
+                texts[key]['reject'].append(annotation)
+    return texts
+
+def getTest(texts, trainRatio):
+    """Randomly draw the test set among the texts with an accepted annotation.
+
+    Returns a dict mapping the drawn keys to their accepted annotation; it
+    holds round(count * (1 - trainRatio)) of those texts.
+    """
+    accepted = [key for key, t in texts.items() if t['accept'] is not None]
+    shuffle(accepted)
+    size = round(len(accepted) * (1-trainRatio))
+    return {key: texts[key]['accept'] for key in accepted[:size]}
+
+def allAnnotations(text):
+    """List the annotations of a text, the accepted one (if any) first."""
+    if text['accept'] is None:
+        return text['reject']
+    else:
+        return [text['accept']] + text['reject']
+
+def getTrain(texts, test):
+    """List all the annotations of the texts whose key is not in test.
+
+    Texts are visited in sorted key order.
+    """
+    return [annotation
+            for key in sorted(texts) if key not in test
+            for annotation in allAnnotations(texts[key])]
+
+def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
+    """Split the annotations of jsonl into a train and a test set.
+
+    The train annotations are exported to trainOutput by
+    multiJSONLToDirectory, the accepted annotations of the test texts to
+    testOutput by acceptedToTSV.
+    """
+    texts = getTexts(jsonl)
+    test = getTest(texts, trainRatio)
+    train = getTrain(texts, test)
+    multiJSONLToDirectory(train, trainOutput)
+    acceptedToTSV(test.values(), testOutput)
+
+if __name__ == '__main__':
+    splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
diff --git a/scripts/ML/splitSimple.py b/scripts/ML/splitSimple.py
new file mode 100755
index 0000000000000000000000000000000000000000..4d0de41bed7f5e0181e786242408851b7518f45a
--- /dev/null
+++ b/scripts/ML/splitSimple.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+from GEODE.util import initialise, parseRatio
+import JSONL
+from prodigyAcceptedJSONLToTSV import acceptedToTSV
+from sys import argv, stdin
+
+def splitSimple(jsonl, trainRatio, trainOutput, testOutput):
+    """Cut jsonl in two and write each part to its own TSV file.
+
+    The first round(len(jsonl) * trainRatio) annotations, in input order, go
+    to trainOutput; the remaining ones go to testOutput.
+    """
+    cut = round(len(jsonl) * trainRatio)
+    for part, output in ((jsonl[:cut], trainOutput), (jsonl[cut:], testOutput)):
+        acceptedToTSV(part, output)
+
+if __name__ == '__main__':
+    # The annotations are materialised into a list because splitSimple needs
+    # len() and slicing.
+    splitSimple(list(JSONL.load(stdin)), parseRatio(argv[1]), argv[2], argv[3])