diff --git a/scripts/ML/GEODE/Error.py b/scripts/ML/GEODE/Error.py
new file mode 100644
index 0000000000000000000000000000000000000000..09cd467395fb772ed3fecd9b73f4c65613af8a81
--- /dev/null
+++ b/scripts/ML/GEODE/Error.py
@@ -0,0 +1,10 @@
+from GEODE import uid
+
+def TwoAnnotations(text, first, second):
+    """Build the message reporting two conflicting annotations of a text.
+
+    text is used as is when it is a string; anything else is passed to
+    uid() to obtain the identifier shown in the message.
+    """
+    textUID = text if isinstance(text, str) else uid(text)
+    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
diff --git a/scripts/ML/GEODE/util.py b/scripts/ML/GEODE/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9945b1f7f8bb76a860b8a96369d315b1b23db15
--- /dev/null
+++ b/scripts/ML/GEODE/util.py
@@ -0,0 +1,24 @@
+import sys
+
+def initialise(dictionary, label, value):
+    """Bind label to value in dictionary unless label is already present."""
+    dictionary.setdefault(label, value)
+
+def checkBound(f):
+    """Return f when it lies between 0 and 1 (inclusive).
+
+    Otherwise write an error message to stderr and exit with status 1.
+    """
+    if 0 <= f <= 1:
+        return f
+    sys.exit("Expected a ratio between 0 and 1 (inclusive)")
+
+def parseRatio(s):
+    """Parse a ratio written as a percentage ('80%') or a number ('0.8').
+
+    Percentages may be fractional ('12.5%'). The parsed value is
+    validated by checkBound.
+    """
+    # float() rather than int() so that fractional percentages are accepted
+    ratio = float(s[:-1]) / 100 if s.endswith('%') else float(s)
+    return checkBound(ratio)
diff --git a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py b/scripts/ML/prodigyAcceptedJSONLToTSV.py
similarity index 66%
rename from scripts/ML/prodigy-accepted-jsonl-to-tsv.py
rename to scripts/ML/prodigyAcceptedJSONLToTSV.py
index 50fe598c986f253d564848cfe8a7b8910185c75a..2395caa68b01a53e89d72b50c9b6bb1a9f96fca1 100755
--- a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
+++ b/scripts/ML/prodigyAcceptedJSONLToTSV.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+from GEODE import toKey
 import pandas
 import JSONL
 import sys
@@ -13,11 +14,11 @@ def tsv_row(annotation):
             'paragraphFunction': annotation['label']
         }
 
-if __name__ == '__main__':
-    input_jsonl = sys.argv[1]
-    output_tsv = sys.argv[2]
-    annotations = pandas.DataFrame(
-            [tsv_row(a) for a in JSONL.load(input_jsonl)
-                if a['answer'] == 'accept']
-            )
-    annotations.to_csv(output_tsv, sep='\t', index=False)
+def acceptedToTSV(inputJSONL, outputTSV):
+    """Write the accepted annotations of inputJSONL, sorted by key, to outputTSV."""
+    accepted = [tsv_row(a) for a in inputJSONL if a['answer'] == 'accept']
+    annotations = pandas.DataFrame(sorted(accepted, key=toKey))
+    annotations.to_csv(outputTSV, sep='\t', index=False)
+
+if __name__ == '__main__':
+    acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigyMultiJSONLToDirectory.py
similarity index 95%
rename from scripts/ML/prodigy-multi-jsonl-to-tsv.py
rename to scripts/ML/prodigyMultiJSONLToDirectory.py
index d555e7453a439df1168275ac006366fbec2f2ef8..cf3ccae84f1cf3471ee3978fd0dc54a59cdad64c 100755
--- a/scripts/ML/prodigy-multi-jsonl-to-tsv.py
+++ b/scripts/ML/prodigyMultiJSONLToDirectory.py
@@ -3,19 +3,14 @@
 from Corpus import Directory
 from GEODE import toKey, uid
 import GEODE.discursive as discursive
+from GEODE.util import initialise
 import pandas
 import JSONL
 import sys
 
-binary = ['accept', 'reject']
-
 def subDict(d, keys):
     return {key: d[key] for key in keys}
 
-def initialise(dictionary, label, value):
-    if label not in dictionary:
-        dictionary[label] = value
-
 def initialiseTexts(texts, key, annotation):
     initialise(texts,
                key,
@@ -119,11 +114,13 @@ def exportLabels(rootDirectory, labels):
                    toIterator(map(toRow('accept'), answers['accept']),
                               map(toRow('reject'), allRejects(labels, label))))
 
-if __name__ == '__main__':
-    byLabelAnnotations = byLabel(JSONL.load(sys.argv[1]))
+def multiJSONLToDirectory(jsonl, outputDirectory):
+    byLabelAnnotations = byLabel(jsonl)
     texts, errors = byText(byLabelAnnotations)
-    outputDirectory = sys.argv[2]
     exportCorpus(outputDirectory, texts, errors)
     if len(errors) > 0:
         toTsv(f"{outputDirectory}/errors.tsv", errors)
     exportLabels(outputDirectory, byLabelAnnotations)
+
+if __name__ == '__main__':
+    multiJSONLToDirectory(JSONL.load(sys.argv[1]), sys.argv[2])
diff --git a/scripts/ML/splitMulti.py b/scripts/ML/splitMulti.py
new file mode 100755
index 0000000000000000000000000000000000000000..d5cd2a70d5ea95c1d58e624d054cdfb76df77c3e
--- /dev/null
+++ b/scripts/ML/splitMulti.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+from Corpus import Directory
+from GEODE import toKey
+from GEODE.Error import TwoAnnotations
+from GEODE.util import initialise, parseRatio
+import JSONL
+from random import shuffle
+from sys import argv, stdin
+from prodigyAcceptedJSONLToTSV import acceptedToTSV
+from prodigyMultiJSONLToDirectory import multiJSONLToDirectory
+
+def getTexts(inputJSONL):
+    """Group the annotations of inputJSONL by text.
+
+    Returns a dict mapping each text key to a dict holding its accepted
+    annotation ('accept', None when there is none) and the list of the
+    other ones ('reject'). When a text gets a second accepted annotation
+    the conflict is printed, the first one is kept and all later
+    annotations of that text are skipped.
+    """
+    texts = {}
+    errors = set()
+    for annotation in inputJSONL:
+        key = toKey(annotation['meta'])
+        if key in errors:
+            continue
+        initialise(texts, key, {'accept': None, 'reject': []})
+        if annotation['answer'] != 'accept':
+            texts[key]['reject'].append(annotation)
+            continue
+        previous = texts[key]['accept']
+        if previous is None:
+            texts[key]['accept'] = annotation
+        else:
+            # name the label already kept and the one clashing with it
+            print(TwoAnnotations(annotation['meta'],
+                                 previous['label'],
+                                 annotation['label']))
+            errors.add(key)
+    return texts
+
+def getTest(texts, trainRatio):
+    """Randomly draw the test set among the accepted annotations.
+
+    Returns a dict from text key to accepted annotation holding a share
+    of 1 - trainRatio of the texts that have one.
+    """
+    accepted = [key for key, t in texts.items() if t['accept'] is not None]
+    shuffle(accepted)
+    size = round(len(accepted) * (1 - trainRatio))
+    return {key: texts[key]['accept'] for key in accepted[:size]}
+
+def allAnnotations(text):
+    """List all annotations of a text, the accepted one (if any) first."""
+    if text['accept'] is None:
+        return text['reject']
+    return [text['accept']] + text['reject']
+
+def getTrain(texts, test):
+    """List, in key order, the annotations of every text absent from test."""
+    return [annotation
+            for key in sorted(texts) if key not in test
+            for annotation in allAnnotations(texts[key])]
+
+def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
+    """Split jsonl into a train directory and a test TSV file."""
+    texts = getTexts(jsonl)
+    test = getTest(texts, trainRatio)
+    train = getTrain(texts, test)
+    multiJSONLToDirectory(train, trainOutput)
+    acceptedToTSV(test.values(), testOutput)
+
+if __name__ == '__main__':
+    splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
diff --git a/scripts/ML/splitSimple.py b/scripts/ML/splitSimple.py
new file mode 100755
index 0000000000000000000000000000000000000000..4d0de41bed7f5e0181e786242408851b7518f45a
--- /dev/null
+++ b/scripts/ML/splitSimple.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+from GEODE.util import initialise, parseRatio
+import JSONL
+from prodigyAcceptedJSONLToTSV import acceptedToTSV
+from sys import argv, stdin
+
+def splitSimple(jsonl, trainRatio, trainOutput, testOutput):
+    """Export the first trainRatio share of jsonl as train, the rest as test."""
+    cut = round(len(jsonl) * trainRatio)
+    head, tail = jsonl[:cut], jsonl[cut:]
+    acceptedToTSV(head, trainOutput)
+    acceptedToTSV(tail, testOutput)
+
+if __name__ == '__main__':
+    splitSimple(list(JSONL.load(stdin)), parseRatio(argv[1]), argv[2], argv[3])