From 48d0de91d55c948113efdc212fc6ac232294ebf3 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 17 Nov 2023 08:25:31 +0100
Subject: [PATCH] Two ways to extract training data from Prodigy's JSONL
 annotation

---
 scripts/ML/GEODE/__init__.py                |  43 +++++++
 scripts/ML/GEODE/discursive.py              |   9 ++
 scripts/ML/JSONL.py                         |   6 +
 scripts/ML/prodigy-accepted-jsonl-to-tsv.py |  23 ++++
 scripts/ML/prodigy-multi-jsonl-to-tsv.py    | 129 ++++++++++++++++++++
 5 files changed, 210 insertions(+)
 create mode 100644 scripts/ML/GEODE/__init__.py
 create mode 100644 scripts/ML/GEODE/discursive.py
 create mode 100644 scripts/ML/JSONL.py
 create mode 100755 scripts/ML/prodigy-accepted-jsonl-to-tsv.py
 create mode 100755 scripts/ML/prodigy-multi-jsonl-to-tsv.py

diff --git a/scripts/ML/GEODE/__init__.py b/scripts/ML/GEODE/__init__.py
new file mode 100644
index 0000000..2b8ddef
--- /dev/null
+++ b/scripts/ML/GEODE/__init__.py
@@ -0,0 +1,43 @@
def article(work, volume, article_):
    """Build the identifier dict of an article; volume and article numbers
    are coerced to int (inputs may be strings, e.g. from a split UID)."""
    return dict(work=work, volume=int(volume), article=int(article_))
+
def paragraph(work, volume, article_, paragraph_):
    """Build the identifier dict of a paragraph within an article; all
    numeric components are coerced to int."""
    return dict(work=work,
                volume=int(volume),
                article=int(article_),
                paragraph=int(paragraph_))
+
def uid(text):
    """Serialize a text identifier dict into its '_'-separated UID string
    (work_volume_article, plus the paragraph number when present)."""
    parts = [text['work'], text['volume'], text['article']]
    if 'paragraph' in text:
        parts.append(text['paragraph'])
    return '_'.join(str(part) for part in parts)
+
def fromUID(uid_):
    """Parse a '_'-separated UID string back into an identifier dict.

    Returns an article dict for 3 components, a paragraph dict for 4;
    otherwise prints a warning and returns None.
    """
    components = uid_.split('_')
    if len(components) == 3:
        return article(*components)
    elif len(components) == 4:
        return paragraph(*components)
    else:
        # Bug fix: the message previously interpolated `uid` (the function
        # object defined above) instead of the input string `uid_`.
        print(f"'{uid_}' doesn't represent a valid text UID")
+
def relativePath(text, extension):
    """Map a text identifier dict to its relative file path
    (work/Tvolume/article[/paragraph].extension)."""
    path = f"{text['work']}/T{text['volume']}/{text['article']}"
    if 'paragraph' in text:
        path += f"/{text['paragraph']}"
    return f"{path}.{extension}"
+
def toKey(text):
    """Project a text identifier dict onto a hashable, sortable tuple key
    (3 components for an article, 4 for a paragraph)."""
    if 'paragraph' in text:
        return (text['work'], text['volume'], text['article'],
                text['paragraph'])
    return (text['work'], text['volume'], text['article'])
+
def fromKey(key):
    """Inverse of toKey: rebuild an identifier dict from a 3- or 4-tuple.

    Prints a warning and returns None for any other tuple length."""
    size = len(key)
    if size == 3:
        return article(*key)
    if size == 4:
        return paragraph(*key)
    print(f"{key} isn't a valid text key")
diff --git a/scripts/ML/GEODE/discursive.py b/scripts/ML/GEODE/discursive.py
new file mode 100644
index 0000000..60a958e
--- /dev/null
+++ b/scripts/ML/GEODE/discursive.py
@@ -0,0 +1,9 @@
# The closed set of discursive-function labels used for annotation.
# Kept as a set: callers test membership and take set differences against
# per-text reject sets (e.g. to infer the one remaining unrejected label).
functions = {'Historical narrative',
             'People narrative',
             'Critical',
             'Description',
             'Arts',
             'Example',
             'Reasoning',
             'Quotation',
             'Prescriptive'}
diff --git a/scripts/ML/JSONL.py b/scripts/ML/JSONL.py
new file mode 100644
index 0000000..d191ee5
--- /dev/null
+++ b/scripts/ML/JSONL.py
@@ -0,0 +1,6 @@
+import json
+
def load(file_path):
    """Lazily parse a JSON Lines file, yielding one decoded object per line.

    Iterates the file object directly instead of materializing every line
    with readlines(), so arbitrarily large files stream in constant memory.
    Opens in UTF-8 explicitly (JSON's mandated encoding) rather than the
    platform's locale default.  The file is closed when the generator is
    exhausted or garbage-collected.
    """
    with open(file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            yield json.loads(line)
diff --git a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py b/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
new file mode 100755
index 0000000..50fe598
--- /dev/null
+++ b/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+import pandas
+import JSONL
+import sys
+
def tsv_row(annotation):
    """Flatten a Prodigy annotation into a flat dict suitable for one TSV
    row: its metadata, the stripped text content, and its label."""
    meta = annotation['meta']
    row = {field: meta[field]
           for field in ('work', 'volume', 'article', 'paragraph')}
    row['content'] = annotation['text'].strip()
    row['paragraphFunction'] = annotation['label']
    return row
+
if __name__ == '__main__':
    # usage: prodigy-accepted-jsonl-to-tsv.py <input.jsonl> <output.tsv>
    input_jsonl, output_tsv = sys.argv[1], sys.argv[2]
    # keep only annotations the annotator explicitly accepted
    accepted = [tsv_row(annotation)
                for annotation in JSONL.load(input_jsonl)
                if annotation['answer'] == 'accept']
    pandas.DataFrame(accepted).to_csv(output_tsv, sep='\t', index=False)
diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
new file mode 100755
index 0000000..c4b3c17
--- /dev/null
+++ b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+from Corpus import Directory
+from GEODE import toKey, uid
+import GEODE.discursive as discursive
+import pandas
+import JSONL
+import sys
+
# The two definitive Prodigy answers (as opposed to 'ignore').
# NOTE(review): this constant is not referenced anywhere in this file —
# confirm it is used elsewhere before removing.
binary = ['accept', 'reject']
+
def subDict(d, keys):
    """Return a copy of d restricted to the given keys, in the order of
    `keys` (raises KeyError if any key is absent from d)."""
    restricted = {}
    for key in keys:
        restricted[key] = d[key]
    return restricted
+
def initialise(dictionary, label, value):
    """Insert value under label only if label is not already present
    (existing entries are never overwritten)."""
    dictionary.setdefault(label, value)
+
def initialiseTexts(texts, key, annotation):
    """Ensure texts[key] exists, seeding it with a fresh record: no accepted
    label yet, an empty reject set, and a row built from the annotation's
    metadata plus its text content."""
    texts.setdefault(key,
                     {'accept': None,
                      'reject': set(),
                      'row': dict(annotation['meta'],
                                  content=annotation['text'])})
+
def byLabel(annotations):
    """Group a stream of Prodigy annotations first by label, then by answer
    ('accept' / 'reject' / 'ignore'); any other answer is reported and
    dropped."""
    labels = {}
    for annotation in annotations:
        label, answer = annotation['label'], annotation['answer']
        initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
        bucket = labels[label]
        if answer in bucket:
            bucket[answer].append(annotation)
        else:
            print(f"Unsupported answer '{answer}' for annotation {annotation}")
    return labels
+
def erase(texts, error, key, reason):
    """Move the row stored for key from texts into error (dropping the rest
    of its record) and print the reason for discarding it."""
    error[key] = texts.pop(key)['row']
    print(reason)
+
def accept(texts, errors, label, accepted):
    """Mark each accepted annotation's text as having function `label`.

    A text already carrying an accepted label (even the same one, annotated
    twice) is considered contradictory and moved to errors.  Texts already
    in errors are skipped."""
    for annotation in accepted:
        key = toKey(annotation['meta'])
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        if previous is None:
            texts[key]['accept'] = label
        else:
            reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
            erase(texts, errors, key, reason)
+
def reject(texts, errors, label, rejected):
    """Record label as rejected for each annotated text.

    A text rejecting the very label it also accepted is contradictory and
    moved to errors.  Texts already in errors are skipped."""
    for annotation in rejected:
        key = toKey(annotation['meta'])
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        if previous is not None and previous == label:
            reason = f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected"
            erase(texts, errors, key, reason)
        else:
            texts[key]['reject'].add(label)
+
def checkRejects(texts, errors):
    """Post-process reject sets: a text rejecting every discursive function
    is moved to errors; a text with no accepted label but all functions but
    one rejected is inferred to accept the remaining one.

    Bug fix: erase() deletes entries from texts, so iterating texts.items()
    directly raised 'RuntimeError: dictionary changed size during
    iteration' as soon as a text was erased — iterate over a snapshot.
    """
    countFunctions = len(discursive.functions)
    for key, text in list(texts.items()):
        countRejected = len(text['reject'])
        if countRejected == countFunctions:
            reason = f"No possible function left for {uid(text['row'])}"
            erase(texts, errors, key, reason)
        elif text['accept'] is None and countRejected == countFunctions - 1:
            # exactly one label left unrejected: it must be the right one
            text['accept'] = discursive.functions.difference(text['reject']).pop()
            print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")
+
def byText(byLabelAnnotations):
    """Pivot the by-label annotation groups into per-text records.

    Returns a pair of dict value views: the per-text records kept, and the
    rows discarded as contradictory (see accept / reject / checkRejects)."""
    texts, errors = {}, {}
    for label, answers in byLabelAnnotations.items():
        accept(texts, errors, label, answers['accept'])
        reject(texts, errors, label, answers['reject'])
    checkRejects(texts, errors)
    return texts.values(), errors.values()
+
def toTsv(filePath, data):
    """Sort the rows by their text key and write them out as a
    tab-separated file without the index column."""
    frame = pandas.DataFrame(sorted(data, key=toKey))
    frame.to_csv(filePath, sep='\t', index=False)
+
def toIterator(*args):
    """Lazily chain several iterables into a single one (like
    itertools.chain)."""
    for iterable in args:
        yield from iterable
+
def exportCorpus(rootDirectory, texts, errors):
    """Save every row — kept texts and discarded error rows alike — into a
    Directory corpus rooted at rootDirectory, sorted by text key."""
    rows = toIterator([text['row'] for text in texts], errors)
    Directory(rootDirectory).save(sorted(rows, key=toKey))
+
def indexByKey(annotations):
    """Index annotations by the key of their metadata; on duplicate keys the
    last annotation wins."""
    index = {}
    for annotation in annotations:
        index[toKey(annotation['meta'])] = annotation
    return index
+
def allRejects(labels, label):
    """Collect every annotation counting as a reject for label: its explicit
    rejects, plus the accepts of every other label (a text has a single
    function, so accepting another label implies rejecting this one).
    Explicit rejects take precedence on key collisions."""
    result = indexByKey(labels[label]['reject'])
    for other, answers in labels.items():
        if other == label:
            continue
        for key, annotation in indexByKey(answers['accept']).items():
            result.setdefault(key, annotation)
    return result.values()
+
def toRow(answer):
    """Return a function that flattens an annotation into a row dict: its
    metadata fields plus the given answer."""
    def build(annotation):
        return dict(annotation['meta'], answer=answer)
    return build
+
def exportLabels(rootDirectory, labels):
    """Write one TSV per label under rootDirectory, pairing its explicit
    accepts with everything counting as a reject (see allRejects)."""
    for label, answers in labels.items():
        accepts = map(toRow('accept'), answers['accept'])
        rejects = map(toRow('reject'), allRejects(labels, label))
        toTsv(f"{rootDirectory}/{label}.tsv", toIterator(accepts, rejects))
+
if __name__ == '__main__':
    # usage: prodigy-multi-jsonl-to-tsv.py <annotations.jsonl> <outputDirectory>
    byLabelAnnotations = byLabel(JSONL.load(sys.argv[1]))
    texts, errors = byText(byLabelAnnotations)
    outputDirectory = sys.argv[2]
    exportCorpus(outputDirectory, texts, errors)
    if errors:
        # keep a trace of the contradictory texts that were discarded
        toTsv(f"{outputDirectory}/errors.tsv", errors)
    exportLabels(outputDirectory, byLabelAnnotations)
-- 
GitLab