From 48d0de91d55c948113efdc212fc6ac232294ebf3 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 17 Nov 2023 08:25:31 +0100
Subject: [PATCH] Two ways to extract training data from Prodigy's JSONL
 annotation

---
 scripts/ML/GEODE/__init__.py                |  43 +++++++
 scripts/ML/GEODE/discursive.py              |   9 ++
 scripts/ML/JSONL.py                         |   6 +
 scripts/ML/prodigy-accepted-jsonl-to-tsv.py |  23 ++++
 scripts/ML/prodigy-multi-jsonl-to-tsv.py    | 129 ++++++++++++++++++++
 5 files changed, 210 insertions(+)
 create mode 100644 scripts/ML/GEODE/__init__.py
 create mode 100644 scripts/ML/GEODE/discursive.py
 create mode 100644 scripts/ML/JSONL.py
 create mode 100755 scripts/ML/prodigy-accepted-jsonl-to-tsv.py
 create mode 100755 scripts/ML/prodigy-multi-jsonl-to-tsv.py

diff --git a/scripts/ML/GEODE/__init__.py b/scripts/ML/GEODE/__init__.py
new file mode 100644
index 0000000..2b8ddef
--- /dev/null
+++ b/scripts/ML/GEODE/__init__.py
@@ -0,0 +1,43 @@
+def article(work, volume, article_):
+    return {'work': work, 'volume': int(volume), 'article': int(article_)}
+
+def paragraph(work, volume, article_, paragraph_):
+    return {'work': work,
+            'volume': int(volume),
+            'article': int(article_),
+            'paragraph': int(paragraph_)}
+
+def uid(text):
+    result = "{work}_{volume}_{article}".format(**text)
+    if 'paragraph' in text:
+        result = f"{result}_{text['paragraph']}"
+    return result
+
+def fromUID(uid_):
+    components = uid_.split('_')
+    if len(components) == 3:
+        return article(*components)
+    elif len(components) == 4:
+        return paragraph(*components)
+    else:
+        print(f"'{uid_}' doesn't represent a valid text UID")
+
+def relativePath(text, extension):
+    result = "{work}/T{volume}/{article}".format(**text)
+    if 'paragraph' in text:
+        result = f"{result}/{text['paragraph']}"
+    return f"{result}.{extension}"
+
+def toKey(text):
+    result = (text['work'], text['volume'], text['article'])
+    if 'paragraph' in text:
+        result = result + (text['paragraph'],)
+    return result
+
+def fromKey(key):
+    if len(key) == 3:
+        return article(*key)
+    elif len(key) == 4:
+        return paragraph(*key)
+    else:
+        print(f"{key} isn't a valid text key")
diff --git a/scripts/ML/GEODE/discursive.py b/scripts/ML/GEODE/discursive.py
new file mode 100644
index 0000000..60a958e
--- /dev/null
+++ b/scripts/ML/GEODE/discursive.py
@@ -0,0 +1,9 @@
+functions = {'Historical narrative',
+             'People narrative',
+             'Critical',
+             'Description',
+             'Arts',
+             'Example',
+             'Reasoning',
+             'Quotation',
+             'Prescriptive'}
diff --git a/scripts/ML/JSONL.py b/scripts/ML/JSONL.py
new file mode 100644
index 0000000..d191ee5
--- /dev/null
+++ b/scripts/ML/JSONL.py
@@ -0,0 +1,6 @@
+import json
+
+def load(file_path):
+    with open(file_path, 'r') as input_file:
+        for line in input_file.readlines():
+            yield json.loads(line)
diff --git a/scripts/ML/prodigy-accepted-jsonl-to-tsv.py b/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
new file mode 100755
index 0000000..50fe598
--- /dev/null
+++ b/scripts/ML/prodigy-accepted-jsonl-to-tsv.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+import pandas
+import JSONL
+import sys
+
+def tsv_row(annotation):
+    return {'work': annotation['meta']['work'],
+            'volume': annotation['meta']['volume'],
+            'article': annotation['meta']['article'],
+            'paragraph': annotation['meta']['paragraph'],
+            'content': annotation['text'].strip(),
+            'paragraphFunction': annotation['label']
+            }
+
+if __name__ == '__main__':
+    input_jsonl = sys.argv[1]
+    output_tsv = sys.argv[2]
+    annotations = pandas.DataFrame(
+        [tsv_row(a) for a in JSONL.load(input_jsonl)
+         if a['answer'] == 'accept']
+    )
+    annotations.to_csv(output_tsv, sep='\t', index=False)
diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
new file mode 100755
index 0000000..c4b3c17
--- /dev/null
+++ b/scripts/ML/prodigy-multi-jsonl-to-tsv.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+from Corpus import Directory
+from GEODE import toKey, uid
+import GEODE.discursive as discursive
+import pandas
+import JSONL
+import sys
+
+binary = ['accept', 'reject']
+
+def subDict(d, keys):
+    return {key: d[key] for key in keys}
+
+def initialise(dictionary, label, value):
+    if label not in dictionary:
+        dictionary[label] = value
+
+def initialiseTexts(texts, key, annotation):
+    initialise(texts,
+               key,
+               {'accept': None,
+                'reject': set(),
+                'row': dict(**annotation['meta'],
+                            content=annotation['text'])})
+
+def byLabel(annotations):
+    labels = {}
+    for annotation in annotations:
+        label = annotation['label']
+        answer = annotation['answer']
+        initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
+        if answer not in labels[label]:
+            print(f"Unsupported answer '{answer}' for annotation {annotation}")
+        else:
+            labels[label][answer].append(annotation)
+    return labels
+
+def erase(texts, error, key, reason):
+    error[key] = texts[key]['row']
+    del texts[key]
+    print(reason)
+
+def accept(texts, errors, label, accepted):
+    for annotation in accepted:
+        key = toKey(annotation['meta'])
+        if key not in errors:
+            initialiseTexts(texts, key, annotation)
+            previous = texts[key]['accept']
+            if previous is not None:
+                reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
+                erase(texts, errors, key, reason)
+            else:
+                texts[key]['accept'] = label
+
+def reject(texts, errors, label, rejected):
+    for annotation in rejected:
+        key = toKey(annotation['meta'])
+        if key not in errors:
+            initialiseTexts(texts, key, annotation)
+            previous = texts[key]['accept']
+            if previous is not None and previous == label:
+                erase(texts, errors, key, f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected")
+            else:
+                texts[key]['reject'].add(label)
+
+def checkRejects(texts, errors):
+    for key, text in texts.items():
+        countRejected = len(text['reject'])
+        countFunctions = len(discursive.functions)
+        if countRejected == countFunctions:
+            reason = f"No possible function left for {uid(text['row'])}"
+            erase(texts, errors, key, reason)
+        elif text['accept'] is None and countRejected == countFunctions - 1:
+            text['accept'] = discursive.functions.difference(text['reject']).pop()
+            print(f"Inferred {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")
+
+def byText(byLabelAnnotations):
+    texts = {}
+    errors = {}
+    for label, answers in byLabelAnnotations.items():
+        accept(texts, errors, label, answers['accept'])
+        reject(texts, errors, label, answers['reject'])
+    checkRejects(texts, errors)
+    return texts.values(), errors.values()
+
+def toTsv(filePath, data):
+    rows = sorted(data, key=toKey)
+    pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False)
+
+def toIterator(*args):
+    for arg in args:
+        for elem in arg:
+            yield elem
+
+def exportCorpus(rootDirectory, texts, errors):
+    corpus = Directory(rootDirectory)
+    corpus.save(sorted(toIterator([t['row'] for t in texts], errors),
+                       key=toKey))
+
+def indexByKey(annotations):
+    return {toKey(annotation['meta']): annotation for annotation in annotations}
+
+def allRejects(labels, label):
+    result = indexByKey(labels[label]['reject'])
+    for other, answers in labels.items():
+        if other != label:
+            for key, annotation in indexByKey(answers['accept']).items():
+                if key not in result:
+                    result[key] = annotation
+    return result.values()
+
+def toRow(answer):
+    return lambda annotation: dict(**annotation['meta'], answer=answer)
+
+def exportLabels(rootDirectory, labels):
+    for label, answers in labels.items():
+        toTsv(f"{rootDirectory}/{label}.tsv",
+              toIterator(map(toRow('accept'), answers['accept']),
+                         map(toRow('reject'), allRejects(labels, label))))
+
+if __name__ == '__main__':
+    byLabelAnnotations = byLabel(JSONL.load(sys.argv[1]))
+    texts, errors = byText(byLabelAnnotations)
+    outputDirectory = sys.argv[2]
+    exportCorpus(outputDirectory, texts, errors)
+    if len(errors) > 0:
+        toTsv(f"{outputDirectory}/errors.tsv", errors)
+    exportLabels(outputDirectory, byLabelAnnotations)
--
GitLab
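
For reference, both conversion scripts expect each JSONL line to be a Prodigy annotation whose 'meta' block carries 'work', 'volume', 'article' and 'paragraph' fields, alongside top-level 'text', 'label' and 'answer' keys. The snippet below is a minimal sketch of the helpers added in scripts/ML/GEODE/__init__.py; the descriptor values are invented for illustration, and it assumes it is run from scripts/ML so that the GEODE package is importable.

from GEODE import paragraph, uid, fromUID, toKey, relativePath

# A hypothetical paragraph descriptor, shaped like an annotation's 'meta' block.
text = paragraph('LGE', '3', '1204', '7')

print(uid(text))                  # LGE_3_1204_7
print(toKey(text))                # ('LGE', 3, 1204, 7)
print(relativePath(text, 'txt'))  # LGE/T3/1204/7.txt
print(fromUID('LGE_3_1204_7'))    # round-trips back to the same dictionary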
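
In the same spirit, a sketch of how the two scripts might be driven end to end; annotations.jsonl and the output paths are placeholders, and the multi-label script additionally relies on the Corpus module it imports, which is not part of this patch.

import json

# One hypothetical Prodigy annotation carrying the keys the scripts read;
# 'Description' is one of the functions declared in GEODE/discursive.py.
annotation = {'text': 'An example paragraph.',
              'meta': {'work': 'LGE', 'volume': 3,
                       'article': 1204, 'paragraph': 7},
              'label': 'Description',
              'answer': 'accept'}

with open('annotations.jsonl', 'w') as output:
    output.write(json.dumps(annotation) + '\n')

# From scripts/ML, the accepted-only export writes a single TSV, while the
# multi-label export writes a corpus directory, one TSV per label and, if
# needed, an errors.tsv:
#   ./prodigy-accepted-jsonl-to-tsv.py annotations.jsonl accepted.tsv
#   ./prodigy-multi-jsonl-to-tsv.py annotations.jsonl output-directory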