Skip to content
Snippets Groups Projects
prodigyMultiJSONLToDirectory.py 4.53 KiB
Newer Older
#!/usr/bin/env python3

from Corpus import Directory
from GEODE import toKey, uid
import GEODE.discursive as discursive
from GEODE.util import initialise
import pandas
import JSONL
import sys

def subDict(d, keys):
    """Return a new dict holding only the entries of d listed in keys.

    Entries appear in the order keys yields them. A key missing from d
    raises KeyError.
    """
    selection = {}
    for key in keys:
        selection[key] = d[key]
    return selection

def initialiseTexts(texts, key, annotation):
    """Hand a blank entry for the annotated text to initialise() under key.

    The blank entry has no accepted label ('accept' is None), an empty set of
    rejected labels and a 'row' made of the annotation's 'meta' fields plus a
    'content' field: the annotation's text, stripped and terminated by a
    newline.

    NOTE(review): presumably GEODE.util.initialise leaves an entry that
    already exists untouched (accept() and reject() read the stored 'accept'
    right after calling this) — confirm against that helper.
    """
    meta = annotation['meta']
    row = dict(**meta, content=annotation['text'].strip() + '\n')
    blank = {'accept': None, 'reject': set(), 'row': row}
    initialise(texts, key, blank)

def byLabel(annotations):
    """Group annotations by their 'label', then by their 'answer'.

    Each label is set up through initialise() with empty 'accept', 'reject'
    and 'ignore' lists; an annotation whose answer is not one of the keys
    stored for its label is reported on standard output and left out.

    Returns the dict of labels built this way.
    """
    grouped = {}
    for annotation in annotations:
        label, answer = annotation['label'], annotation['answer']
        initialise(grouped, label, {'accept': [], 'reject': [], 'ignore': []})
        if answer in grouped[label]:
            grouped[label][answer].append(annotation)
        else:
            print(f"Unsupported answer '{answer}' for annotation {annotation}")
    return grouped

def erase(texts, error, key, reason):
    """Move the text stored under key out of texts and into error.

    Only the 'row' of the entry is kept in error; the entry itself is removed
    from texts and reason is printed on standard output.
    """
    discarded = texts[key]
    error[key] = discarded['row']
    texts.pop(key)
    print(reason)

def accept(texts, errors, label, accepted):
    """Record label as the accepted label of every text in accepted.

    Texts already listed in errors are skipped. A text which already has an
    accepted label is erased from texts and moved to errors, since a second
    accepted annotation was found for it.
    """
    for annotation in accepted:
        meta = annotation['meta']
        key = toKey(meta)
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        entry = texts[key]
        previous = entry['accept']
        if previous is None:
            entry['accept'] = label
        else:
            reason = f"Found two annotations for {uid(meta)}: '{label}' and '{previous}'"
            erase(texts, errors, key, reason)

def reject(texts, errors, label, rejected):
    """Record label as a rejected label of every text in rejected.

    Texts already listed in errors are skipped. A text whose accepted label
    is this very label is contradictory: it is erased from texts and moved to
    errors.
    """
    for annotation in rejected:
        meta = annotation['meta']
        key = toKey(meta)
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        entry = texts[key]
        previous = entry['accept']
        contradiction = previous is not None and previous == label
        if not contradiction:
            entry['reject'].add(label)
        else:
            erase(texts, errors, key, f"Contradiction found for {uid(meta)}: function {label} should be both accepted and rejected")

def checkRejects(texts, errors):
    """Settle every text according to how many labels were rejected for it.

    - A text with as many rejected labels as there are entries in
      discursive.functions has no possible function left: it is erased from
      texts and moved to errors.
    - A text with no accepted label and exactly one fewer rejected label than
      there are functions gets the remaining unrejected function as its
      accepted label.

    Both outcomes are reported on standard output.
    """
    # Loop-invariant: nothing below modifies discursive.functions.
    countFunctions = len(discursive.functions)
    # Iterate over a snapshot: erase() deletes entries from texts, and
    # deleting from a dict while iterating over its live view raises
    # "RuntimeError: dictionary changed size during iteration".
    for key, text in list(texts.items()):
        countRejected = len(text['reject'])
        if countRejected == countFunctions:
            reason = f"No possible function left for {uid(text['row'])}"
            erase(texts, errors, key, reason)
        elif text['accept'] is None and countRejected == countFunctions - 1:
            text['accept'] = discursive.functions.difference(text['reject']).pop()
            print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")

def byText(byLabelAnnotations):
    """Turn annotations grouped by label into per-text entries.

    For each label its accepted annotations are processed before its rejected
    ones, then the reject counts are checked once all labels have been seen.

    Returns a pair of dict views: the entries of the texts that were kept and
    the rows of the texts that were discarded as errors.
    """
    texts, errors = {}, {}
    for label, answers in byLabelAnnotations.items():
        acceptedAnnotations = answers['accept']
        accept(texts, errors, label, acceptedAnnotations)
        rejectedAnnotations = answers['reject']
        reject(texts, errors, label, rejectedAnnotations)
    checkRejects(texts, errors)
    kept = texts.values()
    discarded = errors.values()
    return kept, discarded

def toTsv(filePath, data):
    """Write the rows of data, sorted with toKey, to filePath.

    The file is tab-separated and written without the index column.
    """
    table = pandas.DataFrame(sorted(data, key=toKey))
    table.to_csv(filePath, sep='\t', index=False)

def toIterator(*args):
    """Lazily yield the elements of each iterable in args, one after another."""
    for iterable in args:
        yield from iterable

def exportCorpus(rootDirectory, texts, errors):
    """Save the rows of the kept texts together with the error rows.

    The rows are gathered in a single list, sorted with toKey and handed to
    the save method of a Directory built on rootDirectory.
    """
    corpus = Directory(rootDirectory)
    rows = [text['row'] for text in texts]
    rows.extend(errors)
    rows.sort(key=toKey)
    corpus.save(rows)

def indexByKey(annotations):
    """Index annotations by the key toKey computes from their 'meta'.

    When several annotations share a key, the last one wins.
    """
    index = {}
    for annotation in annotations:
        index[toKey(annotation['meta'])] = annotation
    return index

def allRejects(labels, label):
    """Collect every annotation that counts as a rejection of label.

    These are the annotations explicitly rejected for label, completed by the
    annotations accepted for any other label whose key is not already
    present. An explicit rejection therefore takes precedence, and among
    other labels the first one encountered wins.

    Returns a view on the collected annotations (one per key).
    """
    rejected = indexByKey(labels[label]['reject'])
    for other, answers in labels.items():
        if other == label:
            continue
        acceptedElsewhere = indexByKey(answers['accept'])
        for key, annotation in acceptedElsewhere.items():
            # setdefault keeps whatever annotation is already stored under key
            rejected.setdefault(key, annotation)
    return rejected.values()

def toRow(answer):
    """Build a function turning an annotation into a row for that answer.

    The returned function copies the annotation's 'meta' fields into a new
    dict and adds the given answer under 'answer'.
    """
    def row(annotation):
        return dict(**annotation['meta'], answer=answer)
    return row

def exportLabels(rootDirectory, labels):
    """Write one TSV file per label under rootDirectory.

    Each file, named after its label, lists the annotations accepted for that
    label with answer 'accept', followed by everything allRejects() returns
    for it with answer 'reject' (toTsv sorts the rows before writing).
    """
    for label, answers in labels.items():
        filePath = f"{rootDirectory}/{label}.tsv"
        acceptedRows = map(toRow('accept'), answers['accept'])
        rejectedRows = map(toRow('reject'), allRejects(labels, label))
        toTsv(filePath, toIterator(acceptedRows, rejectedRows))

def multiJSONLToDirectory(jsonl, outputDirectory):
    """Export a set of annotations to outputDirectory.

    The annotations are grouped by label, resolved text by text, then written
    as a corpus. An errors.tsv file is written only when some texts were
    discarded, and one TSV file per label is written last.
    """
    labels = byLabel(jsonl)
    texts, errors = byText(labels)
    exportCorpus(outputDirectory, texts, errors)
    hasErrors = len(errors) > 0
    if hasErrors:
        errorsPath = f"{outputDirectory}/errors.tsv"
        toTsv(errorsPath, errors)
    exportLabels(outputDirectory, labels)

if __name__ == '__main__':
    # Two positional arguments are required: the annotations file handed to
    # JSONL.load and the output directory. Fail with a usage message rather
    # than a bare IndexError when one is missing; extra arguments are still
    # ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} ANNOTATIONS_JSONL OUTPUT_DIRECTORY")
    multiJSONLToDirectory(JSONL.load(sys.argv[1]), sys.argv[2])