Commit 48d0de91 authored by Alice Brenon

Two ways to extract training data from Prodigy's JSONL annotation

parent c5866d55
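
The two extraction scripts below read the same Prodigy export: the first flattens every accepted annotation into a single TSV file, while the second reconciles 'accept' and 'reject' answers across labels before exporting a corpus directory, one TSV per label and an errors.tsv for contradictory texts. Both expect one JSON object per line, shaped roughly as follows (a minimal sketch inferred from the fields the code accesses; the values are illustrative and a real Prodigy export carries additional fields):

    {'text': 'Paragraph contents',
     'label': 'Description',
     'answer': 'accept',
     'meta': {'work': 'someWork', 'volume': 2, 'article': 13, 'paragraph': 1}}
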
# GEODE helpers (imported below as `from GEODE import toKey, uid`): convert text
# metadata between its dict, UID, relative path and tuple key representations.
def article(work, volume, article_):
    return {'work': work, 'volume': int(volume), 'article': int(article_)}

def paragraph(work, volume, article_, paragraph_):
    return {'work': work,
            'volume': int(volume),
            'article': int(article_),
            'paragraph': int(paragraph_)}

def uid(text):
    result = "{work}_{volume}_{article}".format(**text)
    if 'paragraph' in text:
        result = f"{result}_{text['paragraph']}"
    return result

def fromUID(uid_):
    components = uid_.split('_')
    if len(components) == 3:
        return article(*components)
    elif len(components) == 4:
        return paragraph(*components)
    else:
        print(f"'{uid_}' doesn't represent a valid text UID")

def relativePath(text, extension):
    result = "{work}/T{volume}/{article}".format(**text)
    if 'paragraph' in text:
        result = f"{result}/{text['paragraph']}"
    return f"{result}.{extension}"

def toKey(text):
    result = (text['work'], text['volume'], text['article'])
    if 'paragraph' in text:
        result = result + (text['paragraph'],)
    return result

def fromKey(key):
    if len(key) == 3:
        return article(*key)
    elif len(key) == 4:
        return paragraph(*key)
    else:
        print(f"{key} isn't a valid text key")
# Discursive functions used as annotation labels (referenced below as
# discursive.functions).
functions = {'Historical narrative',
             'People narrative',
             'Critical',
             'Description',
             'Arts',
             'Example',
             'Reasoning',
             'Quotation',
             'Prescriptive'}

# JSONL helper (imported below as `import JSONL`): stream annotations from a
# JSON Lines file, one parsed object per line.
import json

def load(file_path):
    with open(file_path, 'r') as input_file:
        for line in input_file.readlines():
            yield json.loads(line)

#!/usr/bin/env python3
# First way: flatten every accepted annotation into a single TSV file.
import pandas
import JSONL
import sys

def tsv_row(annotation):
    return {'work': annotation['meta']['work'],
            'volume': annotation['meta']['volume'],
            'article': annotation['meta']['article'],
            'paragraph': annotation['meta']['paragraph'],
            'content': annotation['text'].strip(),
            'paragraphFunction': annotation['label']}

if __name__ == '__main__':
    input_jsonl = sys.argv[1]
    output_tsv = sys.argv[2]
    # Keep only the annotations that were accepted in Prodigy.
    annotations = pandas.DataFrame(
        [tsv_row(a) for a in JSONL.load(input_jsonl)
         if a['answer'] == 'accept'])
    annotations.to_csv(output_tsv, sep='\t', index=False)
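
This first script takes the Prodigy export and the target file as its two positional arguments (for instance python3 <script> annotations.jsonl functions.tsv, where both file names are placeholders) and produces a TSV with the columns work, volume, article, paragraph, content and paragraphFunction, one row per accepted annotation.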

#!/usr/bin/env python3
# Second way: reconcile accept/reject answers per text, then export a corpus
# directory, one TSV per label and an errors.tsv for contradictory texts.
from Corpus import Directory
from GEODE import toKey, uid
import GEODE.discursive as discursive
import pandas
import JSONL
import sys

binary = ['accept', 'reject']

def subDict(d, keys):
    return {key: d[key] for key in keys}

def initialise(dictionary, label, value):
    if label not in dictionary:
        dictionary[label] = value

def initialiseTexts(texts, key, annotation):
    initialise(texts,
               key,
               {'accept': None,
                'reject': set(),
                'row': dict(**annotation['meta'],
                            content=annotation['text'])})

def byLabel(annotations):
    labels = {}
    for annotation in annotations:
        label = annotation['label']
        answer = annotation['answer']
        initialise(labels, label, {'accept': [], 'reject': [], 'ignore': []})
        if answer not in labels[label]:
            print(f"Unsupported answer '{answer}' for annotation {annotation}")
        else:
            labels[label][answer].append(annotation)
    return labels

def erase(texts, error, key, reason):
    error[key] = texts[key]['row']
    del texts[key]
    print(reason)

def accept(texts, errors, label, accepted):
    for annotation in accepted:
        key = toKey(annotation['meta'])
        if key not in errors:
            initialiseTexts(texts, key, annotation)
            previous = texts[key]['accept']
            if previous is not None:
                reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
                erase(texts, errors, key, reason)
            else:
                texts[key]['accept'] = label

def reject(texts, errors, label, rejected):
    for annotation in rejected:
        key = toKey(annotation['meta'])
        if key not in errors:
            initialiseTexts(texts, key, annotation)
            previous = texts[key]['accept']
            if previous is not None and previous == label:
                erase(texts, errors, key,
                      f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected")
            else:
                texts[key]['reject'].add(label)

def checkRejects(texts, errors):
    # Iterate over a snapshot since erase() deletes entries from texts.
    for key, text in list(texts.items()):
        countRejected = len(text['reject'])
        countFunctions = len(discursive.functions)
        if countRejected == countFunctions:
            reason = f"No possible function left for {uid(text['row'])}"
            erase(texts, errors, key, reason)
        elif text['accept'] is None and countRejected == countFunctions - 1:
            text['accept'] = discursive.functions.difference(text['reject']).pop()
            print(f"Inferred {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")

def byText(byLabelAnnotations):
    texts = {}
    errors = {}
    for label, answers in byLabelAnnotations.items():
        accept(texts, errors, label, answers['accept'])
        reject(texts, errors, label, answers['reject'])
    checkRejects(texts, errors)
    return texts.values(), errors.values()

def toTsv(filePath, data):
    rows = sorted(data, key=toKey)
    pandas.DataFrame(rows).to_csv(filePath, sep='\t', index=False)

def toIterator(*args):
    for arg in args:
        for elem in arg:
            yield elem

def exportCorpus(rootDirectory, texts, errors):
    corpus = Directory(rootDirectory)
    corpus.save(sorted(toIterator([t['row'] for t in texts], errors),
                       key=toKey))

def indexByKey(annotations):
    return {toKey(annotation['meta']): annotation for annotation in annotations}

def allRejects(labels, label):
    result = indexByKey(labels[label]['reject'])
    for other, answers in labels.items():
        if other != label:
            for key, annotation in indexByKey(answers['accept']).items():
                if key not in result:
                    result[key] = annotation
    return result.values()

def toRow(answer):
    return lambda annotation: dict(**annotation['meta'], answer=answer)

def exportLabels(rootDirectory, labels):
    for label, answers in labels.items():
        toTsv(f"{rootDirectory}/{label}.tsv",
              toIterator(map(toRow('accept'), answers['accept']),
                         map(toRow('reject'), allRejects(labels, label))))

if __name__ == '__main__':
    byLabelAnnotations = byLabel(JSONL.load(sys.argv[1]))
    texts, errors = byText(byLabelAnnotations)
    outputDirectory = sys.argv[2]
    exportCorpus(outputDirectory, texts, errors)
    if len(errors) > 0:
        toTsv(f"{outputDirectory}/errors.tsv", errors)
    exportLabels(outputDirectory, byLabelAnnotations)
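
The second script takes the same two positional arguments (the Prodigy export, then an output directory). It reconciles the answers per text: a text accepted under two different functions, or both accepted and rejected for the same one, or rejected for every function, is flagged as erroneous; a text rejected for all functions but one gets that remaining function inferred. All texts, including the erroneous ones, are saved through Corpus.Directory; erroneous texts are additionally listed in errors.tsv, and one <label>.tsv is written per discursive function, listing the texts accepted for it together with those rejected for it (including texts accepted under another function).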