#!/usr/bin/env python3
from Corpus import Directory
from GEODE import toKey, uid
import GEODE.discursive as discursive
from GEODE.util import initialise
import pandas
import JSONL
import sys
def subDict(d, keys):
    """Return a new dict restricted to `keys`, in the order `keys` yields them.

    Raises KeyError when one of the requested keys is absent from `d`.
    """
    selection = {}
    for key in keys:
        selection[key] = d[key]
    return selection
def initialiseTexts(texts, key, annotation):
    """Register a blank entry for `key` in `texts` through `initialise`.

    The entry passed to `initialise` has no accepted label ('accept' is
    None), an empty set of rejected labels ('reject') and a 'row' made of the
    annotation's metadata plus, under 'content', the annotation's text
    stripped of surrounding whitespace and terminated by a newline.

    Raises TypeError if the metadata already has a 'content' key (duplicate
    keyword argument to dict()).

    NOTE(review): presumably `initialise` leaves an already existing entry
    untouched — confirm against GEODE.util.
    """
    row = dict(**annotation['meta'],
               content=annotation['text'].strip()+'\n')
    initialise(texts, key, {'accept': None, 'reject': set(), 'row': row})
def byLabel(annotations):
    """Group annotations by label, then by answer.

    Each label is given (through `initialise`) one list per supported answer:
    'accept', 'reject' and 'ignore'. An annotation whose answer is not one of
    the keys found under its label is reported on standard output and left
    out of the result.

    NOTE(review): presumably `initialise` only sets the value when the label
    is not registered yet — confirm against GEODE.util.
    """
    grouped = {}
    for annotation in annotations:
        label, answer = annotation['label'], annotation['answer']
        initialise(grouped, label, {'accept': [], 'reject': [], 'ignore': []})
        bucket = grouped[label]
        if answer in bucket:
            bucket[answer].append(annotation)
        else:
            print(f"Unsupported answer '{answer}' for annotation {annotation}")
    return grouped
def erase(texts, error, key, reason):
    """Move the row stored under `key` from `texts` to `error` and print why.

    The entry is removed from `texts`; only its 'row' is kept, under the same
    key, in `error`. `reason` is printed on standard output.
    """
    row = texts[key]['row']
    del texts[key]
    error[key] = row
    print(reason)
def accept(texts, errors, label, accepted):
    """Record `label` as the accepted label of every annotation in `accepted`.

    Annotations whose key is already in `errors` are skipped. When the text
    already has an accepted label, the text is moved to `errors` (through
    `erase`) instead of being updated.
    """
    for annotation in accepted:
        key = toKey(annotation['meta'])
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        if previous is None:
            texts[key]['accept'] = label
            continue
        reason = f"Found two annotations for {uid(annotation['meta'])}: '{label}' and '{previous}'"
        erase(texts, errors, key, reason)
def reject(texts, errors, label, rejected):
    """Add `label` to the rejected labels of every annotation in `rejected`.

    Annotations whose key is already in `errors` are skipped. When `label` is
    also the label accepted for the text, the text is moved to `errors`
    (through `erase`) instead of being updated.
    """
    for annotation in rejected:
        key = toKey(annotation['meta'])
        if key in errors:
            continue
        initialiseTexts(texts, key, annotation)
        previous = texts[key]['accept']
        contradiction = previous is not None and previous == label
        if not contradiction:
            texts[key]['reject'].add(label)
            continue
        reason = f"Contradiction found for {uid(annotation['meta'])}: function {label} should be both accepted and rejected"
        erase(texts, errors, key, reason)
def checkRejects(texts, errors):
    """Use the rejected labels of each text to detect dead ends or infer a label.

    For every entry of `texts`:
    - when as many labels were rejected as there are elements in
      `discursive.functions`, the text is moved to `errors` (through `erase`);
    - when no label was accepted and exactly one element of
      `discursive.functions` is not rejected, that element becomes the
      accepted label and the inference is reported on standard output.

    `texts` and `errors` are modified in place; nothing is returned.
    """
    # The set of functions does not change while looping: measure it once.
    countFunctions = len(discursive.functions)
    # Iterate over a snapshot: erase() deletes entries from `texts`, and
    # deleting from a dict while iterating over it raises RuntimeError.
    for key, text in list(texts.items()):
        countRejected = len(text['reject'])
        if countRejected == countFunctions:
            reason = f"No possible function left for {uid(text['row'])}"
            erase(texts, errors, key, reason)
        elif text['accept'] is None and countRejected == countFunctions - 1:
            text['accept'] = discursive.functions.difference(text['reject']).pop()
            print(f"Infered {uid(text['row'])} to be {text['accept']}, only discursive function left unrejected")
def byText(byLabelAnnotations):
    """Turn annotations grouped by label into per-text entries and errors.

    For each label, the accepted annotations are processed before the
    rejected ones; `checkRejects` then runs once over the result. Returns a
    pair of dict views: the entries of the texts that were kept, and the rows
    of the texts that were moved to the errors.
    """
    keptTexts, failedRows = {}, {}
    for label, byAnswer in byLabelAnnotations.items():
        accept(keptTexts, failedRows, label, byAnswer['accept'])
        reject(keptTexts, failedRows, label, byAnswer['reject'])
    checkRejects(keptTexts, failedRows)
    return keptTexts.values(), failedRows.values()
def toTsv(filePath, data):
    """Write the rows of `data`, sorted with `toKey`, to a TSV file.

    The file at `filePath` is tab-separated and written without the
    DataFrame index column.
    """
    ordered = list(data)
    ordered.sort(key=toKey)
    frame = pandas.DataFrame(ordered)
    frame.to_csv(filePath, sep='\t', index=False)
def toIterator(*args):
    """Lazily yield the elements of each iterable in `args`, one after another."""
    for iterable in args:
        yield from iterable
def exportCorpus(rootDirectory, texts, errors):
    """Save the rows of the kept texts and the error rows as one corpus.

    The 'row' of each element of `texts` and every element of `errors` are
    gathered, sorted with `toKey` and handed to `Directory(rootDirectory).save`.
    """
    corpus = Directory(rootDirectory)
    rows = [text['row'] for text in texts]
    rows.extend(errors)
    rows.sort(key=toKey)
    corpus.save(rows)
def indexByKey(annotations):
    """Index annotations by the `toKey` of their metadata.

    When several annotations share a key, the last one is kept.
    """
    index = {}
    for annotation in annotations:
        index[toKey(annotation['meta'])] = annotation
    return index
def allRejects(labels, label):
    """Collect the annotations counting as a rejection of `label`.

    These are the annotations explicitly rejected for `label`, completed by
    the annotations accepted for any other label whose key is not already
    present. Returns a view on the collected annotations (one per key).
    """
    rejected = indexByKey(labels[label]['reject'])
    for other, answers in labels.items():
        if other == label:
            continue
        for key, annotation in indexByKey(answers['accept']).items():
            # Explicit rejections and earlier labels take precedence.
            rejected.setdefault(key, annotation)
    return rejected.values()
def toRow(answer):
    """Build a function turning an annotation into a row tagged with `answer`.

    The returned function copies the annotation's metadata into a new dict
    and adds `answer` under the 'answer' key.
    """
    def annotationToRow(annotation):
        return dict(**annotation['meta'], answer=answer)
    return annotationToRow
def exportLabels(rootDirectory, labels):
    """Write one '<label>.tsv' file per label under `rootDirectory`.

    Each file lists the annotations accepted for the label (answer 'accept')
    followed by those returned by `allRejects` (answer 'reject'); `toTsv`
    takes care of sorting and writing the rows.
    """
    for label, answers in labels.items():
        acceptedRows = map(toRow('accept'), answers['accept'])
        rejectedRows = map(toRow('reject'), allRejects(labels, label))
        toTsv(f"{rootDirectory}/{label}.tsv",
              toIterator(acceptedRows, rejectedRows))
def multiJSONLToDirectory(jsonl, outputDirectory):
    """Convert a collection of annotations into a corpus directory.

    The annotations are grouped by label, merged per text, then exported to
    `outputDirectory`: the corpus itself, an 'errors.tsv' file when at least
    one text was moved to the errors, and one TSV file per label.
    """
    annotationsByLabel = byLabel(jsonl)
    texts, errors = byText(annotationsByLabel)
    exportCorpus(outputDirectory, texts, errors)
    hasErrors = len(errors) > 0
    if hasErrors:
        errorsPath = f"{outputDirectory}/errors.tsv"
        toTsv(errorsPath, errors)
    exportLabels(outputDirectory, annotationsByLabel)
if __name__ == '__main__':
    # Script entry point: the first argument is the annotations file handed
    # to JSONL.load, the second the directory receiving the exports.
    # Additional arguments are ignored.
    if len(sys.argv) < 3:
        # Fail with a usage message instead of a bare IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} ANNOTATIONS_JSONL OUTPUT_DIRECTORY")
    multiJSONLToDirectory(JSONL.load(sys.argv[1]), sys.argv[2])