Skip to content
Snippets Groups Projects
Commit 062f130c authored by Alice Brenon
Browse files

Implement splitters for Simple and Multi workflows

parent a936a822
No related branches found
No related tags found
No related merge requests found
from GEODE import uid
def TwoAnnotations(text, first, second):
    """Build the message reporting two conflicting annotations for a text.

    text   -- either a string, used verbatim as the text identifier, or any
              other object, which is passed to `uid` to obtain the identifier
    first  -- the first annotation found, interpolated into the message
    second -- the second annotation found, interpolated into the message

    Returns the formatted message as a string.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also used verbatim instead of being handed to uid.
    textUID = text if isinstance(text, str) else uid(text)
    return f"Found two annotations for {textUID}: '{first}' and '{second}'"
def initialise(dictionary, label, value):
    """Store *value* under *label* unless *label* is already a key.

    An existing entry is left untouched, whatever its value. Nothing is
    returned: the dictionary is modified in place.
    """
    if label in dictionary:
        return
    dictionary[label] = value
def checkBound(f):
    """Check that *f* is a ratio in the closed interval [0, 1].

    Returns *f* unchanged when it is within bounds. Otherwise an error
    message is written to standard error and the process is terminated with
    exit status 1 (a SystemExit is raised).
    """
    # Function-scope import: this block must not rely on the module's imports.
    import sys

    if 0 <= f <= 1:
        return f
    # sys.exit instead of the bare `exit` helper: the latter is installed by
    # the `site` module and is not guaranteed to exist in every interpreter.
    print("Expected a ratio between 0 and 1 (inclusive)", file=sys.stderr)
    sys.exit(1)
def parseRatio(s):
    """Parse a ratio written either as a percentage or as a plain number.

    A string ending with '%' (e.g. "80%" or "12.5%") is divided by 100;
    any other string is read directly as a float (e.g. "0.8"). The result is
    validated by `checkBound` before being returned.

    Raises ValueError when the numeric part cannot be parsed (including an
    empty string).
    """
    s = s.strip()
    # float() rather than int() for the percentage so that fractional
    # percentages are accepted; whole percentages give the same result.
    ratio = float(s[:-1]) / 100 if s.endswith('%') else float(s)
    return checkBound(ratio)
#!/usr/bin/env python3 #!/usr/bin/env python3
from GEODE import toKey
import pandas import pandas
import JSONL import JSONL
import sys import sys
...@@ -13,11 +14,10 @@ def tsv_row(annotation): ...@@ -13,11 +14,10 @@ def tsv_row(annotation):
'paragraphFunction': annotation['label'] 'paragraphFunction': annotation['label']
} }
def acceptedToTSV(inputJSONL, outputTSV):
    """Write the annotations of *inputJSONL* to *outputTSV* as a TSV file.

    Each annotation is converted to a row with `tsv_row`; the rows are sorted
    using `toKey` as the sort key, then written tab-separated without the
    index column.

    NOTE(review): the previous version of this code kept only annotations
    whose 'answer' was 'accept'; this version performs no filtering, so
    callers are presumably expected to pass accepted annotations only.
    To be confirmed.
    """
    rows = sorted([tsv_row(a) for a in inputJSONL], key=toKey)
    annotations = pandas.DataFrame(rows)
    annotations.to_csv(outputTSV, sep='\t', index=False)


if __name__ == '__main__':
    # Usage: <script> INPUT.jsonl OUTPUT.tsv
    acceptedToTSV(JSONL.load(sys.argv[1]), sys.argv[2])
...@@ -3,19 +3,14 @@ ...@@ -3,19 +3,14 @@
from Corpus import Directory from Corpus import Directory
from GEODE import toKey, uid from GEODE import toKey, uid
import GEODE.discursive as discursive import GEODE.discursive as discursive
from GEODE.util import initialise
import pandas import pandas
import JSONL import JSONL
import sys import sys
binary = ['accept', 'reject']
def subDict(d, keys):
    """Return a new dict restricted to *keys*, with values taken from *d*.

    Every key in *keys* must be present in *d* (a KeyError is raised
    otherwise).
    """
    selection = {}
    for key in keys:
        selection[key] = d[key]
    return selection
def initialise(dictionary, label, value):
    """Associate *value* to *label* only when *label* is not yet a key.

    The dictionary is updated in place and an existing entry is never
    overwritten.
    """
    if label in dictionary:
        return
    dictionary[label] = value
def initialiseTexts(texts, key, annotation): def initialiseTexts(texts, key, annotation):
initialise(texts, initialise(texts,
key, key,
...@@ -119,11 +114,13 @@ def exportLabels(rootDirectory, labels): ...@@ -119,11 +114,13 @@ def exportLabels(rootDirectory, labels):
toIterator(map(toRow('accept'), answers['accept']), toIterator(map(toRow('accept'), answers['accept']),
map(toRow('reject'), allRejects(labels, label)))) map(toRow('reject'), allRejects(labels, label))))
def multiJSONLToDirectory(jsonl, outputDirectory):
    """Export the annotations of *jsonl* under *outputDirectory*.

    The annotations are grouped with `byLabel`, then `byText` derives the
    texts and the errors from that grouping. The corpus is exported with
    `exportCorpus`, the errors (if any) are written to
    "<outputDirectory>/errors.tsv", and the labels are exported with
    `exportLabels`.
    """
    byLabelAnnotations = byLabel(jsonl)
    texts, errors = byText(byLabelAnnotations)
    exportCorpus(outputDirectory, texts, errors)
    if len(errors) > 0:
        toTsv(f"{outputDirectory}/errors.tsv", errors)
    # NOTE(review): indentation was lost in the reviewed text; the labels
    # export is assumed to run whether or not errors were found. To confirm.
    exportLabels(outputDirectory, byLabelAnnotations)


if __name__ == '__main__':
    # Usage: <script> INPUT.jsonl OUTPUT_DIRECTORY
    multiJSONLToDirectory(JSONL.load(sys.argv[1]), sys.argv[2])
#!/usr/bin/env python3
from Corpus import Directory
from GEODE import toKey
from GEODE.Error import TwoAnnotations
from GEODE.util import initialise, parseRatio
import JSONL
from random import shuffle
from sys import argv, stdin
from prodigyAcceptedJSONLToTSV import acceptedToTSV
from prodigyMultiJSONLToDirectory import multiJSONLToDirectory
def getTexts(inputJSONL):
    """Group the annotations of *inputJSONL* by text.

    Returns a dict mapping the key of each text (computed by `toKey` from the
    annotation's 'meta') to a dict with two entries:
      'accept' -- the accepted annotation for that text, or None
      'reject' -- the list of annotations with any other answer

    When a second accepted annotation is met for the same text, a message is
    printed and every later annotation for that text is ignored. What was
    collected for it before the conflict remains in the result.
    """
    texts = {}
    errors = set()
    for annotation in inputJSONL:
        key = toKey(annotation['meta'])
        if key in errors:
            # A conflict was already reported for this text.
            continue
        initialise(texts, key, {'accept': None, 'reject': []})
        if annotation['answer'] != 'accept':
            texts[key]['reject'].append(annotation)
            continue
        previous = texts[key]['accept']
        if previous is None:
            texts[key]['accept'] = annotation
        else:
            # Report the two competing labels: the one recorded first and
            # the one carried by the current annotation.
            print(TwoAnnotations(annotation['meta'],
                                 previous['label'],
                                 annotation['label']))
            errors.add(key)
    return texts
def getTest(texts, trainRatio):
    """Randomly draw the test set among the texts having an accepted annotation.

    The keys of the texts whose 'accept' entry is not None are shuffled, and
    a share of (1 - trainRatio) of them (rounded) is kept. Returns a dict
    mapping each selected key to its accepted annotation.
    """
    candidates = []
    for key, text in texts.items():
        if text['accept'] is not None:
            candidates.append(key)
    shuffle(candidates)
    testSize = round(len(candidates) * (1-trainRatio))
    selected = candidates[:testSize]
    return {key: texts[key]['accept'] for key in selected}
def allAnnotations(text):
    """Return every annotation recorded for *text*.

    The accepted annotation, when there is one, comes first and is followed
    by the rejected ones. Without an accepted annotation the 'reject' list
    itself is returned.
    """
    accepted = text['accept']
    rejected = text['reject']
    if accepted is not None:
        return [accepted] + rejected
    return rejected
def getTrain(texts, test):
    """Collect the annotations of all the texts that are not in *test*.

    Texts are visited in the sorted order of their keys; for each one whose
    key is absent from *test*, all its annotations (as returned by
    `allAnnotations`) are appended to the resulting list.
    """
    train = []
    for key in sorted(texts.keys()):
        if key in test:
            continue
        train.extend(allAnnotations(texts[key]))
    return train
def splitMulti(jsonl, trainRatio, trainOutput, testOutput):
    """Split the annotations of *jsonl* into a train set and a test set.

    The annotations are grouped by text with `getTexts`, the test set is
    drawn by `getTest` according to *trainRatio*, and `getTrain` gathers the
    annotations of the remaining texts. The train set is exported to
    *trainOutput* by `multiJSONLToDirectory` and the accepted annotations of
    the test set are written to *testOutput* by `acceptedToTSV`.
    """
    byText = getTexts(jsonl)
    testSet = getTest(byText, trainRatio)
    trainSet = getTrain(byText, testSet)
    multiJSONLToDirectory(trainSet, trainOutput)
    acceptedToTSV(testSet.values(), testOutput)


if __name__ == '__main__':
    # Annotations are read from standard input.
    # Usage: <script> RATIO TRAIN_OUTPUT TEST_OUTPUT
    splitMulti(JSONL.load(stdin), parseRatio(argv[1]), argv[2], argv[3])
#!/usr/bin/env python3
from GEODE.util import initialise, parseRatio
import JSONL
from prodigyAcceptedJSONLToTSV import acceptedToTSV
from sys import argv, stdin
def splitSimple(jsonl, trainRatio, trainOutput, testOutput):
    """Split *jsonl* in two consecutive parts and write each one as TSV.

    The first round(len(jsonl) * trainRatio) annotations are written to
    *trainOutput* and the remaining ones to *testOutput*, both through
    `acceptedToTSV`. The input order is kept (no shuffling); *jsonl* must
    support len() and slicing.
    """
    cut = round(len(jsonl) * trainRatio)
    acceptedToTSV(jsonl[:cut], trainOutput)
    acceptedToTSV(jsonl[cut:], testOutput)


if __name__ == '__main__':
    # Annotations are read from standard input and materialised as a list.
    # Usage: <script> RATIO TRAIN_OUTPUT TEST_OUTPUT
    splitSimple(list(JSONL.load(stdin)), parseRatio(argv[1]), argv[2], argv[3])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment