diff --git a/scripts/stanza-txt-annotator.py b/scripts/stanza-txt-annotator.py index 93e2d7837d647d1301dae8bb1dbd1f9efccfee61..255641219f70c19d3cf45d6efd3ab4940f908b29 100755 --- a/scripts/stanza-txt-annotator.py +++ b/scripts/stanza-txt-annotator.py @@ -10,6 +10,37 @@ from tqdm import tqdm def oneLine(sentence): return sentence.text.replace('\n', ' ').replace('\t', ' ') +def coNLLGet(token, attribute): + return str(token[attribute]) if attribute in token else '_' + +def singleOrRange(token_id): + if type(token_id) == int: + return str(token_id) + else: + return '-'.join(map(str, token_id)) + +def miscAttribute(token): + return lambda key: [f"{key}={token[key]}"] if key in token else [] + +def misc(token): + keys = ['start_char', 'end_char', 'ner'] + attributes = sum(map(miscAttribute(token), keys), []) + return '|'.join(attributes) if len(attributes) > 0 else '_' + +def formatToken(token): + return '\t'.join([ + singleOrRange(token['id']), + coNLLGet(token, "text"), + coNLLGet(token, "lemma"), + coNLLGet(token, "upos"), + coNLLGet(token, "xpos"), + coNLLGet(token, "feats"), + coNLLGet(token, "head"), + coNLLGet(token, "deprel"), + coNLLGet(token, "deps"), + misc(token) + ]) + class Annotator: def __init__(self, source, target): self.source = source @@ -57,16 +88,13 @@ class Annotator: if len(parsed.sentences) > 0: print(f'# newpar id = {self.newpar()}', file=target) for sentence in parsed.sentences: - sentence.add_comment(f'# sent_id = {self.newsent()}') - sentence.add_comment(f'# text = {oneLine(sentence)}') - print(CoNLL.doc2conll_text(parsed), file=target, end='') - -# def annotate_paragraph(self, paragraph, target): -# parsed = self.model(paragraph) -# for sentence in parsed.sentences: -# sentence.add_comment(f'# sent_id = {self.newsent()}') -# sentence.add_comment(f'# text = {oneLine(sentence)}') -# print(CoNLL.doc2conll_text(parsed), file=target, end='') + print(self.annotate_sentence(sentence), file=target) + def annotate_sentence(self, sentence): + return "\n".join( + [f'# sent_id = {self.newsent()}', + f'# text = {oneLine(sentence)}'] + + list(map(formatToken, sentence.to_dict())) + + ['']) def newpar(self): self.paragraphSerial += 1