# Patch for scripts/stanza-annotator.py: XML-escape the per-token output.
#
# Hunk 1: imports escape and quoteattr from xml.sax.saxutils.
# Hunk 2: Annotator.encode_token now passes token.text through escape() and
#         the '+'-joined lemma / upos strings through quoteattr() before
#         printing the <w> element. The literal double quotes around the
#         attribute values are removed from the f-string because quoteattr()
#         returns the value already wrapped in quotes.
#
# NOTE(review): this patch arrived collapsed onto a single line. Line breaks
# and blank context lines were reconstructed from the hunk header counts;
# the indentation (assumed 4 spaces per level, and 2 levels for the print
# context line) could not be recovered -- confirm against the target file
# before applying.
diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
index e5e6052268eb4f412e0b961a6dc8087b46040afa..2d341b9bcef4c95d9f2b390d8388d410deb3b85c 100755
--- a/scripts/stanza-annotator.py
+++ b/scripts/stanza-annotator.py
@@ -4,6 +4,7 @@ import os
 import os.path
 import stanza
 import sys
+from xml.sax.saxutils import escape, quoteattr
 
 class Annotator:
     def __init__(self, source, target):
@@ -45,10 +46,10 @@ class Annotator:
         print('\t</s>', file=target)
 
     def encode_token(self, token, target):
-        form = token.text
-        lemma = '+'.join(map(lambda w: w.lemma, token.words))
-        upos = '+'.join(map(lambda w: w.upos, token.words))
-        print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
+        form = escape(token.text)
+        lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words)))
+        upos = quoteattr('+'.join(map(lambda w: w.upos, token.words)))
+        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)
 
 if __name__ == '__main__':
     Annotator(sys.argv[1], sys.argv[2]).run()