From 3e1224b84ef926e3077a28512cf6fd688e7803e6 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Sat, 10 Dec 2022 10:06:24 +0100 Subject: [PATCH] Don't forget to escape XML characters in output --- scripts/stanza-annotator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py index e5e6052..2d341b9 100755 --- a/scripts/stanza-annotator.py +++ b/scripts/stanza-annotator.py @@ -4,6 +4,7 @@ import os import os.path import stanza import sys +from xml.sax.saxutils import escape, quoteattr class Annotator: def __init__(self, source, target): @@ -45,10 +46,10 @@ class Annotator: print('\t</s>', file=target) def encode_token(self, token, target): - form = token.text - lemma = '+'.join(map(lambda w: w.lemma, token.words)) - upos = '+'.join(map(lambda w: w.upos, token.words)) - print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target) + form = escape(token.text) + lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words))) + upos = quoteattr('+'.join(map(lambda w: w.upos, token.words))) + print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target) if __name__ == '__main__': Annotator(sys.argv[1], sys.argv[2]).run() -- GitLab