From 3e1224b84ef926e3077a28512cf6fd688e7803e6 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Sat, 10 Dec 2022 10:06:24 +0100
Subject: [PATCH] Don't forget to escape XML characters in output

---
 scripts/stanza-annotator.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
index e5e6052..2d341b9 100755
--- a/scripts/stanza-annotator.py
+++ b/scripts/stanza-annotator.py
@@ -4,6 +4,7 @@ import os
 import os.path
 import stanza
 import sys
+from xml.sax.saxutils import escape, quoteattr
 
 class Annotator:
     def __init__(self, source, target):
@@ -45,10 +46,10 @@ class Annotator:
         print('\t</s>', file=target)
 
     def encode_token(self, token, target):
-        form = token.text
-        lemma = '+'.join(map(lambda w: w.lemma, token.words))
-        upos = '+'.join(map(lambda w: w.upos, token.words))
-        print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
+        form = escape(token.text)
+        lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words)))
+        upos = quoteattr('+'.join(map(lambda w: w.upos, token.words)))
+        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)
 
 if __name__ == '__main__':
     Annotator(sys.argv[1], sys.argv[2]).run()
-- 
GitLab