From 37a652cc3358b323cd3d8abad6404cd98f14f481 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 23 Nov 2022 16:26:35 +0100
Subject: [PATCH] Fix XML output from stanza, which had multiple roots + add a
 demonstration Haskell script that was used to fix the problem in place

---
 scripts/reroot.hs           | 16 ++++++++++++++++
 scripts/stanza-annotator.py |  8 +++++---
 2 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100755 scripts/reroot.hs

diff --git a/scripts/reroot.hs b/scripts/reroot.hs
new file mode 100755
index 0000000..fc5e715
--- /dev/null
+++ b/scripts/reroot.hs
@@ -0,0 +1,16 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.Text (Text)
+import System.Environment (getArgs)
+import System.Script (syntax)
+import Text.Editor (editAll)
+
+wrap :: [Text] -> [Text]
+wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [target] = getContents >>= editAll wrap target
+    run _ = syntax "TARGET_DIRECTORY"
diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
index 8a35e11..e5e6052 100755
--- a/scripts/stanza-annotator.py
+++ b/scripts/stanza-annotator.py
@@ -33,20 +33,22 @@ class Annotator:
 
     def encode(self, file, annotation):
         with open(f'{self.target}/{file}', 'w') as target:
+            print('<text>', file=target)
             for sentence in annotation.sentences:
                 self.encode_sentence(sentence, target)
+            print('</text>', file=target)
 
     def encode_sentence(self, sentence, target):
-        print('<s>', file=target)
+        print('\t<s>', file=target)
         for token in sentence.tokens:
             self.encode_token(token, target)
-        print('</s>', file=target)
+        print('\t</s>', file=target)
 
     def encode_token(self, token, target):
         form = token.text
         lemma = '+'.join(map(lambda w: w.lemma, token.words))
         upos = '+'.join(map(lambda w: w.upos, token.words))
-        print(f'\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
+        print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
 
 if __name__ == '__main__':
     Annotator(sys.argv[1], sys.argv[2]).run()
-- 
GitLab