diff --git a/scripts/reroot.hs b/scripts/reroot.hs
new file mode 100755
index 0000000000000000000000000000000000000000..fc5e715e971a453492c52e67c5c84f425bbf21dc
--- /dev/null
+++ b/scripts/reroot.hs
@@ -0,0 +1,16 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.Text (Text)
+import System.Environment (getArgs)
+import System.Script (syntax)
+import Text.Editor (editAll)
+
+wrap :: [Text] -> [Text]
+wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [target] = getContents >>= editAll wrap target
+    run _ = syntax "TARGET_DIRECTORY"
diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
index 8a35e11c247ae30add52f2a05b5a3c67a3e5fe72..e5e6052268eb4f412e0b961a6dc8087b46040afa 100755
--- a/scripts/stanza-annotator.py
+++ b/scripts/stanza-annotator.py
@@ -33,20 +33,22 @@ class Annotator:
 
     def encode(self, file, annotation):
         with open(f'{self.target}/{file}', 'w') as target:
+            print('<text>', file=target)
             for sentence in annotation.sentences:
                 self.encode_sentence(sentence, target)
+            print('</text>', file=target)
 
     def encode_sentence(self, sentence, target):
-        print('<s>', file=target)
+        print('\t<s>', file=target)
         for token in sentence.tokens:
             self.encode_token(token, target)
-        print('</s>', file=target)
+        print('\t</s>', file=target)
 
     def encode_token(self, token, target):
         form = token.text
         lemma = '+'.join(map(lambda w: w.lemma, token.words))
         upos = '+'.join(map(lambda w: w.upos, token.words))
-        print(f'\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
+        print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
 
 if __name__ == '__main__':
     Annotator(sys.argv[1], sys.argv[2]).run()