Skip to content
Snippets Groups Projects
Commit 37a652cc authored by Alice Brenon
Browse files

Fix XML output from stanza which had multiple roots + add a demonstrator...

Fix XML output from stanza which had multiple roots + add a demonstrator haskell script that was used to fix the problem in place
parent cc30e47d
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import System.Environment (getArgs)
import System.Script (syntax)
import Text.Editor (editAll)
-- | Enclose a list of lines in a single @\<body\>@ element.
--
-- Each input line is prefixed with one tab character, and the result is
-- framed by an opening @\<body\>@ line and a closing @\<\/body\>@ line.
-- An empty input yields just the two framing lines.
wrap :: [Text] -> [Text]
wrap contents = ["<body>"] <> map indent contents <> ["</body>"]
  where
    -- Push a line one tab to the right so it sits visually inside the element.
    indent line = "\t" <> line
-- | Script entry point.
--
-- With exactly one command-line argument, the whole of standard input is read
-- and passed, together with that argument and 'wrap', to 'editAll'.
-- With any other number of arguments, 'syntax' is called with the name of the
-- expected argument (presumably to report correct usage; see "System.Script").
main :: IO ()
main = do
  args <- getArgs
  case args of
    [target] -> editAll wrap target =<< getContents
    _ -> syntax "TARGET_DIRECTORY"
......@@ -33,20 +33,22 @@ class Annotator:
def encode(self, file, annotation):
    """Write `annotation` to `<self.target>/<file>` as one XML document.

    Every sentence of `annotation.sentences` is serialised by
    `self.encode_sentence` inside a single `<text>` root element, so the
    resulting file has exactly one root.

    :param file: name of the output file, relative to `self.target`
    :param annotation: object exposing an iterable `sentences` attribute
    """
    # The file carries no XML declaration, so XML parsers will read it as
    # UTF-8: write it as UTF-8 explicitly instead of relying on the locale.
    with open(f'{self.target}/{file}', 'w', encoding='utf-8') as target:
        print('<text>', file=target)
        for sentence in annotation.sentences:
            self.encode_sentence(sentence, target)
        print('</text>', file=target)
def encode_sentence(self, sentence, target):
    """Write one sentence to `target` as an `<s>` element.

    Each token of `sentence.tokens` is serialised by `self.encode_token`
    between the opening and closing tags.

    :param sentence: object exposing an iterable `tokens` attribute
    :param target: writable text stream receiving the XML
    """
    # One tab of indentation: sentences are nested inside the <text> root
    # element, and each tag must be emitted exactly once.
    print('\t<s>', file=target)
    for token in sentence.tokens:
        self.encode_token(token, target)
    print('\t</s>', file=target)
def encode_token(self, token, target):
    """Write one token to `target` as a `<w>` element.

    The surface form (`token.text`) is the element content; the lemmas and
    the `upos` tags of `token.words` are each joined with `+` and stored in
    the `lemma` and `pos` attributes.

    :param token: object exposing `text` and an iterable `words`, whose
        items expose `lemma` and `upos`
    :param target: writable text stream receiving the XML
    """
    from xml.sax.saxutils import escape

    # Attribute values are delimited by double quotes, so those must be
    # escaped in addition to the usual &, < and >.
    attr_entities = {'"': '&quot;'}
    form = escape(token.text)
    lemma = escape('+'.join(w.lemma for w in token.words), attr_entities)
    upos = escape('+'.join(w.upos for w in token.words), attr_entities)
    # Two tabs: tokens are nested in <s>, itself nested in <text>.
    print(f'\t\t<w lemma="{lemma}" pos="{upos}">{form}</w>', file=target)
if __name__ == '__main__':
    # Forward the first two command-line arguments, in order, to Annotator
    # and start processing.
    first_arg, second_arg = sys.argv[1], sys.argv[2]
    annotator = Annotator(first_arg, second_arg)
    annotator.run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment