#!/usr/bin/env python3

import os
import os.path
import stanza
import sys
from xml.sax.saxutils import escape, quoteattr

class Annotator:
    """Recursively annotate a tree of French text files with Stanza.

    Runs tokenization, POS tagging, and lemmatization on every file
    found under the source root and writes each result as a TEI-like
    XML document (<body>/<s>/<w>) to a mirrored path under the target
    root, with the extension replaced by '.tei'.
    """

    def __init__(self, source, target):
        """source/target: root directories for input text and XML output."""
        self.source = source
        self.target = target
        # Loading Stanza models is expensive — build the pipeline once
        # and reuse it for every file.
        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')

    def run(self):
        """Annotate every top-level subdirectory of the source root.

        NOTE(review): files directly inside the source root (not in a
        subdirectory) are skipped — presumably intentional; confirm.
        """
        for name in os.listdir(self.source):
            if os.path.isdir(f'{self.source}/{name}'):
                self.annotate_dir(name)

    def annotate_dir(self, directory):
        """Recursively annotate all files under source/<directory>,
        mirroring the directory structure under target/<directory>."""
        source_path = f'{self.source}/{directory}'
        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
        for name in os.listdir(source_path):
            path = f'{source_path}/{name}'
            relpath = f'{directory}/{name}'
            if os.path.isdir(path):
                self.annotate_dir(relpath)
            elif os.path.isfile(path):
                self.annotate_file(relpath)

    def annotate_file(self, file):
        """Run the pipeline on one source file and write its .tei output.

        Fix: the original used file[:-4] to strip the extension, which
        silently mangles names whose suffix is not exactly 4 characters
        ('.html', '.md', no extension). os.path.splitext handles any
        extension correctly. Input is read as UTF-8 explicitly so the
        result does not depend on the locale's default encoding.
        """
        with open(f'{self.source}/{file}', 'r', encoding='utf-8') as source:
            target_name = f'{os.path.splitext(file)[0]}.tei'
            self.encode(target_name, self.model(source.read()))

    def encode(self, file, annotation):
        """Serialize a Stanza Document to target/<file> as XML."""
        with open(f'{self.target}/{file}', 'w', encoding='utf-8') as target:
            print('<body>', file=target)
            for sentence in annotation.sentences:
                self.encode_sentence(sentence, target)
            print('</body>', file=target)

    def encode_sentence(self, sentence, target):
        """Write one sentence as an <s> element containing its tokens."""
        print('\t<s>', file=target)
        for token in sentence.tokens:
            self.encode_token(token, target)
        print('\t</s>', file=target)

    def encode_token(self, token, target):
        """Write one token as <w lemma=... pos=...>form</w>.

        Multi-word tokens (e.g. French 'du' -> 'de' + 'le') have the
        lemmas/UPOS of their component words joined with '+'. Text
        content is XML-escaped; attribute values go through quoteattr.
        """
        form = escape(token.text)
        # Fix: a word's lemma/upos can be None, which made the original
        # str.join raise TypeError; fall back to '' instead.
        lemma = quoteattr('+'.join(w.lemma or '' for w in token.words))
        upos = quoteattr('+'.join(w.upos or '' for w in token.words))
        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)

if __name__ == '__main__':
    # Fix: a missing argument previously raised a raw IndexError; fail
    # with a usage message instead (exit status 2, argparse convention).
    if len(sys.argv) != 3:
        sys.exit(f'usage: {sys.argv[0]} SOURCE_DIR TARGET_DIR')
    Annotator(sys.argv[1], sys.argv[2]).run()