#!/usr/bin/env python3
"""Annotate a directory tree of French text files with Stanza.

Every regular file found below the sub-directories of the source directory
is run through a Stanza pipeline (tokenize, pos, lemma) and written to the
mirrored location under the target directory as a ``.tei`` file containing
``<body>``, ``<s>`` and ``<w lemma=... pos=...>`` elements.

Usage: script SOURCE_DIR TARGET_DIR
"""

import os
import os.path
import stanza
import sys
from xml.sax.saxutils import escape, quoteattr


class Annotator:
    """Annotate the files under ``source`` and write results under ``target``."""

    def __init__(self, source, target):
        """Store the directory roots and build the Stanza pipeline.

        Args:
            source: Root directory that holds the input files.
            target: Root directory that receives the annotated output.
        """
        self.source = source
        self.target = target
        self.model = stanza.Pipeline(lang='fr',
                                     processors='tokenize,pos,lemma')

    def run(self):
        """Annotate every sub-directory of the source root.

        Entries directly inside the source root that are not directories
        are skipped; only sub-directories are descended into.
        """
        for name in os.listdir(self.source):
            if os.path.isdir(os.path.join(self.source, name)):
                self.annotate_dir(name)

    def annotate_dir(self, directory):
        """Recursively annotate one directory.

        Args:
            directory: Path of the directory, relative to the source root.
                The same relative path is created under the target root.
        """
        source_path = os.path.join(self.source, directory)
        os.makedirs(os.path.join(self.target, directory), exist_ok=True)
        for name in os.listdir(source_path):
            path = os.path.join(source_path, name)
            relpath = os.path.join(directory, name)
            if os.path.isdir(path):
                self.annotate_dir(relpath)
            elif os.path.isfile(path):
                self.annotate_file(relpath)

    def annotate_file(self, file):
        """Run the pipeline on one file and write its annotation.

        Args:
            file: Path of the input file, relative to the source root.
                The output uses the same relative path with the extension
                replaced by ``.tei``.
        """
        # Decode explicitly as UTF-8 so accented characters do not depend
        # on the locale's default encoding.
        with open(os.path.join(self.source, file), 'r',
                  encoding='utf-8') as source:
            text = source.read()
        # splitext handles extensions of any length (or none), unlike a
        # fixed four-character slice.
        stem = os.path.splitext(file)[0]
        self.encode(f'{stem}.tei', self.model(text))

    def encode(self, file, annotation):
        """Write an annotated document as a ``<body>`` element.

        Args:
            file: Path of the output file, relative to the target root.
            annotation: Pipeline result; its ``sentences`` are written in
                order.
        """
        with open(os.path.join(self.target, file), 'w',
                  encoding='utf-8') as target:
            print('<body>', file=target)
            for sentence in annotation.sentences:
                self.encode_sentence(sentence, target)
            print('</body>', file=target)

    def encode_sentence(self, sentence, target):
        """Write one sentence as an ``<s>`` element wrapping its tokens.

        Args:
            sentence: Object whose ``tokens`` are written in order.
            target: Writable text stream.
        """
        print('\t<s>', file=target)
        for token in sentence.tokens:
            self.encode_token(token, target)
        print('\t</s>', file=target)

    def encode_token(self, token, target):
        """Write one token as a ``<w>`` element.

        The lemmas and the UPOS tags of the token's words are each joined
        with ``+`` and emitted as the ``lemma`` and ``pos`` attributes; the
        token text, XML-escaped, is the element content.

        Args:
            token: Object with ``text`` and ``words`` (each word having
                ``lemma`` and ``upos``).
            target: Writable text stream.
        """
        form = escape(token.text)
        lemma = quoteattr('+'.join(word.lemma for word in token.words))
        upos = quoteattr('+'.join(word.upos for word in token.words))
        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)


if __name__ == '__main__':
    # Fail with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit(f'usage: {sys.argv[0]} SOURCE_DIR TARGET_DIR')
    Annotator(sys.argv[1], sys.argv[2]).run()