#!/usr/bin/env python3
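"""stanza-annotator.py

Annotate a tree of plain-text files with Stanza's French pipeline
(tokenize, pos, lemma) and write one TEI-like XML file per input file,
mirroring the source directory layout under the target directory.

Each output file contains a <body> element with one <s> element per
sentence and one <w> element per token, carrying lemma and pos
attributes, e.g. <w lemma="le" pos="DET">le</w>.
"""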
import os
import os.path
import stanza
import sys
from xml.sax.saxutils import escape, quoteattr


class Annotator:
    def __init__(self, source, target):
        self.source = source
        self.target = target
        # Load the French Stanza pipeline once; it is reused for every file.
        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')

    def run(self):
        # Process every top-level subdirectory of the source directory.
        for name in os.listdir(self.source):
            if os.path.isdir(f'{self.source}/{name}'):
                self.annotate_dir(name)

    def annotate_dir(self, directory):
        # Recursively annotate a directory, mirroring its layout under the target.
        source_path = f'{self.source}/{directory}'
        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
        for name in os.listdir(source_path):
            path = f'{source_path}/{name}'
            relpath = f'{directory}/{name}'
            if os.path.isdir(path):
                self.annotate_dir(relpath)
            elif os.path.isfile(path):
                self.annotate_file(relpath)

    def annotate_file(self, file):
        # Run the pipeline on the file's text and encode the result, replacing
        # the 4-character source extension (e.g. '.txt') with '.tei'.
        with open(f'{self.source}/{file}', 'r') as source:
            self.encode(f'{file[:-4]}.tei', self.model(source.read()))

    def encode(self, file, annotation):
        # Serialise an annotated document as a TEI-like <body> of sentences.
        with open(f'{self.target}/{file}', 'w') as target:
            print('<body>', file=target)
            for sentence in annotation.sentences:
                self.encode_sentence(sentence, target)
            print('</body>', file=target)

def encode_sentence(self, sentence, target):
print('\t<s>', file=target)
for token in sentence.tokens:
self.encode_token(token, target)
print('\t</s>', file=target)
    def encode_token(self, token, target):
        # A token may correspond to several words (e.g. French contractions),
        # so their lemmas and UPOS tags are joined with '+'.
        form = escape(token.text)
        lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words)))
        upos = quoteattr('+'.join(map(lambda w: w.upos, token.words)))
        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)


if __name__ == '__main__':
    Annotator(sys.argv[1], sys.argv[2]).run()
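# Example invocation (hypothetical paths):
#   ./stanza-annotator.py corpus/plain corpus/annotated
# The first argument is the source directory, the second the target directory.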