"""Re-wrap annotated article XML files into TEI documents.

Reads file paths from standard input (one per line), keeps those whose base
name looks like ``T<number>article_<number>.xml``, and prints one serialized
``TEI`` document per matching file on standard output.
"""

import re
from os.path import basename
from sys import stdin

from lxml import etree

# Base names are matched from their start only (no end anchor), so anything
# following ".xml" is tolerated, as in the original pattern.
ARTICLE_FILENAME = re.compile(r'T(\d+)article_(\d+)\.xml')

XML_ID = '{http://www.w3.org/XML/1998/namespace}id'


def dom(name, attributes, inside):
    """Build an element called *name* carrying *attributes*.

    *inside* is either a string, which becomes the element's text, or an
    iterable of elements, which are appended as children in order.
    """
    elem = etree.Element(name, attributes)
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also treated as text instead of being iterated
    # character by character.
    if isinstance(inside, str):
        elem.text = inside
    else:
        for child in inside:
            elem.append(child)
    return elem


def analyze(inputArticle):
    """Split a parsed input document into a dictionary describing the article.

    The first element matched by ``//text/*`` provides ``'head'`` (its
    ``.text``); every following matched element is collected, in document
    order, under ``'contents'``.  ``'author'`` defaults to ``"anonyme"`` and
    ``'normClass'`` / ``'generatedClass'`` default to the empty string.

    If the document has no ``//text/*`` element at all, the returned
    dictionary has no ``'head'`` key.
    """
    article = {"contents": []}
    for node in inputArticle.xpath('//text/*'):
        if 'head' not in article:
            # NOTE(review): node.text is None when the element has no
            # leading text; dom() and the 'title' attribute both need a
            # string, so such input fails later on -- confirm heads are
            # always plain text.
            article['head'] = node.text
        else:
            article['contents'].append(node)
    article.setdefault('author', "anonyme")
    for prefix in ('norm', 'generated'):
        article.setdefault(prefix + 'Class', "")
    return article


def teiHeader(article):
    """Return the ``teiHeader`` element built from the article's head and author."""
    return dom('teiHeader', {}, [
        dom('fileDesc', {}, [
            dom('titleStmt', {}, [
                dom('title', {}, article['head'])
            ]),
            dom('publicationStmt', {}, [
                dom('p', {}, "Annotated with TreeTagger for project GEODE")
            ]),
            dom('sourceDesc', {}, [
                dom('bibl', {}, [
                    dom('title', {}, article['head']),
                    dom('author', {}, article['author'])
                ])
            ])
        ])
    ])


def contents(article):
    """Return the article's content elements with ``xml:id`` removed from ``w``.

    The elements are modified in place; the returned list is
    ``article['contents']`` itself.
    """
    for node in article['contents']:
        # A relative axis keeps the search inside this node.  An absolute
        # '//w' would rescan the whole source document for every node.
        for w in node.xpath('descendant-or-self::w'):
            if XML_ID in w.attrib:
                del w.attrib[XML_ID]
    return article['contents']


def buildDocument(docId, article):
    """Assemble the ``TEI`` element (header plus text body) for one article.

    *docId* becomes the ``id`` attribute of the ``text`` element, next to the
    article's title, classes and author.
    """
    textAttributes = {
        'id': docId,
        'title': article['head'],
        'normClass': article['normClass'],
        'generatedClass': article['generatedClass'],
        'author': article['author']
    }
    return dom('TEI', {}, [
        teiHeader(article),
        dom('text', textAttributes, [
            dom('body', {}, contents(article))
        ])
    ])


if __name__ == '__main__':
    for filepath in stdin.read().splitlines():
        m = ARTICLE_FILENAME.match(basename(filepath))
        if m is not None:
            docId = '-'.join(m.groups())
            # Binary mode: the XML parser detects the encoding itself
            # instead of relying on the locale's default text decoding.
            with open(filepath, 'rb') as f:
                inputArticle = etree.parse(f)
            document = buildDocument(docId, analyze(inputArticle))
            print(etree.tostring(document, encoding='unicode'))