# reimport.py
# Rebuilds TEI documents from the annotated article files whose paths are
# listed on standard input.
from lxml import etree
from sys import stdin
from os.path import basename
import re

def dom(name, attributes, inside):
    """Create an XML element with the given tag, attributes and content.

    Args:
        name: Tag name of the new element.
        attributes: Mapping of attribute names to values, handed to
            ``etree.Element``.
        inside: The element's content: a string (stored as the element's
            text), an iterable of child elements (appended in order), or
            ``None`` for an element with no content.

    Returns:
        The newly created element.
    """
    elem = etree.Element(name, attributes)
    if inside is None:
        # An element's ``.text`` is None when it has no leading text; treat
        # that as "no content" instead of failing while iterating None.
        return elem
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are stored as text and not iterated character by character.
    if isinstance(inside, str):
        elem.text = inside
    else:
        for child in inside:
            elem.append(child)
    return elem

def analyze(inputArticle):
    """Extract the heading and content nodes from a parsed article.

    The nodes matched by the XPath ``//text/*`` are walked in order: the text
    of the first one becomes the article heading, every following node is
    collected as content.

    Args:
        inputArticle: Parsed document exposing an ``xpath`` method.

    Returns:
        A dict with the keys ``contents`` (list of nodes), ``head`` (str),
        ``author`` (defaults to ``"anonyme"``), ``normClass`` and
        ``generatedClass`` (both default to ``""``).
    """
    article = {"contents": []}
    for node in inputArticle.xpath('//text/*'):
        if 'head' not in article:
            # ``.text`` is None when the node has no leading text; keep the
            # heading a string so it can be used as element text and as an
            # attribute value downstream.
            article['head'] = node.text or ""
        else:
            article['contents'].append(node)
    # A document without any matching node still gets a (blank) heading
    # instead of failing later with a KeyError.
    article.setdefault('head', "")
    article.setdefault('author', "anonyme")
    for prefix in ('norm', 'generated'):
        article.setdefault(prefix + 'Class', "")
    return article

def teiHeader(article):
    """Build the ``teiHeader`` element for *article*.

    The header holds a single ``fileDesc`` made of a ``titleStmt`` (title),
    a ``publicationStmt`` (fixed note) and a ``sourceDesc`` whose ``bibl``
    repeats the title and gives the author.

    Args:
        article: Mapping providing the ``head`` and ``author`` entries.

    Returns:
        The ``teiHeader`` element.
    """
    heading = article['head']

    # Two distinct title elements are created on purpose: one element cannot
    # sit under both titleStmt and bibl.
    title_stmt = dom('titleStmt', {}, [dom('title', {}, heading)])
    publication_stmt = dom('publicationStmt', {}, [
        dom('p', {}, "Annotated with TreeTagger for project GEODE"),
    ])
    bibl = dom('bibl', {}, [
        dom('title', {}, heading),
        dom('author', {}, article['author']),
    ])
    source_desc = dom('sourceDesc', {}, [bibl])

    file_desc = dom('fileDesc', {}, [title_stmt, publication_stmt, source_desc])
    return dom('teiHeader', {}, [file_desc])

def contents(article):
    """Strip ``xml:id`` from the ``w`` elements of the article's content.

    Each node in ``article['contents']`` is modified in place: the node
    itself (when it is a ``w``) and every ``w`` descendant lose their
    ``xml:id`` attribute. Other attributes are left alone.

    Args:
        article: Mapping whose ``contents`` entry is a list of elements.

    Returns:
        The same ``article['contents']`` list.
    """
    # Clark notation for the ``xml:id`` attribute.
    xmlId = '{http://www.w3.org/XML/1998/namespace}id'
    for c in article['contents']:
        # iter() is scoped to c and its descendants. An absolute '//w' query
        # would rescan the whole document once per content node.
        for w in c.iter('w'):
            w.attrib.pop(xmlId, None)
    return article['contents']

def buildDocument(docId, article):
    """Assemble the complete ``TEI`` element for one article.

    The result contains the header produced by ``teiHeader`` followed by a
    ``text`` element that carries the article metadata as attributes and
    wraps a ``body`` holding the nodes returned by ``contents``.

    Args:
        docId: Value of the ``id`` attribute on the ``text`` element.
        article: Mapping providing ``head``, ``normClass``,
            ``generatedClass``, ``author`` and ``contents``.

    Returns:
        The root ``TEI`` element.
    """
    # Insertion order is kept deliberately: id, title, then the copied keys.
    text_attrs = {'id': docId, 'title': article['head']}
    for key in ('normClass', 'generatedClass', 'author'):
        text_attrs[key] = article[key]

    header = teiHeader(article)
    body = dom('body', {}, contents(article))
    text = dom('text', text_attrs, [body])
    return dom('TEI', {}, [header, text])

def main():
    """Convert every article file listed on standard input.

    Standard input is read as one file path per line. Paths whose base name
    does not start with ``T<digits>article_<digits>.xml`` are skipped. For
    each remaining file the document id is the two numbers joined by ``-``,
    and the rebuilt TEI document is printed on standard output.
    """
    # Raw string so ``\d`` reaches the regex engine untouched, and an escaped
    # dot so only a literal "." is accepted before the extension. match()
    # anchors at the start only, so trailing characters are still accepted.
    pattern = re.compile(r'T(\d+)article_(\d+)\.xml')
    for filepath in stdin.read().splitlines():
        m = pattern.match(basename(filepath))
        if m is None:
            continue
        docId = '-'.join(m.groups())
        # Let the XML parser open the file itself so the encoding comes from
        # the document, not from the locale's text-mode default.
        inputArticle = etree.parse(filepath)
        document = buildDocument(docId, analyze(inputArticle))
        print(etree.tostring(document, encoding='unicode'))


if __name__ == '__main__':
    main()