Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from lxml import etree
from sys import stdin
from os.path import basename
import re
def dom(name, attributes, inside):
    """Build an element called ``name`` with ``attributes`` and some content.

    ``inside`` selects the content of the new element:

    * a string (or any ``str`` subclass) becomes the element's text;
    * ``None`` leaves the element empty;
    * any other value is treated as an iterable of already-built elements,
      which are appended as children in iteration order.

    Returns the newly created element.
    """
    elem = etree.Element(name, attributes)
    if inside is None:
        # A missing text value means "no content"; iterating None would
        # raise a TypeError.
        return elem
    if isinstance(inside, str):
        # isinstance rather than an exact type comparison, so that str
        # subclasses are used as text instead of being iterated character
        # by character and appended as if they were elements.
        elem.text = inside
    else:
        elem.extend(inside)
    return elem
def analyze(inputArticle):
    """Split a parsed input article into a heading and its content nodes.

    The nodes matched by the XPath ``//text/*`` are taken in order: the text
    of the first one becomes ``article['head']`` and every following node is
    collected in ``article['contents']``. If nothing matches, no ``'head'``
    key is set.

    ``'author'`` is filled with ``"anonyme"`` and ``'normClass'`` /
    ``'generatedClass'`` with an empty string when they are not already
    present.

    Returns the resulting dict.
    """
    nodes = iter(inputArticle.xpath('//text/*'))
    article = {"contents": []}

    # Consume only the first node: it provides the heading text.
    for first in nodes:
        article['head'] = first.text
        break
    # Whatever is left in the iterator is the body of the article.
    article['contents'].extend(nodes)

    article.setdefault('author', "anonyme")
    for prefix in ('norm', 'generated'):
        article.setdefault(prefix + 'Class', "")
    return article
def teiHeader(article):
    """Build the ``teiHeader`` element describing ``article``.

    The header holds a single ``fileDesc`` made of three parts, in order:
    a ``titleStmt`` with the article heading as title, a ``publicationStmt``
    with a fixed note, and a ``sourceDesc`` whose ``bibl`` repeats the title
    and gives the author.

    Reads ``article['head']`` and ``article['author']``.
    """
    heading = article['head']

    titleStmt = dom('titleStmt', {}, [
        dom('title', {}, heading)
    ])
    publicationStmt = dom('publicationStmt', {}, [
        dom('p', {}, "Annotated with TreeTagger for project GEODE")
    ])
    bibl = dom('bibl', {}, [
        dom('title', {}, heading),
        dom('author', {}, article['author'])
    ])
    sourceDesc = dom('sourceDesc', {}, [bibl])

    fileDesc = dom('fileDesc', {}, [titleStmt, publicationStmt, sourceDesc])
    return dom('teiHeader', {}, [fileDesc])
def contents(article):
    """Strip ``xml:id`` from the ``w`` elements of the article body.

    Every node in ``article['contents']`` is modified in place: the
    ``xml:id`` attribute is removed from each ``w`` element found in the
    node (the node itself included, if it is a ``w``).

    Returns ``article['contents']``, the same list that was walked.
    """
    xmlId = etree.QName('{http://www.w3.org/XML/1998/namespace}id')
    for c in article['contents']:
        # iter('w') walks c and its descendants only. The former absolute
        # XPath '//w' was evaluated against the whole source document, so
        # every content node triggered a full re-scan of the document
        # (quadratic work). 'w' elements outside the returned nodes are
        # consequently left untouched now.
        for w in c.iter('w'):
            if xmlId in w.attrib:
                del w.attrib[xmlId]
    return article['contents']
def buildDocument(docId, article):
    """Assemble the output ``TEI`` element for one article.

    The root ``TEI`` element has two children: the header produced by
    ``teiHeader(article)`` and a ``text`` element. The ``text`` element
    carries the document id plus the article's title, classes and author as
    attributes, and wraps a ``body`` filled with ``contents(article)``.

    Returns the root ``TEI`` element.
    """
    textAttributes = {
        'id': docId,
        'title': article['head'],
        'normClass': article['normClass'],
        'generatedClass': article['generatedClass'],
        'author': article['author']
    }

    header = teiHeader(article)
    body = dom('body', {}, contents(article))
    text = dom('text', textAttributes, [body])
    return dom('TEI', {}, [header, text])
if __name__ == '__main__':
    # Reads file paths from standard input (one per line) and prints one
    # serialized TEI document per path whose base name looks like
    # "T<digits>article_<digits>.xml"; other paths are skipped silently.
    #
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # The dot before "xml" is escaped so it matches a literal '.' only.
    # Compiled once here instead of being looked up for every path.
    articleName = re.compile(r'T(\d+)article_(\d+)\.xml')

    for filepath in stdin.read().splitlines():
        m = articleName.match(basename(filepath))
        if m is None:
            continue
        # The two captured numbers form the document id, e.g. "3-42".
        docId = '-'.join(m.groups())
        # Binary mode lets the XML parser pick the encoding from the
        # document itself rather than decoding with the locale default.
        with open(filepath, 'rb') as f:
            inputArticle = etree.parse(f)
        document = buildDocument(docId, analyze(inputArticle))
        # tostring(..., encoding='unicode') is the supported replacement
        # for the deprecated etree.tounicode().
        print(etree.tostring(document, encoding='unicode'))