Skip to content
Snippets Groups Projects
EDdA-metadata.py 899 B
Newer Older
#!/usr/bin/env python3

import csv
from bs4 import BeautifulSoup
import os
import sys

# Column titles of the CSV written to stdout, in the same order as the
# values of each row emitted by main().
header = [
    "T",
    "article",
    "head",
    "domain",
]

def getAttribute(article, attribute):
    """Return the 'value' of the first match of `article.find(type=attribute)`.

    `article` is searched with `find(type=attribute)`. When that search
    yields a falsy result (e.g. nothing was found), the empty string is
    returned; otherwise the result's `get('value')` is returned as-is.
    """
    match = article.find(type=attribute)
    if not match:
        return ''
    return match.get('value')

def main(rootDirectory):
    """Write one CSV row of metadata per article found under `rootDirectory`.

    `rootDirectory` must contain sub-directories `T1` to `T17`, each holding
    files named `article1.tei`, `article2.tei`, ... The number of articles
    in a sub-directory is taken to be the number of entries listed in it, so
    any additional file there makes the loop look for an article file that
    does not exist.

    Output goes to standard output: first the `header` row, then for every
    article a row with the sub-directory number, the article rank, the
    article's "head" value and its domain. The domain is the "normclass"
    value, except when that value is 'unclassified', in which case the
    "generatedclass" value is used instead.

    Raises:
        OSError: if a `T<n>` directory or an expected article file cannot
            be listed or opened.
    """
    output = csv.writer(sys.stdout, lineterminator='\n')
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        for rank in range(1, len(os.listdir(path)) + 1):
            # Parse inside a `with` block so each file handle is closed
            # right away instead of lingering until garbage collection.
            # NOTE(review): no parser is passed to BeautifulSoup, so the
            # one it picks depends on what is installed -- confirm whether
            # a specific parser should be pinned.
            with open(f"{path}/article{rank}.tei") as articleFile:
                root = BeautifulSoup(articleFile)
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                normclass if normclass != 'unclassified' else getAttribute(root, "generatedclass")
                ])

if __name__ == '__main__':
    # Exit with a usage message (written to stderr, non-zero status) rather
    # than an IndexError traceback when the root directory is not given.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} ROOT_DIRECTORY")
    main(sys.argv[1])