Newer
Older
#!/usr/bin/env python3
import csv
from bs4 import BeautifulSoup
import os
import sys
# Column names of the CSV written to stdout, in output order.
header = [
    "T",
    "article",
    "head",
    "domain",
]
def getAttribute(article, attribute):
    """Return the ``value`` of the first element whose ``type`` is *attribute*.

    Args:
        article: Parsed document exposing ``find(**filters)``; ``main``
            passes a ``BeautifulSoup`` tree.
        attribute: String matched against each element's ``type`` attribute.

    Returns:
        The matching element's ``value`` attribute, or ``''`` when no element
        matches or the matching element carries no ``value`` attribute.
    """
    result = article.find(type=attribute)
    if result is None:
        return ''
    # Default to '' so a match without a ``value`` does not leak None to the
    # caller; both "missing" cases now yield the same empty string.
    return result.get('value', '')
def _article_ranks(path):
    """Return the sorted ranks N of the ``article<N>.tei`` files in *path*."""
    prefix, suffix = "article", ".tei"
    ranks = []
    for name in os.listdir(path):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        number = name[len(prefix):-len(suffix)]
        # Keep only names that round-trip exactly (no leading zeros or exotic
        # digits), so the path rebuilt from the rank is the file that exists.
        if number.isdecimal() and f"{prefix}{int(number)}{suffix}" == name:
            ranks.append(int(number))
    return sorted(ranks)


def main(rootDirectory):
    """Write one CSV row per article found under *rootDirectory* to stdout.

    The tree is expected to contain the sub-directories ``T1`` to ``T17``,
    each holding files named ``article<rank>.tei``. After the ``header`` row,
    every article yields ``[t, rank, head, domain]`` where ``head`` is the
    value of the element typed ``"head"`` and ``domain`` is the value of the
    element typed ``"normclass"``, replaced by the ``"generatedclass"`` value
    when ``normclass`` is the literal ``'unclassified'``.

    Args:
        rootDirectory: Path of the directory containing the ``T<n>`` folders.

    Raises:
        OSError: If a ``T<n>`` directory or an article file cannot be read.
    """
    output = csv.writer(sys.stdout, lineterminator='\n')
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        # Enumerate the article files that actually exist rather than
        # counting directory entries, so an unrelated file in the folder
        # does not trigger a lookup for an article that is not there.
        for rank in _article_ranks(path):
            # Close each file as soon as it has been parsed; the loop may
            # otherwise keep one open handle per article.
            with open(f"{path}/article{rank}.tei") as tei_file:
                # Name the parser explicitly: without it bs4 picks whichever
                # parser is installed, which varies between machines and
                # emits a warning. "html.parser" ships with Python.
                root = BeautifulSoup(tei_file, "html.parser")
            normclass = getAttribute(root, "normclass")
            if normclass != 'unclassified':
                domain = normclass
            else:
                domain = getAttribute(root, "generatedclass")
            output.writerow([t, rank, getAttribute(root, "head"), domain])
if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback when the
    # root directory argument is missing. Extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} ROOT_DIRECTORY")
    main(sys.argv[1])