#!/usr/bin/env python3
"""Write a tab-separated table of article metadata to standard output.

The script walks the directories ``T1`` .. ``T17`` under a root directory,
parses every ``article<rank>.tei`` file found there and emits one row per
article with the columns listed in ``header``.

Usage: script ROOT_DIRECTORY CORRESPONDANCES_CSV
"""

import csv
import os
import sys

import pandas
from bs4 import BeautifulSoup

from EDdA import ENCCRE

header = ["T", "article", "head", "author", "ARTFL_domain", "ENCCRE_domain"]


def getAttribute(article, attribute):
    """Return the ``value`` attribute of the element typed *attribute*.

    Looks up the first element of *article* whose ``type`` attribute equals
    *attribute*. Returns an empty string when no such element exists; returns
    whatever ``.get('value')`` yields (possibly ``None``) otherwise.
    """
    result = article.find(type=attribute)
    return result.get('value') if result else ''


def idARTFLToENCCRE(correspondances, tome, article):
    """Return the ``entreeid`` matching (*tome*, *article*), or ``None``.

    *correspondances* is indexed by its ``tome``, ``article`` and ``entreeid``
    columns. The first matching row wins.
    """
    try:
        row = correspondances.loc[
            (correspondances['tome'] == tome)
            & (correspondances['article'] == article)
        ].reset_index(drop=True)
        # After reset_index the first match has label 0; when nothing matched
        # that label is absent and the lookup raises KeyError.
        # NOTE(review): a missing column raises KeyError too and is therefore
        # also reported as "no match" — confirm this is intended.
        return row['entreeid'][0]
    except KeyError:
        return None


# Queried once at import time; getENCCREDomain indexes it by domain key and
# reads the 'dgrid' entry of each item.
ENCCREDomains = ENCCRE.query('domaines')


def getENCCREDomain(enccreId):
    """Return the ' | '-joined domain labels of an ENCCRE article.

    Returns ``None`` when *enccreId* is falsy or when the article's
    annotations carry no ``domgen`` entry in their first ``constit`` item.
    Labels are de-duplicated and sorted so that the output is identical from
    one run to the next (iteration order of a set of strings is not).
    """
    if not enccreId:
        return None
    article = ENCCRE.query(f'article/{enccreId}')
    annotation = article['annotations']
    if 'constit' in annotation and 'domgen' in annotation['constit'][0]:
        domains = annotation['constit'][0]['domgen']
        labels = {ENCCREDomains[d]['dgrid'] for d in domains}
        return ' | '.join(sorted(labels))
    return None


def main(rootDirectory, correspondances):
    """Write one TSV row per article under *rootDirectory* to stdout.

    *rootDirectory* must contain the sub-directories ``T1`` .. ``T17``; each
    one is expected to hold files named ``article1.tei`` .. ``articleN.tei``
    where N is the number of entries in that directory. *correspondances* is
    the table handed to ``idARTFLToENCCRE``.
    """
    output = csv.writer(sys.stdout, lineterminator='\n', delimiter='\t')
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        # The article count is taken from the number of directory entries.
        for rank in range(1, len(os.listdir(path)) + 1):
            # The markup is read inside the BeautifulSoup constructor, so the
            # file can be closed as soon as the tree is built.
            # NOTE(review): no parser is named, so BeautifulSoup picks one
            # from those installed — consider pinning it.
            with open(f"{path}/article{rank}.tei") as teiFile:
                root = BeautifulSoup(teiFile)
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                getAttribute(root, "author"),
                normclass,
                getENCCREDomain(idARTFLToENCCRE(correspondances, t, rank)),
            ])


if __name__ == '__main__':
    main(sys.argv[1], pandas.read_csv(sys.argv[2]))