Newer
Older
#!/usr/bin/env python3
import csv
import os
import sys

from bs4 import BeautifulSoup
import pandas

from EDdA import ENCCRE
# Column names of the TSV header row written by main(), one per output field.
header = [
    "T",
    "article",
    "head",
    "author",
    "ARTFL_domain",
    "ENCCRE_domain",
]
def getAttribute(article, attribute):
    """Return the `value` of the element of `article` whose `type` is `attribute`.

    `article` must expose a `find(type=...)` method (main() passes a
    BeautifulSoup document). The empty string is returned when `find`
    yields nothing; when an element is found but carries no `value`
    entry, the result of its `.get('value')` is returned unchanged.
    """
    match = article.find(type=attribute)
    if not match:
        return ''
    return match.get('value')
def idARTFLToENCCRE(correspondances, tome, article):
    """Translate an ARTFL (tome, article) pair into its ENCCRE entry id.

    `correspondances` is a pandas DataFrame with `tome`, `article` and
    `entreeid` columns. The `entreeid` of the first row matching both
    `tome` and `article` is returned.

    Returns None whenever the lookup raises KeyError: no row matches
    (label 0 is absent from the re-indexed, empty selection) or one of
    the expected columns is missing.
    """
    try:
        sameTome = correspondances['tome'] == tome
        sameArticle = correspondances['article'] == article
        matches = correspondances.loc[sameTome & sameArticle]
        # Renumber from 0 so the first match can be fetched by label 0.
        entryIds = matches.reset_index(drop=True)['entreeid']
        return entryIds[0]
    except KeyError:
        return None
# Result of the ENCCRE 'domaines' query, evaluated once when the module is
# loaded. getENCCREDomain() indexes it by domain id and reads the 'dgrid'
# field of each entry.
ENCCREDomains = ENCCRE.query('domaines')
def getENCCREDomain(enccreId):
    """Return the ' | '-joined domain labels of an ENCCRE article.

    The article is fetched with ENCCRE.query('article/<id>'); the domain
    ids are read from `annotations['constit'][0]['domgen']` and each one
    is mapped to the 'dgrid' field of its ENCCREDomains entry.

    Duplicate labels are dropped while keeping first-seen order, so the
    output is identical from one run to the next (joining a set, as done
    previously, yields an order that depends on string hash
    randomization).

    Returns None when `enccreId` is falsy, or when the article has no
    'constit' annotation, an empty one, or no 'domgen' in its first
    'constit' item.
    """
    if not enccreId:
        return None
    article = ENCCRE.query('article/%s' % enccreId)
    annotation = article['annotations']
    # An empty 'constit' list would make the [0] access below fail.
    if 'constit' not in annotation or not annotation['constit']:
        return None
    firstConstit = annotation['constit'][0]
    if 'domgen' not in firstConstit:
        return None
    labels = (ENCCREDomains[d]['dgrid'] for d in firstConstit['domgen'])
    # dict.fromkeys de-duplicates like set() but preserves insertion order.
    return ' | '.join(dict.fromkeys(labels))
def main(rootDirectory, correspondances):
    """Write one tab-separated metadata row per article to standard output.

    A header row (the module-level `header`) is written first. Then, for
    each tome number t from 1 to 17, every file
    `<rootDirectory>/T<t>/article<rank>.tei` is parsed, with rank running
    from 1 to the number of entries in that directory; the directory is
    therefore expected to contain exactly article1.tei .. articleN.tei.

    Each row holds: tome, rank, the `head`, `author` and `normclass`
    values extracted by getAttribute(), and the ENCCRE domain obtained
    through idARTFLToENCCRE() and getENCCREDomain().

    `correspondances` is the DataFrame handed to idARTFLToENCCRE().
    """
    output = csv.writer(sys.stdout, lineterminator='\n', delimiter='\t')
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        for rank in range(1, len(os.listdir(path)) + 1):
            # The document is fully read by the BeautifulSoup constructor,
            # so the handle can be closed right away instead of leaking one
            # open file per article.
            # NOTE(review): no parser is named, so BeautifulSoup picks
            # whichever one is installed — confirm this is intended.
            with open(f"{path}/article{rank}.tei") as teiFile:
                root = BeautifulSoup(teiFile)
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                getAttribute(root, "author"),
                normclass,
                getENCCREDomain(idARTFLToENCCRE(correspondances, t, rank)),
            ])
if __name__ == '__main__':
    # Usage: <script> ROOT_DIRECTORY CORRESPONDANCES_CSV
    main(
        rootDirectory=sys.argv[1],
        correspondances=pandas.read_csv(sys.argv[2]),
    )