# EDdA-metadata.py

#!/usr/bin/env python3

import csv
from bs4 import BeautifulSoup
from EDdA import ENCCRE
import pandas
import os
import sys

# Column names of the tab-separated table written to standard output, listed
# in the same order as the fields of each article row.
header = [
    "T",
    "article",
    "head",
    "author",
    "ARTFL_domain",
    "ENCCRE_domain",
]

def getAttribute(article, attribute):
    """Return the ``value`` of the element of *article* typed *attribute*.

    The element is looked up with ``article.find(type=attribute)``. When the
    lookup yields nothing, an empty string is returned; otherwise the result
    of ``get('value')`` on the element found is returned as is.
    """
    match = article.find(type=attribute)
    if not match:
        return ''
    return match.get('value')

def idARTFLToENCCRE(correspondances, tome, article):
    """Return the ENCCRE entry id recorded for an ARTFL (tome, article) pair.

    Args:
        correspondances: pandas DataFrame with the columns ``tome``,
            ``article`` and ``entreeid``.
        tome: value compared for equality with the ``tome`` column.
        article: value compared for equality with the ``article`` column.

    Returns:
        The ``entreeid`` of the first matching row, or ``None`` when no row
        matches, when the matching cell is empty (NaN/None), or when one of
        the expected columns is missing from the table.
    """
    try:
        matches = correspondances.loc[
            (correspondances['tome'] == tome)
            & (correspondances['article'] == article),
            'entreeid',
        ]
    except KeyError:
        # A missing column is reported as "no correspondence", not an error.
        return None
    if matches.empty:
        return None
    # Positional access: the first match regardless of the table's index.
    entryId = matches.iloc[0]
    # An empty cell comes back as NaN, which is truthy and would otherwise be
    # mistaken for a real identifier by callers testing `if enccreId:`.
    return None if pandas.isna(entryId) else entryId

# Result of the ENCCRE 'domaines' query, fetched once when the module is
# loaded. getENCCREDomain() indexes it with the domain keys found in an
# article's annotations and reads the 'dgrid' field of each entry.
# NOTE(review): presumably a mapping from domain key to its description —
# confirm against EDdA.ENCCRE.
ENCCREDomains = ENCCRE.query('domaines')

def getENCCREDomain(enccreId):
    """Return the ENCCRE domain labels of an article as one ' | '-joined string.

    The article is fetched with ``ENCCRE.query('article/<enccreId>')``; the
    domain keys listed under ``annotations -> constit[0] -> domgen`` are
    translated through ``ENCCREDomains[key]['dgrid']`` and de-duplicated.

    Args:
        enccreId: ENCCRE entry identifier; any falsy value (``None``, ``''``)
            means "no known entry".

    Returns:
        The distinct labels sorted alphabetically and joined with ``' | '``,
        or ``None`` when there is no id or the article carries no
        ``constit``/``domgen`` annotation.
    """
    if not enccreId:
        return None
    article = ENCCRE.query(f'article/{enccreId}')
    annotation = article['annotations']
    # An absent or empty 'constit' list both mean "no domain information".
    if 'constit' not in annotation or not annotation['constit']:
        return None
    firstConstit = annotation['constit'][0]
    if 'domgen' not in firstConstit:
        return None
    labels = {ENCCREDomains[d]['dgrid'] for d in firstConstit['domgen']}
    # Sorting makes the output reproducible: iterating a set of strings
    # yields an order that can change from one interpreter run to the next.
    return ' | '.join(sorted(labels))

def main(rootDirectory, correspondances):
    """Write one tab-separated metadata row per article to standard output.

    The header row is written first. Then, for each tome 1 to 17, every file
    ``<rootDirectory>/T<tome>/article<rank>.tei`` is parsed and a row is
    emitted with the tome, the rank, the ``head``, ``author`` and
    ``normclass`` values of the article, and its ENCCRE domain(s).

    Args:
        rootDirectory: directory containing the ``T1`` ... ``T17``
            sub-directories.
        correspondances: table passed to ``idARTFLToENCCRE`` to map a
            (tome, rank) pair to an ENCCRE entry id.
    """
    output = csv.writer(sys.stdout, lineterminator='\n', delimiter='\t')
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        # The number of directory entries is used as the number of articles:
        # files are expected to be named article1.tei ... article<N>.tei with
        # nothing else in the directory.
        for rank in range(1, len(os.listdir(path)) + 1):
            # Close each file as soon as it has been parsed instead of
            # leaving one open handle per article to the garbage collector.
            # NOTE(review): the file encoding and the BeautifulSoup parser
            # are left to their defaults, as before — confirm they suit the
            # TEI files.
            with open(f"{path}/article{rank}.tei") as teiFile:
                root = BeautifulSoup(teiFile)
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                getAttribute(root, "author"),
                normclass,
                getENCCREDomain(idARTFLToENCCRE(correspondances, t, rank))
                ])

if __name__ == '__main__':
    # Command line: <root directory of the tomes> <correspondence CSV file>.
    tomesRoot = sys.argv[1]
    correspondenceTable = pandas.read_csv(sys.argv[2])
    main(tomesRoot, correspondenceTable)