# data_preparation.py

from os import path
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile


def basename_without_ext(path):
    """Return the file name of *path* stripped of its extension(s).

    The directory part and the last extension are always removed. If what
    remains still ends in ``.tei`` (as with ``article.tei.xml``), that
    trailing marker is removed as well.
    """
    tei_marker = '.tei'
    name_only, _ = splitext(basename(path))
    # A double extension such as ``.tei.xml`` leaves ``.tei`` behind after
    # the first split, so it is trimmed off here.
    return name_only[:-len(tei_marker)] if name_only.endswith(tei_marker) else name_only


def tei_to_csv_entry(tei_file, txt_file):
    """Parse one TEI file and return its fields as a flat row tuple.

    Args:
        tei_file: Path of the TEI document; passed to ``TEIFile`` and used
            to derive the row's base name.
        txt_file: Second argument forwarded unchanged to ``TEIFile``.

    Returns:
        A 10-tuple: the extension-less base name of ``tei_file`` followed by
        the ``_text``, ``_Head``, ``_author``, ``_Objecttype``, ``_Class``,
        ``_normclass``, ``_generatedclass``, ``_englishclass`` and
        ``_attribution`` attributes of the parsed ``TEIFile``, in that order.
    """
    print(f"Going on {tei_file}")
    parsed = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")

    # Attribute names listed in the order the values appear in the row.
    attribute_names = (
        '_text',
        '_Head',
        '_author',
        '_Objecttype',
        '_Class',
        '_normclass',
        '_generatedclass',
        '_englishclass',
        '_attribution',
    )
    article_name = basename_without_ext(tei_file)
    return (article_name, *(getattr(parsed, name) for name in attribute_names))


# Root folder holding one sub-folder per tome, each containing TEI articles.
input_path = r'./data/EDdA/'
# CSV file written in the current working directory.
output_name = "corpus_tei.csv"

column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end: enlarging a DataFrame row by row through ``df.loc`` copies the
# data on every insertion, and it needed a running offset that broke on
# empty tome folders.
rows = []

for tome in os.listdir(input_path):
    tome_dir = os.path.join(input_path, tome)
    if not os.path.isdir(tome_dir):
        # Stray files next to the tome folders cannot be listed; skip them
        # instead of crashing in the inner os.listdir call.
        continue

    # The volume identifier is the folder name without its first character.
    volume = tome[1:]

    for article in os.listdir(tome_dir):
        filepath = os.path.join(tome_dir, article)
        base_name = basename_without_ext(filepath)

        # The first element returned by tei_to_csv_entry is the bare base
        # name; the CSV stores it prefixed with the volume instead. Building
        # the final row here avoids the chained assignment
        # ``df.loc[i]['articleName'] = ...``, which writes to a temporary
        # Series and is not guaranteed to reach the DataFrame.
        _, *fields = tei_to_csv_entry(filepath, ' ')
        rows.append([volume + '_' + base_name, *fields])

# dtype=object keeps every value exactly as returned by the parser.
df = pd.DataFrame(rows, columns=column_names, dtype=object)

df.to_csv(output_name, index=False)