"""Flatten a directory tree of TEI files into a single CSV file.

Every sub-directory of ``input_path`` is treated as one volume ("tome");
every file inside it is parsed with ``TEIFile`` and written as one CSV row.
"""
from os import path
from os.path import basename, splitext
import os

import pandas as pd

from data_process.TEIFile import TEIFile


def basename_without_ext(path) -> str:
    """Return the file name of *path* without directory or extension.

    The last extension is removed with ``splitext``; if what remains still
    ends in ``.tei`` that suffix is removed as well, so both
    ``dir/name.xml`` and ``dir/name.tei.xml`` yield ``name``.
    """
    stem, _ext = splitext(basename(path))
    if stem.endswith('.tei'):
        # Double extension such as "name.tei.xml": strip the inner ".tei".
        return stem[:-len('.tei')]
    return stem


def tei_to_csv_entry(tei_file, txt_file):
    """Parse one TEI file and return its fields as a CSV row tuple.

    Args:
        tei_file: Path of the TEI file to parse.
        txt_file: Second argument forwarded unchanged to ``TEIFile``.

    Returns:
        A 10-tuple: the base name of *tei_file* followed by the ``_text``,
        ``_Head``, ``_author``, ``_Objecttype``, ``_Class``, ``_normclass``,
        ``_generatedclass``, ``_englishclass`` and ``_attribution``
        attributes of the parsed ``TEIFile`` (same order as
        ``column_names`` below).
    """
    print(f"Going on {tei_file}")
    tei = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return (
        base_name,
        tei._text,
        tei._Head,
        tei._author,
        tei._Objecttype,
        tei._Class,
        tei._normclass,
        tei._generatedclass,
        tei._englishclass,
        tei._attribution,
    )


input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class",
                "normclass", "generatedclass", "englishclass", "attribution"]

# Rows are collected in a plain list and the DataFrame is built once at the
# end: growing a DataFrame row by row is quadratic, and writing a cell through
# chained indexing (``df.loc[i]['col'] = ...``) may update a temporary copy
# instead of the frame itself.
rows = []
for tome in os.listdir(input_path):
    tome_dir = os.path.join(input_path, tome)
    if not os.path.isdir(tome_dir):
        # Stray files in the corpus root are not volumes; skip them.
        continue
    # The volume identifier is the directory name minus its first character
    # (presumably a one-letter prefix such as "T" -- TODO confirm).
    volume = tome[1:]
    for article in os.listdir(tome_dir):
        filepath = os.path.join(tome_dir, article)
        base_name = basename_without_ext(filepath)
        entry = tei_to_csv_entry(filepath, ' ')
        # Replace the bare base name with "<volume>_<base name>" so that
        # article names stay distinct across volumes.
        rows.append((volume + '_' + base_name,) + tuple(entry[1:]))

df = pd.DataFrame(rows, columns=column_names)
df.to_csv(output_name, index=False)