# Ludovic Moncla authored 0f45f5be
# data_preparation.py 1.38 KiB
from os import path
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile
def basename_without_ext(path):
    """Return the file name of *path* stripped of its extension.

    The last extension is removed first. If what remains still ends with
    ``.tei`` (as in ``article.tei.xml``), that suffix is removed as well.
    """
    name_without_ext = splitext(basename(path))[0]
    # A double extension such as ".tei.xml" leaves a trailing ".tei" behind.
    has_tei_suffix = name_without_ext.endswith('.tei')
    return name_without_ext[:-4] if has_tei_suffix else name_without_ext
def tei_to_csv_entry(tei_file, txt_file):
    """Parse one TEI file and return its fields as a row tuple.

    Args:
        tei_file: Path of the TEI file; passed to ``TEIFile`` and also used
            to derive the first element of the returned tuple.
        txt_file: Second argument forwarded unchanged to ``TEIFile``.

    Returns:
        A 10-tuple: the extension-less base name of ``tei_file`` followed by
        the ``_text``, ``_Head``, ``_author``, ``_Objecttype``, ``_Class``,
        ``_normclass``, ``_generatedclass``, ``_englishclass`` and
        ``_attribution`` attributes of the parsed ``TEIFile``.
    """
    print(f"Going on {tei_file}")
    parsed = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    article_name = basename_without_ext(tei_file)
    # Attribute names are listed in the order the row's fields are emitted.
    field_names = (
        "_text",
        "_Head",
        "_author",
        "_Objecttype",
        "_Class",
        "_normclass",
        "_generatedclass",
        "_englishclass",
        "_attribution",
    )
    return (article_name, *(getattr(parsed, field) for field in field_names))
# Root directory expected to hold one sub-directory per volume ("tome"),
# each containing the TEI article files to convert.
input_path = r'./data/EDdA/'
# CSV file written to the current working directory.
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

# Rows are collected in a plain list and the DataFrame is built once at the
# end. This avoids quadratic row-by-row growth via ``df.loc[...] = ...``,
# removes the manual row-offset bookkeeping (which broke on an empty volume
# directory), and avoids the chained assignment
# ``df.loc[i]['articleName'] = ...`` that may write to a temporary copy.
rows = []
# os.listdir returns entries in arbitrary order; sort so that the CSV row
# order is reproducible from one run to the next.
for tome in sorted(os.listdir(input_path)):
    tome_path = os.path.join(input_path, tome)
    if not os.path.isdir(tome_path):
        # Stray files at the root (e.g. OS metadata) are not volumes.
        continue
    # Drop the first character of the directory name.
    # NOTE(review): presumably a one-letter prefix such as "T" - confirm.
    volume = tome[1:]
    for article in sorted(os.listdir(tome_path)):
        filepath = os.path.join(tome_path, article)
        # The first element of the entry is the extension-less base name;
        # prefix it with the volume to form the article name.
        base_name, *fields = tei_to_csv_entry(filepath, ' ')
        rows.append([volume + '_' + base_name, *fields])

# dtype=object keeps values as extracted, matching a frame filled row by row.
df = pd.DataFrame(rows, columns=column_names, dtype=object)
df.to_csv(output_name, index=False)