From 6b612e5e39a5415a45476a9e61a78b02f80ae715 Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Fri, 25 Nov 2022 13:22:22 +0100 Subject: [PATCH] Update Predict_LGE.py --- scripts/Predict_LGE.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/scripts/Predict_LGE.py b/scripts/Predict_LGE.py index a20544d..fabc4cf 100644 --- a/scripts/Predict_LGE.py +++ b/scripts/Predict_LGE.py @@ -111,16 +111,18 @@ def text_folder_to_dataframe(path): # id,tome,filename,nb_words,content,domain for tome in sorted(os.listdir(path)): - for article in sorted(os.listdir(path + "/" + tome)): - filename = article[:-4] - id = tome + filename - - if article[-4:] == ".txt": - with open(path + "/" + tome + "/" + article) as f: - content = f.read() - - data.append([id, tome, filename, content, len(content.split(' '))]) - + try: + for article in sorted(os.listdir(path + "/" + tome)): + filename = article[:-4] + id = tome + filename + + if article[-4:] == ".txt": + with open(path + "/" + tome + "/" + article) as f: + content = f.read() + + data.append([id, tome, filename, content, len(content.split(' '))]) + except NotADirectoryError: + pass return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words']) @@ -148,10 +150,10 @@ if __name__ == '__main__': ## Load data print("* Load data") - path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/" - + input_path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text" + path = "../" - df_LGE = text_folder_to_dataframe(path) + df_LGE = text_folder_to_dataframe(input_path) #df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t") data_LGE = df_LGE["content"].values -- GitLab