Skip entries of the corpus folder that are not directories, and read the corpus
from a dedicated input_path variable instead of path.

NOTE(review): this patch was recovered from a copy in which newlines and
indentation had been collapsed. Indentation and the placement of blank lines are
reconstructed from Python syntax and the hunk line counts -- confirm against the
repository before applying.

diff --git a/scripts/Predict_LGE.py b/scripts/Predict_LGE.py
index a20544d9b24ae18cb41a11dcb4edea5cd4509102..fabc4cfe10abf397255f9bea0d4c5c6ce9f85a75 100644
--- a/scripts/Predict_LGE.py
+++ b/scripts/Predict_LGE.py
@@ -111,16 +111,18 @@ def text_folder_to_dataframe(path):
     # id,tome,filename,nb_words,content,domain
 
     for tome in sorted(os.listdir(path)):
-        for article in sorted(os.listdir(path + "/" + tome)):
-            filename = article[:-4]
-            id = tome + filename
-
-            if article[-4:] == ".txt":
-                with open(path + "/" + tome + "/" + article) as f:
-                    content = f.read()
-
-                data.append([id, tome, filename, content, len(content.split(' '))])
-
+        try:
+            for article in sorted(os.listdir(path + "/" + tome)):
+                filename = article[:-4]
+                id = tome + filename
+
+                if article[-4:] == ".txt":
+                    with open(path + "/" + tome + "/" + article) as f:
+                        content = f.read()
+
+                    data.append([id, tome, filename, content, len(content.split(' '))])
+        except NotADirectoryError:
+            pass
     return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
 
 
@@ -148,10 +150,10 @@ if __name__ == '__main__':
 
     ## Load data
     print("* Load data")
-    path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/"
-
+    input_path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text"
+    path = "../"
 
-    df_LGE = text_folder_to_dataframe(path)
+    df_LGE = text_folder_to_dataframe(input_path)
     #df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
 
     data_LGE = df_LGE["content"].values