From 6b612e5e39a5415a45476a9e61a78b02f80ae715 Mon Sep 17 00:00:00 2001
From: Ludovic Moncla <moncla.ludovic@gmail.com>
Date: Fri, 25 Nov 2022 13:22:22 +0100
Subject: [PATCH] Predict_LGE.py: skip non-directory corpus entries, add input_path

---
 scripts/Predict_LGE.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/scripts/Predict_LGE.py b/scripts/Predict_LGE.py
index a20544d..fabc4cf 100644
--- a/scripts/Predict_LGE.py
+++ b/scripts/Predict_LGE.py
@@ -111,16 +111,18 @@ def text_folder_to_dataframe(path):
   # id,tome,filename,nb_words,content,domain
 
   for tome in sorted(os.listdir(path)):
-    for article in sorted(os.listdir(path + "/" + tome)):
-        filename = article[:-4]
-        id = tome + filename
-
-        if article[-4:] == ".txt":
-            with open(path + "/" + tome + "/" + article) as f:
-                content = f.read()
-
-                data.append([id, tome, filename, content, len(content.split(' '))])
-       
+    try:
+        for article in sorted(os.listdir(path + "/" + tome)):
+            filename = article[:-4]
+            id = tome + filename
+
+            if article[-4:] == ".txt":
+                with open(path + "/" + tome + "/" + article) as f:
+                    content = f.read()
+
+                    data.append([id, tome, filename, content, len(content.split(' '))])
+    except NotADirectoryError:
+        pass
   return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
 
 
@@ -148,10 +150,10 @@ if __name__ == '__main__':
   ## Load data
   print("* Load data")
 
-  path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/"
-
+  input_path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text"
+  path = "../"
 
-  df_LGE = text_folder_to_dataframe(path)
+  df_LGE = text_folder_to_dataframe(input_path)
   #df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
   data_LGE = df_LGE["content"].values
 
-- 
GitLab