Update tmp_preprocess_data.py

21f1d59e · Ludovic Moncla · a5ab7d3b · 21f1d59e
Commit 21f1d59e authored 4 years ago by Ludovic Moncla
--- a/tmp_preprocess_data.py
+++ b/tmp_preprocess_data.py
@@ -21,38 +21,31 @@ import nltk
 from ClassPreprocessor import create_dict


-def removeMarkers(df, textColumn, listOfMarkers):
-
-    #remove null values or add condition if exist
-    #self.remove_null_rows(df, markerColumn)
-    #self.remove_null_rows(df, textColumn)
-    tmp = 0
-    for index, row in df.iterrows():
-        tmp += 1
-        print(tmp)
-        if not pd.isna(row[textColumn]):
-            for m in listOfMarkers:
-
-                marker = str(m)
-                marker_with_brcts = '('+ marker +')'
-                row[textColumn] = row[textColumn].replace(marker_with_brcts , "")
-                row[textColumn] = row[textColumn].replace(marker , "")
-                full_text = row[textColumn]
-                i = unidecode(full_text).find(marker_with_brcts)
-                goOn = False
-                if i != -1:
-                    goOn = True
-                while goOn:
-
-                    full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):]))
-                    i = unidecode(full_text).find(marker_with_brcts)
-                    if i == -1:
-                        goOn = False
-

-                row[textColumn] = full_text
+def removeMarkers(full_text, listOfMarkers):
+    """Return full_text with each marker of listOfMarkers removed, bare or in parentheses.
+
+    A missing value (pd.isna) is returned unchanged. Parenthesised markers are also
+    matched on the unidecode()-transliterated text and cut out of the original text.
+    """
+    if pd.isna(full_text):
+        return full_text
+    for m in listOfMarkers:
+        marker = str(m)
+        marker_with_brcts = '(' + marker + ')'
+        full_text = full_text.replace(marker_with_brcts, "").replace(marker, "")
+        # Cheap whole-string test first; the offset map is only built on a hit.
+        while marker_with_brcts in unidecode(full_text):
+            pieces = [unidecode(ch) for ch in full_text]
+            # origin[k]: index in full_text of the char that produced transliterated char k (lengths can differ).
+            origin = [pos for pos, piece in enumerate(pieces) for _ in piece]
+            i = "".join(pieces).find(marker_with_brcts)
+            last = origin[i + len(marker_with_brcts) - 1]
+            full_text = full_text[:origin[i]] + full_text[last + 1:]
+    return full_text

-    return df
+## Wrap removeMarkers() with np.vectorize so it can be applied element-wise to a dataframe column (a convenience loop, not a speed-up); positional argument 1 (listOfMarkers) is excluded from broadcasting so every call receives the whole list, and otypes=[object] keeps the results as Python strings
+vec_removeMarkers = np.vectorize(removeMarkers, excluded={1}, otypes=[object])



@@ -65,8 +58,8 @@ listOfM = df['class'].unique()
 df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
 preprocessor.remove_null_rows(df_original, 'content')

-df_original = removeMarkers(df_original, 'content', listOfM)
-
+#df_original = removeMarkers(df_original, 'content', listOfM)
+df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)


 df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
@@ -74,11 +67,11 @@ df_2 = df_original[['domaine_enccre','content']].copy()
 df_3 = df_original[['normClass_artfl','content']].copy()

 ############ shall we remove articles with less n tokens ####### remove markers
-preprocessor.remove_null_rows(df_1, 'content')
+preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
 preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-preprocessor.remove_null_rows(df_2, 'content')
+preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
 preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-preprocessor.remove_null_rows(df_3, 'content')
+preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
 preprocessor.remove_null_rows(df_3, 'normClass_artfl')

 df_1 = split_class(df_1, 'ensemble_domaine_enccre')