def _cut_transliterated(text, needle):
    """Remove every span of *text* whose ASCII transliteration equals *needle*.

    The search runs on the transliterated form of *text*, but the cut is
    applied to the original characters that produced the match. Slicing the
    original string with an offset taken from the transliterated string is
    only correct when every character transliterates to exactly one ASCII
    character; otherwise the wrong characters are removed, and the loop may
    never terminate when the offset falls past the end of *text*.

    Args:
        text: The string to clean.
        needle: Non-empty ASCII-comparable pattern to remove.

    Returns:
        *text* with every matching span removed.
    """
    while True:
        if text.isascii():
            # Pure-ASCII text is its own transliteration: offsets coincide.
            start = text.find(needle)
            if start == -1:
                return text
            stop = start + len(needle)
        else:
            # Cheap whole-string check first; the per-character map below is
            # only built when there is actually something to remove.
            if needle not in unidecode(text):
                return text
            pieces = [unidecode(char) for char in text]
            hit = "".join(pieces).find(needle)
            if hit == -1:
                return text
            # owners[p] is the index in `text` of the character that produced
            # ASCII position p.
            owners = [idx for idx, piece in enumerate(pieces) for _ in piece]
            start = owners[hit]
            stop = owners[hit + len(needle) - 1] + 1
        # At least one character is removed per iteration, so this terminates.
        text = text[:start] + text[stop:]


def removeMarkers(full_text, listOfMarkers):
    """Strip class markers from a single text.

    For each marker, the bracketed form ``(marker)`` and then the bare marker
    are removed by exact match. Afterwards, any remaining ``(marker)`` that
    only shows up once the text is transliterated to ASCII is removed too.

    Args:
        full_text: The text to clean. Null values (as judged by ``pd.isna``)
            are returned unchanged.
        listOfMarkers: Iterable of markers; each one is converted with
            ``str()``. Null markers are skipped.

    Returns:
        The cleaned text, or *full_text* itself when it is null.
    """
    if pd.isna(full_text):
        return full_text

    for m in listOfMarkers:
        if pd.isna(m):
            # str() would turn a null class into the literal "nan", which
            # would then be stripped out of unrelated words ("financier").
            continue
        marker = str(m)
        marker_with_brcts = '(' + marker + ')'
        full_text = full_text.replace(marker_with_brcts, "")
        full_text = full_text.replace(marker, "")
        # NOTE(review): the marker itself is not transliterated, so a marker
        # containing accents presumably never matches here - confirm intent.
        full_text = _cut_transliterated(full_text, marker_with_brcts)

    return full_text


# Element-wise wrapper so removeMarkers() can be applied to a whole column.
# The marker list is excluded from broadcasting (it must reach every call
# unchanged), and otypes is fixed so nulls and empty inputs are handled.
# np.vectorize is a convenience loop; it is not faster than a Python loop.
vec_removeMarkers = np.vectorize(
    removeMarkers,
    excluded={1, "listOfMarkers"},
    otypes=[object],
)
# Script section: load the corpus, strip the class markers from the article
# text, then build one frame per label column.
# `df`, `preprocessor`, `removeMarkers` and `split_class` are defined
# elsewhere in the file (outside this view).
listOfM = df['class'].unique()

df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
preprocessor.remove_null_rows(df_original, 'content')

# Series.apply hands the full marker list to every call. Passing the list
# through a plain np.vectorize wrapper would broadcast it against the column.
df_original['content_withoutMarkers'] = df_original['content'].apply(
    removeMarkers, args=(listOfM,)
)

# The cleaned column must be part of each subset, because the null filtering
# below refers to it. 'content' is kept as well for code further down.
df_1 = df_original[['ensemble_domaine_enccre', 'content', 'content_withoutMarkers']].copy()
df_2 = df_original[['domaine_enccre', 'content', 'content_withoutMarkers']].copy()
df_3 = df_original[['normClass_artfl', 'content', 'content_withoutMarkers']].copy()

# TODO: decide whether articles with fewer than n tokens should be removed.
# Drop rows with no cleaned text or no label.
preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_2, 'domaine_enccre')
preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_3, 'normClass_artfl')

df_1 = split_class(df_1, 'ensemble_domaine_enccre')