Skip to content
Snippets Groups Projects
Commit 21f1d59e authored by Ludovic Moncla
Browse files

Update tmp_preprocess_data.py

parent a5ab7d3b
No related branches found
No related tags found
No related merge requests found
......@@ -21,38 +21,31 @@ import nltk
from ClassPreprocessor import create_dict
# DataFrame-level marker removal: walks df row by row and strips every entry of
# listOfMarkers (parenthesised form first, then bare) from the text in textColumn.
# NOTE(review): indentation is missing from this listing, so the nesting described
# in the comments below is inferred from the statements themselves - confirm.
# NOTE(review): no `return` appears before the next `def`; a bare `return df` only
# shows up after the following definition - confirm it belongs to this function.
def removeMarkers(df, textColumn, listOfMarkers):
# null rows are expected to be dropped beforehand, or guarded per row as done below
#self.remove_null_rows(df, markerColumn)
#self.remove_null_rows(df, textColumn)
# running row counter, printed once per row
tmp = 0
for index, row in df.iterrows():
tmp += 1
print(tmp)
# rows whose text is null are left untouched
if not pd.isna(row[textColumn]):
for m in listOfMarkers:
# markers are stringified, so a non-string entry is matched by its str() form
marker = str(m)
marker_with_brcts = '('+ marker +')'
# exact matches: the parenthesised marker first, then the bare marker
# NOTE(review): pandas documents that iterrows() may yield copies, so these
# writes to `row` might never reach df - confirm
row[textColumn] = row[textColumn].replace(marker_with_brcts , "")
row[textColumn] = row[textColumn].replace(marker , "")
full_text = row[textColumn]
# second pass: look for the parenthesised marker in the unidecode() form of the text
# NOTE(review): i is an index into unidecode(full_text) but is used below to slice
# full_text itself; the two only line up if unidecode() keeps the length - confirm
i = unidecode(full_text).find(marker_with_brcts)
goOn = False
if i != -1:
goOn = True
# keep cutting len(marker_with_brcts) characters at i until no match is left
while goOn:
full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):]))
i = unidecode(full_text).find(marker_with_brcts)
if i == -1:
goOn = False
row[textColumn] = full_text
def _fold_with_offsets(text):
    """Transliterate ``text`` with unidecode, tracking where each output char came from.

    Returns ``(folded, offsets)`` where ``offsets[k]`` is the index in ``text``
    of the character whose transliteration produced ``folded[k]``.
    """
    pieces = []
    offsets = []
    for idx, ch in enumerate(text):
        # ASCII characters transliterate to themselves, so only the others
        # need a unidecode() call.
        piece = ch if ch.isascii() else unidecode(ch)
        pieces.append(piece)
        # One source character may expand to several output characters (or to
        # none); every output position points back at the same source index.
        offsets.extend([idx] * len(piece))
    return "".join(pieces), offsets


def _strip_transliterated(text, needle):
    """Remove every span of ``text`` whose unidecode form equals ``needle``."""
    if not needle:
        return text
    while True:
        # Cheap whole-string check first; the offset map is built only on a hit.
        folded = text if text.isascii() else unidecode(text)
        if needle not in folded:
            return text
        folded, offsets = _fold_with_offsets(text)
        start = folded.find(needle)
        if start == -1:
            return text
        # Translate the match back to source indices before cutting. Slicing
        # the source with indices taken from the transliterated string removes
        # the wrong characters (or nothing at all, looping forever) as soon as
        # an earlier character changed length during transliteration.
        first = offsets[start]
        last = offsets[start + len(needle) - 1]
        text = text[:first] + text[last + 1:]


def removeMarkers(full_text, listOfMarkers):
    """Strip every marker in ``listOfMarkers`` from a single text value.

    For each marker, in order:
      1. exact occurrences of the parenthesised form ``(marker)`` are removed;
      2. exact occurrences of the bare marker are removed;
      3. spans whose unidecode transliteration equals ``(marker)`` are removed,
         which catches accented spellings of an unaccented marker.

    Args:
        full_text: the text to clean; a null value (as judged by ``pd.isna``)
            is returned unchanged.
        listOfMarkers: iterable of markers; each one is converted with ``str()``.

    Returns:
        The cleaned text, or ``full_text`` itself when it is null.
    """
    if pd.isna(full_text):
        return full_text

    for m in listOfMarkers:
        # NOTE(review): a NaN/None entry becomes the literal 'nan'/'None' here
        # and would be stripped from the text - confirm callers drop nulls first.
        marker = str(m)
        marker_with_brcts = '(' + marker + ')'
        full_text = full_text.replace(marker_with_brcts, "")
        full_text = full_text.replace(marker, "")
        full_text = _strip_transliterated(full_text, marker_with_brcts)
    return full_text
return df
## Element-wise wrapper so removeMarkers() can be applied to a whole column of texts.
## np.vectorize is a convenience (it still loops in Python), not a speed-up.
## The marker list is excluded from vectorization so that it reaches removeMarkers()
## as a whole instead of being broadcast against the texts; otypes=[object] keeps
## null results as they are and avoids the extra probing call on the first element.
vec_removeMarkers = np.vectorize(removeMarkers, excluded={1, 'listOfMarkers'}, otypes=[object])
......@@ -65,8 +58,8 @@ listOfM = df['class'].unique()
# load the tab-separated article table
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
# presumably drops the rows whose 'content' is null, in place - verify against preprocessor
preprocessor.remove_null_rows(df_original, 'content')
# NOTE(review): this call passes (frame, column, markers), i.e. the three-argument
# DataFrame-level signature of removeMarkers; the same call is commented out on the
# next line - confirm which of the two lines is meant to be live
df_original = removeMarkers(df_original, 'content', listOfM)
#df_original = removeMarkers(df_original, 'content', listOfM)
# marker-free text goes to a new column; 'content' itself is not overwritten by this line
df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)
# NOTE(review): this selection keeps only the label column and 'content';
# 'content_withoutMarkers' is not carried over into df_1 - confirm that is intended
df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
......@@ -74,11 +67,11 @@ df_2 = df_original[['domaine_enccre','content']].copy()
# NOTE(review): as with the other per-label frames, only the label column and
# 'content' are selected here - 'content_withoutMarkers' is not included
df_3 = df_original[['normClass_artfl','content']].copy()
############ open question: should articles with fewer than n tokens be removed? ####### null filtering
# each frame is filtered on its text column and on its label column
# NOTE(review): every frame is filtered on both 'content' and 'content_withoutMarkers',
# but none of the column selections used to build df_1, df_2 or df_3 includes
# 'content_withoutMarkers' - confirm how remove_null_rows handles a missing column
preprocessor.remove_null_rows(df_1, 'content')
preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
preprocessor.remove_null_rows(df_2, 'content')
preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_2, 'domaine_enccre')
preprocessor.remove_null_rows(df_3, 'content')
preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_3, 'normClass_artfl')
# split_class is defined outside this excerpt; its result replaces df_1
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment