"""Temporary preprocessing script (projet/tmp_preprocess_data.py).

Strips class markers from the article text, then builds one dataframe per
label column, exports the per-class counts to Excel and the dataframes to CSV.
"""
import sys
import os
import time
import argparse
import configparser
import math
import re
from bisect import bisect_right
from itertools import accumulate
from re import search

import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from unidecode import unidecode

from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from ClassPreprocessor import create_dict
from classifiers import classifiers, grid_params
from data_preprocessing import Preprocessor
from evaluate_model import evaluate_model
from features_extractor import feature_extractor


def _remove_transliterated(text, needle):
    """Remove occurrences of *needle* that only match after transliteration.

    The search is done on the ASCII transliteration of *text*, but the
    deletion is applied to *text* itself. Because transliteration can change
    the length of the string (one character may become several, or none),
    offsets found in the transliterated string are mapped back to the
    original characters before slicing.
    """
    while needle in unidecode(text):
        # Transliterate character by character so that every original
        # character knows which span of the transliterated string it produced.
        pieces = [unidecode(char) for char in text]
        pos = "".join(pieces).find(needle)
        if pos == -1:
            # Whole-string and per-character transliteration disagree;
            # stop rather than delete the wrong span.
            break
        # ends[k] is the length of the transliteration of text[:k + 1].
        ends = list(accumulate(len(piece) for piece in pieces))
        start = bisect_right(ends, pos)
        stop = bisect_right(ends, pos + len(needle) - 1) + 1
        text = text[:start] + text[stop:]
    return text


def _strip_markers(text, markers):
    """Return *text* with every marker in *markers* removed."""
    for marker in markers:
        bracketed = "(" + marker + ")"
        # Remove the bracketed form first so no empty "()" is left behind.
        text = text.replace(bracketed, "").replace(marker, "")
        # Then catch bracketed markers that are written with accents (or
        # other non-ASCII characters) in the text.
        text = _remove_transliterated(text, bracketed)
    return text


def removeMarkers(df, textColumn, listOfMarkers):
    """Remove class markers from the text column of *df*.

    For every marker, both the bare marker and its bracketed form
    ``(marker)`` are deleted from each text; bracketed markers are also
    matched against the ASCII transliteration of the text.

    Args:
        df: dataframe holding the texts; its *textColumn* is replaced in place.
        textColumn: name of the column containing the text to clean.
        listOfMarkers: iterable of markers; each one is converted with
            ``str``. Null markers are skipped, otherwise the literal "nan"
            would be stripped from every text.

    Returns:
        The same dataframe, with *textColumn* cleaned. Cells that are not
        strings (e.g. missing values) are left untouched.
    """
    markers = [str(m) for m in listOfMarkers if not pd.isna(m)]

    def clean(value):
        if not isinstance(value, str):
            return value
        return _strip_markers(value, markers)

    # Assign the whole column back: rows yielded by DataFrame.iterrows() may
    # be copies, so writing to them does not reliably update the dataframe.
    df[textColumn] = df[textColumn].map(clean)
    return df


# Reading data and preprocessing steps

preprocessor = Preprocessor()
df = pd.read_csv('corpus_tei.csv')
listOfM = df['class'].unique()

df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
# NOTE(review): the return value is ignored, so remove_null_rows presumably
# works in place -- confirm against Preprocessor.
preprocessor.remove_null_rows(df_original, 'content')

df_original = removeMarkers(df_original, 'content', listOfM)

# One dataframe per label column, each paired with the article content.
df_1 = df_original[['ensemble_domaine_enccre', 'content']].copy()
df_2 = df_original[['domaine_enccre', 'content']].copy()
df_3 = df_original[['normClass_artfl', 'content']].copy()

# TODO: decide whether articles with fewer than n tokens should be removed.
for frame, label in (
    (df_1, 'ensemble_domaine_enccre'),
    (df_2, 'domaine_enccre'),
    (df_3, 'normClass_artfl'),
):
    preprocessor.remove_null_rows(frame, 'content')
    preprocessor.remove_null_rows(frame, label)

df_1 = split_class(df_1, 'ensemble_domaine_enccre')
df_2 = split_class(df_2, 'domaine_enccre')
df_3 = split_class(df_3, 'normClass_artfl')

labelled_frames = (
    (df_1, 'ensemble_domaine_enccre'),
    (df_2, 'domaine_enccre'),
    (df_3, 'normClass_artfl'),
)

# Export the per-class counts of every label column.
for frame, label in labelled_frames:
    counts = create_dict(frame, label)
    pd.DataFrame.from_dict(counts, orient='index', columns=['Count']).to_excel(label + ".xlsx")

# Export the cleaned dataframes themselves.
for frame, label in labelled_frames:
    frame.to_csv('dataframe_with_' + label + '.csv')

print(df_original.shape)