# tmp_preprocess_data.py
import sys
import os
import time
import argparse
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
from re import search
import math
import re
import nltk
from ClassPreprocessor import create_dict


print("Begin preprocess")

# Reading data and preprocessing steps

# NOTE(review): no active statement in the visible part of this script uses
# `preprocessor`. It is kept because constructing Preprocessor may have
# side effects that cannot be seen from here — confirm before removing.
preprocessor = Preprocessor()

print("load dataset")
# The input is tab-separated. The frame as read is kept in `df_original`
# and all cleaning is done on the independent copy `df`.
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df = df_original.copy()
print("remove blank rows")
# A row is kept only if it has a value in every text column and in every
# class-label column listed below.
text_columns = ['content', 'contentWithoutClass', 'firstParagraph']
label_columns = ['ensemble_domaine_enccre', 'domaine_enccre', 'normClass']
# Rebinding instead of inplace=True: `df` is already a private copy, and the
# index is renumbered so it is contiguous after the drop.
df = df.dropna(subset=text_columns + label_columns).reset_index(drop=True)

#preprocessor.remove_null_rows(df_original, 'content')
print("copy")
# One independent frame per label scheme: the label column comes first,
# followed by the three shared text columns.
df_1 = df[['ensemble_domaine_enccre'] + text_columns].copy()
df_2 = df[['domaine_enccre'] + text_columns].copy()
df_3 = df[['normClass'] + text_columns].copy()
print("split ensemble domaine enccre")
# NOTE(review): split_class comes from ClassPreprocessor; presumably it
# expands multi-valued labels into one row per label — confirm there.
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
print("save dataframe")
# Written with pandas' default settings, so the index is included as the
# first CSV column.
df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv')

# The progress message names the column actually being split here
# ('domaine_enccre'), so the log distinguishes the three split steps.
print("split domaine enccre")
# NOTE(review): split_class comes from ClassPreprocessor; presumably it
# expands multi-valued labels into one row per label — confirm there.
df_2 = split_class(df_2, 'domaine_enccre')
print("save dataframe")
# Written with pandas' default settings, so the index is included as the
# first CSV column.
df_2.to_csv('./data/dataframe_with_domaine_enccre.csv')
# The progress message names the column actually being split here
# ('normClass'), so the log distinguishes the three split steps.
print("split normClass")
# NOTE(review): split_class comes from ClassPreprocessor; presumably it
# expands multi-valued labels into one row per label — confirm there.
df_3 = split_class(df_3, 'normClass')
print("save dataframe")
# Written with pandas' default settings, so the index is included as the
# first CSV column.
df_3.to_csv('./data/dataframe_with_normClass_artfl.csv')
print("some stats")
# For each label scheme: build a dict with create_dict (imported from
# ClassPreprocessor), lay it out with the dict keys as the row index and a
# single 'Count' column, and export it as an Excel workbook in the current
# working directory.
# NOTE(review): the 'Count' column name suggests create_dict returns
# per-class counts — confirm in ClassPreprocessor.
d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=['Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx")

d_2 = create_dict(df_2, 'domaine_enccre')
tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=['Count'])
tosave.to_excel("domaine_enccre.xlsx")

# df_3 is selected with the label column 'normClass'; 'normClass_artfl' is
# only used in output file names, so the column passed here must be
# 'normClass'.
# NOTE(review): this assumes split_class keeps the column name unchanged.
d_3 = create_dict(df_3, 'normClass')
tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=['Count'])
tosave.to_excel("normClass_artfl.xlsx")

# Shape of the dataset as loaded, before blank-row removal and splitting.
print(df_original.shape)