Skip to content
Snippets Groups Projects
Commit 28895891 authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update tmp_preprocess_data.py

parent 02e69317
No related branches found
No related tags found
No related merge requests found
import sys #import sys
import os #import os
import time #import time
import argparse #import argparse
import pandas as pd import pandas as pd
import numpy as np #import numpy as np
from data_preprocessing import Preprocessor #from data_preprocessing import Preprocessor
from features_extractor import feature_extractor #from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class #from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params #from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split #from sklearn.model_selection import train_test_split
from sklearn import preprocessing #from sklearn import preprocessing
from evaluate_model import evaluate_model #from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV #from sklearn.model_selection import GridSearchCV
import configparser #import configparser
from re import search #from re import search
import math #import math
import re #import re
import nltk #import nltk
from ClassPreprocessor import create_dict #from ClassPreprocessor import create_dict
print("Begin preprocess") #print("Begin preprocess")
# Reading data and preprocessing steps # Reading data and preprocessing steps
preprocessor = Preprocessor() #preprocessor = Preprocessor()
print("load dataset") print("load dataset")
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df = df_original.copy() #df = df_original.copy()
print("remove blank rows") print("remove blank rows")
df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True) df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
df.reset_index(drop=True, inplace=True) df.reset_index(drop=True, inplace=True)
print("filter unclassified rows")
# separate out the articles not classified by ARTFL but classified by ENCCRE (test set)
df_unclassified = df.loc[(df['normClass']=="unclassified")]
df_classified = df.loc[(df['normClass']!="unclassified")]
print("save dataframe")
df_classified.to_csv('./data/train_dataframe.tsv', sep="\t")
df_unclassified.to_csv('./data/test_dataframe.tsv', sep="\t")
print("some stats")
print("len(df_unclassified)",len(df_unclassified))
print("len(df_classified)",len(df_classified))
'''
#preprocessor.remove_null_rows(df_original, 'content') #preprocessor.remove_null_rows(df_original, 'content')
print("copy") print("copy")
df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy() df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
...@@ -44,21 +63,20 @@ df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy() ...@@ -44,21 +63,20 @@ df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
print("split ensemble domaine enccre") print("split ensemble domaine enccre")
df_1 = split_class(df_1, 'ensemble_domaine_enccre') df_1 = split_class(df_1, 'ensemble_domaine_enccre')
print("save dataframe") print("save dataframe")
df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv') df_1.to_csv('./data/train_dataframe_with_ensemble_domaine_enccre.csv')
print("split ensemble domaine enccre") print("split domaine enccre")
df_2 = split_class(df_2, 'domaine_enccre') df_2 = split_class(df_2, 'domaine_enccre')
print("save dataframe") print("save dataframe")
df_2.to_csv('./data/dataframe_with_domaine_enccre.csv') df_2.to_csv('./data/train_dataframe_with_domaine_enccre.csv')
print("split ensemble domaine enccre") print("split normclass")
df_3 = split_class(df_3, 'normClass') df_3 = split_class(df_3, 'normClass')
print("save dataframe") print("save dataframe")
df_3.to_csv('./data/dataframe_with_normClass_artfl.csv') df_3.to_csv('./data/train_dataframe_with_normClass_artfl.csv')
print("some stats")
d_1 = create_dict(df_1, 'ensemble_domaine_enccre') d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count']) tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx") tosave.to_excel("ensemble_domaine_enccre.xlsx")
...@@ -72,3 +90,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count']) ...@@ -72,3 +90,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
tosave.to_excel("normClass_artfl.xlsx") tosave.to_excel("normClass_artfl.xlsx")
print(df_original.shape) print(df_original.shape)
'''
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment