diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py index d353bc44e0dc37e124b1aa94be45af9540e31150..bc852b8c430d8b20319d42f72d1f64c9134d4b28 100644 --- a/tmp_preprocess_data.py +++ b/tmp_preprocess_data.py @@ -1,40 +1,59 @@ -import sys -import os -import time -import argparse +#import sys +#import os +#import time +#import argparse import pandas as pd -import numpy as np -from data_preprocessing import Preprocessor -from features_extractor import feature_extractor -from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -from classifiers import classifiers, grid_params -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -from evaluate_model import evaluate_model -from sklearn.model_selection import GridSearchCV -import configparser -from re import search -import math -import re -import nltk -from ClassPreprocessor import create_dict - - -print("Begin preprocess") +#import numpy as np +#from data_preprocessing import Preprocessor +#from features_extractor import feature_extractor +#from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class +#from classifiers import classifiers, grid_params +#from sklearn.model_selection import train_test_split +#from sklearn import preprocessing +#from evaluate_model import evaluate_model +#from sklearn.model_selection import GridSearchCV +#import configparser +#from re import search +#import math +#import re +#import nltk +#from ClassPreprocessor import create_dict + + +#print("Begin preprocess") # Reading data and preprocessings steps -preprocessor = Preprocessor() +#preprocessor = Preprocessor() print("load dataset") -df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") -df = df_original.copy() +df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") +#df = df_original.copy() print("remove blank rows") df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True) df.reset_index(drop=True, inplace=True) + +print("filter unclassified rows") +# filtrer les articles non classés par ARTFL mais classé par ENCCRE (jeu de test) +df_unclassified = df.loc[(df['normClass']=="unclassified")] +df_classified = df.loc[(df['normClass']!="unclassified")] + + + +print("save dataframe") +df_classified.to_csv('./data/train_dataframe.tsv', sep="\t") +df_unclassified.to_csv('./data/test_dataframe.tsv', sep="\t") + +print("some stats") + +print("len(df_unclassified)",len(df_unclassified)) +print("len(df_classified)",len(df_classified)) + +''' + #preprocessor.remove_null_rows(df_original, 'content') print("copy") df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy() @@ -44,21 +63,20 @@ df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy() print("split ensemble domaine enccre") df_1 = split_class(df_1, 'ensemble_domaine_enccre') print("save dataframe") -df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv') +df_1.to_csv('./data/train_dataframe_with_ensemble_domaine_enccre.csv') -print("split ensemble domaine enccre") +print("split domaine enccre") df_2 = split_class(df_2, 'domaine_enccre') print("save dataframe") -df_2.to_csv('./data/dataframe_with_domaine_enccre.csv') +df_2.to_csv('./data/train_dataframe_with_domaine_enccre.csv') -print("split ensemble domaine enccre") +print("split normclass") df_3 = split_class(df_3, 'normClass') print("save dataframe") -df_3.to_csv('./data/dataframe_with_normClass_artfl.csv') +df_3.to_csv('./data/train_dataframe_with_normClass_artfl.csv') -print("some stats") d_1 = create_dict(df_1, 'ensemble_domaine_enccre') tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count']) tosave.to_excel("ensemble_domaine_enccre.xlsx") @@ -72,3 +90,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count']) tosave.to_excel("normClass_artfl.xlsx") print(df_original.shape) +'''