# Build the train/test TSV files from the full EDdA article dataframe.
#
# Steps, in order:
#   1. load data/EDdA_dataframe_withContent.tsv;
#   2. drop every row with a missing value in one of the required columns;
#   3. keep only the articles whose 'nb_word' value is at least 15;
#   4. write the rows whose 'normClass' is "unclassified" to the test file
#      and all remaining rows to the train file;
#   5. print the size of both subsets.

# Disabled imports, kept for reference (only needed by the disabled code at
# the bottom of this script or by earlier versions of it):
# import sys
# import os
# import time
# import argparse
# import numpy as np
# from data_preprocessing import Preprocessor
# from features_extractor import feature_extractor
# from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
# from classifiers import classifiers, grid_params
# from sklearn.model_selection import train_test_split
# from sklearn import preprocessing
# from evaluate_model import evaluate_model
# from sklearn.model_selection import GridSearchCV
# import configparser
# from re import search
# import math
# import re
# import nltk
# from ClassPreprocessor import create_dict
import pandas as pd

# A row is discarded as soon as one of these columns is empty.
REQUIRED_COLUMNS = [
    'content',
    'contentWithoutClass',
    'firstParagraph',
    'ensemble_domaine_enccre',
    'domaine_enccre',
    'normClass',
]

# Minimum value of 'nb_word' for an article to be kept; the progress message
# printed before the filter spells out the same number.
MIN_WORDS = 15

# Value of 'normClass' that marks an article as unclassified.
UNCLASSIFIED_LABEL = "unclassified"


def _print_size(label, frame):
    """Print *label* followed by the number of rows of *frame*."""
    print(label, len(frame))


# print("Begin preprocess")

# Reading data and preprocessing steps

print("load dataset")
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
# df = df_original.copy()
_print_size("len(df)", df)

print("remove blank rows")
df = df.dropna(subset=REQUIRED_COLUMNS)
_print_size("len(df)", df)

print("remove small articles < 15 words")
# Earlier approach, disabled:
# preprocessor = Preprocessor()
# preprocessor.removeArticlesByTokensNumbers(df, 'content', 25)
long_enough = df['nb_word'] >= MIN_WORDS
df = df.loc[long_enough]
_print_size("len(df)", df)

# Renumber the rows from 0 so the saved index has no gaps after filtering.
df = df.reset_index(drop=True)

print("filter unclassified rows")
# Articles left unclassified by ARTFL but classified by ENCCRE are set aside
# as the test set; every other article goes to the training set.
is_unclassified = df['normClass'] == UNCLASSIFIED_LABEL
df_unclassified = df.loc[is_unclassified]
df_classified = df.loc[~is_unclassified]

print("save dataframe")
df_classified.to_csv('./data/train_dataframe.tsv', sep="\t")
df_unclassified.to_csv('./data/test_dataframe.tsv', sep="\t")

print("some stats")
_print_size("len(df_unclassified)", df_unclassified)
_print_size("len(df_classified)", df_classified)

# ---------------------------------------------------------------------------
# Disabled legacy code (was wrapped in a bare triple-quoted string, so it was
# never executed). Kept here as comments for reference.
#
# NOTE(review): this code uses split_class / create_dict (their import is
# commented out above) and df_original (never defined in this script); it
# also builds df_3 with a 'normClass' column but passes 'normClass_artfl' to
# create_dict. Check these points before re-enabling it.
#
# #preprocessor.remove_null_rows(df_original, 'content')
# print("copy")
# df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
# df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
# df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
#
# print("split ensemble domaine enccre")
# df_1 = split_class(df_1, 'ensemble_domaine_enccre')
# print("save dataframe")
# df_1.to_csv('./data/train_dataframe_with_ensemble_domaine_enccre.csv')
#
# print("split domaine enccre")
# df_2 = split_class(df_2, 'domaine_enccre')
# print("save dataframe")
# df_2.to_csv('./data/train_dataframe_with_domaine_enccre.csv')
#
# print("split normclass")
# df_3 = split_class(df_3, 'normClass')
# print("save dataframe")
# df_3.to_csv('./data/train_dataframe_with_normClass_artfl.csv')
#
# d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
# tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count'])
# tosave.to_excel("ensemble_domaine_enccre.xlsx")
#
# d_2 = create_dict(df_2, 'domaine_enccre')
# tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count'])
# tosave.to_excel("domaine_enccre.xlsx")
#
# d_3 = create_dict(df_3, 'normClass_artfl')
# tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
# tosave.to_excel("normClass_artfl.xlsx")
#
# print(df_original.shape)
# ---------------------------------------------------------------------------