"""Prepare the EDdA article corpus for classification experiments.

The script reads the list of class markers from ``corpus_tei.csv``, strips
those markers from every article of ``data/EDdA_dataframe_withContent.tsv``
and exports, for each of the three label columns, a per-class count table
(``<label>.xlsx``) and the filtered dataframe
(``dataframe_with_<label>.csv``).
"""

import argparse
import configparser
import math
import os
import re
import sys
import time
from re import search

import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, train_test_split
from unidecode import unidecode

from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from data_preprocessing import Preprocessor
from evaluate_model import evaluate_model
from features_extractor import feature_extractor


def _fold_with_offsets(text):
    """Transliterate *text* to ASCII one character at a time.

    Returns a ``(folded, offsets)`` pair where ``offsets[k]`` is the index in
    *text* of the character that produced ``folded[k]``.  The mapping is
    needed because a single character may expand to several ASCII characters
    (or to none), so positions in the folded string are not positions in the
    original string.
    """
    folded_parts = []
    offsets = []
    for position, char in enumerate(text):
        ascii_form = unidecode(char)
        folded_parts.append(ascii_form)
        offsets.extend([position] * len(ascii_form))
    return "".join(folded_parts), offsets


def _remove_folded_occurrences(text, needle):
    """Delete from *text* every span whose ASCII transliteration is *needle*.

    The search is repeated after each deletion, so occurrences that only
    appear once an inner occurrence has been removed are deleted as well.
    """
    if not needle:
        # str.find("") always succeeds; guard against an endless loop.
        return text
    # Cheap whole-string check first: matches are rare, so the per-character
    # offset table is only built when there is something to remove.
    while needle in unidecode(text):
        folded, offsets = _fold_with_offsets(text)
        start = folded.find(needle)
        if start == -1:
            break
        last = start + len(needle) - 1
        # Map the folded span back to original indices before slicing.
        text = text[:offsets[start]] + text[offsets[last] + 1:]
    return text


def removeMarkers(full_text, listOfMarkers):
    """Strip class markers from an article.

    For every marker, three removals are applied in order:

    1. exact occurrences of the marker between parentheses, ``(marker)``;
    2. exact occurrences of the bare marker;
    3. accent-insensitive occurrences of ``(marker)``, compared on the ASCII
       transliteration of both the text and the marker.

    Args:
        full_text: The article text. A null value (``None``/NaN) is returned
            unchanged.
        listOfMarkers: Iterable of markers. Each one is converted with
            ``str``; null and empty markers are skipped (a NaN would
            otherwise become the literal string ``"nan"`` and be stripped
            from ordinary words).

    Returns:
        The text without the markers, or *full_text* itself when it is null.
    """
    if pd.isna(full_text):
        return full_text

    for m in listOfMarkers:
        if pd.isna(m):
            continue
        marker = str(m)
        if not marker:
            continue
        marker_with_brcts = '(' + marker + ')'

        full_text = full_text.replace(marker_with_brcts, "")
        full_text = full_text.replace(marker, "")
        # unidecode() output is pure ASCII, so the marker has to be
        # transliterated too or an accented marker could never match.
        full_text = _remove_folded_occurrences(full_text, unidecode(marker_with_brcts))

    return full_text


# Element-wise wrapper so removeMarkers() can be applied to a whole column.
# np.vectorize is a convenience loop, not a speed-up.  The marker list is
# excluded from vectorization: it must reach every call as a whole instead of
# being broadcast against the texts.  otypes=[object] keeps null values and
# string lengths intact regardless of what the first row contains.
vec_removeMarkers = np.vectorize(
    removeMarkers,
    excluded={1, 'listOfMarkers'},
    otypes=[object],
)


def _select_labelled_articles(source, class_column):
    """Return the articles of *source* usable with the label *class_column*.

    Keeps the label, the raw text and the marker-free text, filters rows on
    the marker-free text and on the label, then applies ``split_class`` to
    the label column.
    """
    frame = source[[class_column, 'content', 'content_withoutMarkers']].copy()
    # remove_null_rows() is called for its effect on `frame` (its return
    # value is not used) -- presumably it drops the rows in place; confirm in
    # data_preprocessing.
    preprocessor.remove_null_rows(frame, 'content_withoutMarkers')
    preprocessor.remove_null_rows(frame, class_column)
    return split_class(frame, class_column)


def _export_class_counts(frame, class_column):
    """Write the result of ``create_dict`` to ``<class_column>.xlsx``.

    Returns the mapping produced by ``create_dict``.
    """
    counts = create_dict(frame, class_column)
    tosave = pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])
    tosave.to_excel(class_column + ".xlsx")
    return counts


# Reading data and preprocessing steps
preprocessor = Preprocessor()

df = pd.read_csv('corpus_tei.csv')
# unique() keeps NaN as a value; drop it so it is never used as a marker.
listOfM = df['class'].dropna().unique()

df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
preprocessor.remove_null_rows(df_original, 'content')
df_original['content_withoutMarkers'] = vec_removeMarkers(df_original['content'], listOfM)

# TODO: decide whether articles with fewer than n tokens should be removed.

df_1 = _select_labelled_articles(df_original, 'ensemble_domaine_enccre')
df_2 = _select_labelled_articles(df_original, 'domaine_enccre')
df_3 = _select_labelled_articles(df_original, 'normClass_artfl')

d_1 = _export_class_counts(df_1, 'ensemble_domaine_enccre')
d_2 = _export_class_counts(df_2, 'domaine_enccre')
d_3 = _export_class_counts(df_3, 'normClass_artfl')

df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
df_2.to_csv('dataframe_with_domaine_enccre.csv')
df_3.to_csv('dataframe_with_normClass_artfl.csv')

print(df_original.shape)