diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index e1c15c0284363725d61d819616a70d3f45ffece4..eb2aa0f76410b846a99c553ac6e3e65c021f72fb 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -17,8 +17,6 @@ import nltk
 nltk.download('stopwords')
 nltk.download('punkt')
 
-
-
 parser = argparse.ArgumentParser()
 parser.add_argument("dataPath", help="Path of the dataframe")
 parser.add_argument("columnText", help="the column name of the text that should preproceed", default = 'content')
@@ -26,9 +24,6 @@ parser.add_argument("columnClass", help="ColumnClass the column name of the clas
 parser.add_argument("minOfInstancePerClass", help="minOfInstancePerClass the minimum of instance required for each class", type=int)
 parser.add_argument("maxOfInstancePerClass", help="maxOfInstancePerClass the maximum of instance required resamling classes", type=int)
 
-
-
-
 args = parser.parse_args()
 dataPath = args.dataPath
 columnText = args.columnText
@@ -47,31 +42,20 @@ dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
 if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
     os.makedirs(os.path.join('reports', columnClass, dir_name_report))
 
-
 # Reading data and preprocessings steps
-
 preprocessor = Preprocessor()
-
 df_original = pd.read_csv(dataPath)
-
 df = df_original[[columnClass,columnText]].copy()
 
-#preprocessor.remove_null_rows(df, columnText)
-#preprocessor.remove_null_rows(df, columnClass)
-#df = split_class(df, columnClass)
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
 
-#preprocessor.getFirstParagraph(df, columnText, 'paragraphe' ) # select first sentence of each text
-
 #Read configuration file for retreiving parameters of features extractors
 config = configparser.ConfigParser()
 config.read('settings.conf')
 
-
-
 vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
 vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
 vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
@@ -79,33 +63,22 @@ doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
 doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 
-
 for columnInput in [columnText, 'firstParagraph']:
 
     print('Process: ' + columnInput)
 
     extractor = feature_extractor(df,columnText, columnClass)
-    #extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
-
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
     ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
 
-    '''
-    features_techniques_paragraphe = [
-    ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
-    ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-    '''
-
     #prepare data
     df = df[df[columnClass] != 'unclassified']
     y = df[columnClass]
 
 #case of full text
-
     for feature_technique_name, features in features_techniques:
         train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
         encoder = preprocessing.LabelEncoder()
diff --git a/requirements.txt b/requirements.txt
index ab54835a61f9acc00afee644c0bfd94d19a4add5..b083ca106aab942e0d285e6dafa445ea29851550 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,4 @@ torchvision==0.8.2
 tokenizers==0.10.1
 regex==2018.1.10
 tensorflow==2.2.0
-gensim==3.8.1
+gensim==3.8.1
\ No newline at end of file
diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py
index 73fa83436b42060fd9be952e0c7d48af6b524657..d353bc44e0dc37e124b1aa94be45af9540e31150 100644
--- a/tmp_preprocess_data.py
+++ b/tmp_preprocess_data.py
@@ -15,23 +15,16 @@ from sklearn.model_selection import GridSearchCV
 import configparser
 from re import search
 import math
-from unidecode import unidecode
 import re
 import nltk
 from ClassPreprocessor import create_dict
 
-
-
 print("Begin preprocess")
 
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-#df = pd.read_csv('data/corpus_tei.csv')
-#listOfM = df['class'].unique()
 
-
-
 print("load dataset")
@@ -48,14 +41,6 @@ df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParag
 df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
 df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
 
-############ shall we remove articles with less n tokens ####### remove markers
-#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_3, 'normClass')
-
 print("split ensemble domaine enccre")
 df_1 = split_class(df_1, 'ensemble_domaine_enccre')
 print("save dataframe")
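Note on the config parsing kept in experimentsClassicClassifiers.py: each vectorizer option is read from `settings.conf` as an int when the raw string is all digits, otherwise as a float (or `None` for `vectorization_numberOfFeatures`). A minimal sketch of that coercion, factored into a helper, is shown below; the section and option names come from the diff above, while the helper name and the `None` fallback for non-numeric strings are illustrative assumptions, not code from this repository.

```python
import configparser

def coerce_setting(raw, fallback=None):
    # All-digit strings become int (e.g. "2" -> 2); anything else is tried
    # as float (e.g. "0.95" -> 0.95); strings that are not numeric at all
    # fall back to the given default, matching how the script maps a
    # non-digit vectorization_numberOfFeatures to None. (Hypothetical
    # helper; the script inlines this logic per option instead.)
    if raw.isdigit():
        return int(raw)
    try:
        return float(raw)
    except ValueError:
        return fallback

config = configparser.ConfigParser()
config.read('settings.conf')  # same file the script reads

vectorization_max_df = coerce_setting(config.get('vectorizers', 'vectorization_max_df'))
vectorization_min_df = coerce_setting(config.get('vectorizers', 'vectorization_min_df'))
vectorization_numberOfFeatures = coerce_setting(
    config.get('vectorizers', 'vectorization_numberOfFeatures'))
```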