diff --git a/.gitignore b/.gitignore
index 1dadeac8da8350ee30d611a75ca58c2d654792f1..879b27990a045c10d8ee40503dab6debad3c7239 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .DS_Store
 data/EDdA_dataframe_withContent.tsv
 .DS_Store
+*.pyc
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index be4fd36b4a88f38aa353b51476b8f5003e45feef..39f8c220fd99466a3c13d6e9eb9d51fe2ec3fccb 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -31,6 +31,11 @@ columnClass = args.columnClass
 minOfInstancePerClass = args.minOfInstancePerClass
 maxOfInstancePerClass = args.maxOfInstancePerClass
 
+if not os.path.exists('reports'):
+    os.makedirs('reports')
+
+if not os.path.exists(os.path.join('reports', columnClass)):
+    os.makedirs(os.path.join('reports', columnClass))
 # create directory in the reports directory so save the classification results
 dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
 
@@ -47,13 +52,13 @@ df_original = pd.read_csv(dataPath)
 
 df = df_original[[columnClass,columnText]].copy()
 
-preprocessor.remove_null_rows(df, columnText)
-preprocessor.remove_null_rows(df, columnClass)
+#preprocessor.remove_null_rows(df, columnText)
+#preprocessor.remove_null_rows(df, columnClass)
 #df = split_class(df, columnClass)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
-preprocessor.getFirstParagraph(df, columnText, 'paragraphe' ) # select first sentence of each text
+#preprocessor.getFirstParagraph(df, columnText, 'paragraphe' ) # select first sentence of each text
 
 
 #Read configuration file for retreiving parameters of features extractors
@@ -70,109 +75,57 @@ doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 
-extractor = feature_extractor(df,columnText, columnClass)
-extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
+for columnInput in [columnText, 'firstParagraph']:
+    extractor = feature_extractor(df, columnInput, columnClass)
+    #extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
 
-features_techniques = [
-('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
-('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-
-features_techniques_paragraphe = [
-('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
-('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    features_techniques = [
+    ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
+    ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
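+    # each (name, features) pair maps a vectorizer label to the matrix built by
+    # feature_extractor: raw token counts, tf-idf weights, or doc2vec embeddings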
+    '''
+    features_techniques_paragraphe = [
+    ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
+    ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    '''
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+    #case of full text
+    for feature_technique_name, features in features_techniques:
+        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
+        encoder = preprocessing.LabelEncoder()
+        train_y = encoder.fit_transform(train_y)
+        valid_y = encoder.transform(test_y)
-#prepare data
-df = df[df[columnClass] != 'unclassified']
-y = df[columnClass]
+        for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
+            clf_name, clf = tmp_clf
+            grid_param_name, grid_param = tmp_grid_params
+            print(clf_name, clf, grid_param_name, grid_param)
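+            # bayes/doc2vec is skipped below: doc2vec embeddings can be negative,
+            # which a multinomial naive Bayes (presumably 'bayes' here) refuses to fit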
+            if clf_name == 'bayes':
+                if feature_technique_name == 'doc2vec':
+                    continue
+                else:
+                    t_begin = time.time()
+                    clf.fit(train_x, train_y)
+                    t_end = time.time()
+                    training_time = t_end - t_begin
+                    y_pred = clf.predict(test_x)
-#case of full text
-
-for feature_technique_name, features in features_techniques:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-        print(clf_name, clf, grid_param_name, grid_param)
-        if clf_name == 'bayes' :
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
-                t_begin = time.time()
-                clf.fit(train_x, train_y)
-                t_end =time.time()
-                training_time = t_end - t_begin
-
-                y_pred = clf.predict(test_x)
-
-        else :
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end =time.time()
-            training_time = t_end - t_begin
-
-            y_pred = clf.predict(test_x)
-
-
-
-
-#evaluate model
-
-        file_name_report = feature_technique_name + '_' + clf_name
-
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
-
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-            #sys.stdout = sys.stdout # Reset the standard output to its original value
-            sys.stdout = sys.__stdout__
-
-
-
-
-
-
-
-
-
-
-
-
-
-for feature_technique_name, features in features_techniques_paragraphe:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, clf_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-
-        if clf_name == 'bayes' :
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
+            else:
+                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
                 t_begin = time.time()
                 clf.fit(train_x, train_y)
                 t_end =time.time()
@@ -180,34 +133,22 @@ for feature_technique_name, features in features_techniques_paragraphe:
 
                 y_pred = clf.predict(test_x)
 
-        else :
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end =time.time()
-            training_time = t_end - t_begin
-
-            y_pred = clf.predict(test_x)
-
-
-
-
-#evaluate model
-
-        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
-
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe+'.txt'), 'w') as f:
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-            sys.stdout = sys.stdout # Reset the standard output to its original value
-
-            sys.stdout = sys.__stdout__
+            #evaluate model
+            file_name_report = columnInput + '_' + feature_technique_name + '_' + clf_name
+
+            report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
+
+                sys.stdout = f # Change the standard output to the file we created.
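+                # every print() below is captured in the .txt report until stdout is restored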
+                print(report)
+                print('accuracy : {}'.format(accuracy))
+                print('weighted_Precision : {}'.format(weighted_avg['precision']))
+                print('weighted_Recall : {}'.format(weighted_avg['recall']))
+                print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
+                print('weighted_Support : {}'.format(weighted_avg['support']))
+                print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
+                print('training time : {}'.format(training_time))
+                #sys.stdout = sys.stdout # Reset the standard output to its original value
+                sys.stdout = sys.__stdout__
diff --git a/requirements.txt b/requirements.txt
index 7a02be35accb36eca051132ea9af0968bec54aae..cb1f87bd29621957ff2ced2a9f3b9f000f6e0388 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,9 @@ Keras==2.4.3
 Keras-Preprocessing==1.1.2
 sentence-transformers==0.4.1.2
 transformers==4.3.2
-torch==1.8.1
+torch==1.7.1
 torchvision==0.8.2
 tokenizers==0.10.1
 regex==2018.1.10
 tensorflow==2.2.0
-gensgensim==3.8.1
+gensim==3.8.1
diff --git a/script.txt b/script.txt
index c4529aeb7318e5ca49ea80524c1abdf3c56d000b..97eaef6d479680661b718d372831a786a679ae2a 100644
--- a/script.txt
+++ b/script.txt
@@ -3,13 +3,13 @@ mkdir -p reports/ensemble_domaine_enccre
 mkdir -p reports/normClass_artfl
 pip install -r requirements.txt
 python tmp_preprocess_data.py
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 800
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 100 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 2000
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 500
diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py
index 3db8eae5c6eed912837621127db77d7b6ec03dcf..73fa83436b42060fd9be952e0c7d48af6b524657 100644
--- a/tmp_preprocess_data.py
+++ b/tmp_preprocess_data.py
@@ -22,65 +22,60 @@ from ClassPreprocessor import create_dict
 
 
-def removeMarkers(full_text, listOfMarkers):
-
-    if not pd.isna(full_text):
-        for m in listOfMarkers:
-            marker = str(m)
-            marker_with_brcts = '('+ marker +')'
-            full_text = full_text.replace(marker_with_brcts , "")
-            full_text = full_text.replace(marker , "")
-            #full_text = row[textColumn]
-            i = unidecode(full_text).find(marker_with_brcts)
-            goOn = False
-            if i != -1:
-                goOn = True
-            while goOn:
-
-                full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):]))
-                i = unidecode(full_text).find(marker_with_brcts)
-                if i == -1:
-                    goOn = False
-    #row[textColumn] = full_text
-    return full_text
-
-## On vectorise la fonction removeMarkers() afin de l'appliquer de manière efficace (en terme de temps de calcul) sur le dataframe
-vec_removeMarkers = np.vectorize(removeMarkers)
-
+print("Begin preprocess")
 
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-df = pd.read_csv('corpus_tei.csv')
-listOfM = df['class'].unique()
+#df = pd.read_csv('data/corpus_tei.csv')
+#listOfM = df['class'].unique()
 
-df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
-preprocessor.remove_null_rows(df_original, 'content')
-#df_original = removeMarkers(df_original, 'content', listOfM)
-df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)
+print("load dataset")
 
-df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
-df_2 = df_original[['domaine_enccre','content']].copy()
-df_3 = df_original[['normClass_artfl','content']].copy()
+df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
+df = df_original.copy()
 
-############ shall we remove articles with less n tokens ####### remove markers
-preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_3, 'normClass_artfl')
+print("remove blank rows")
+df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
+df.reset_index(drop=True, inplace=True)
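+# drop any row missing a text variant or one of the three labels so that every
+# remaining row is usable by all downstream experiments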
+print("save dataframe") +df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv') + +print("split ensemble domaine enccre") df_2 = split_class(df_2, 'domaine_enccre') -df_3 = split_class(df_3, 'normClass_artfl') +print("save dataframe") +df_2.to_csv('./data/dataframe_with_domaine_enccre.csv') +print("split ensemble domaine enccre") +df_3 = split_class(df_3, 'normClass') +print("save dataframe") +df_3.to_csv('./data/dataframe_with_normClass_artfl.csv') +print("some stats") d_1 = create_dict(df_1, 'ensemble_domaine_enccre') tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count']) tosave.to_excel("ensemble_domaine_enccre.xlsx") @@ -93,10 +86,4 @@ d_3 = create_dict(df_3, 'normClass_artfl') tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count']) tosave.to_excel("normClass_artfl.xlsx") -df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv') -df_2.to_csv('dataframe_with_domaine_enccre.csv') -df_3.to_csv('dataframe_with_normClass_artfl.csv') - - - print(df_original.shape)