Commit 32f28eea authored by Ludovic Moncla

update

parent 2492f680
.gitignore
@@ -2,3 +2,5 @@
 .DS_Store
 data/EDdA_dataframe_withContent.tsv
 .DS_Store
+*.pyc
+.DS_Store
experimentsClassicClassifiers.py
@@ -31,6 +31,11 @@ columnClass = args.columnClass
 minOfInstancePerClass = args.minOfInstancePerClass
 maxOfInstancePerClass = args.maxOfInstancePerClass
 
+if not os.path.exists('reports'):
+    os.makedirs('reports')
+
+if not os.path.exists(os.path.join('reports', columnClass)):
+    os.makedirs(os.path.join('reports', columnClass))
+
 # create a directory in the reports directory to save the classification results
 dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
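Note on the added directory setup: since Python 3.2 the same effect is available in one call via the exist_ok flag, which also avoids the race between the existence check and the mkdir. A minimal equivalent sketch, reusing the script's columnClass variable:

    import os

    # Creates 'reports/<columnClass>' and any missing parents in one call;
    # does not raise if the directories already exist.
    os.makedirs(os.path.join('reports', columnClass), exist_ok=True)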
@@ -47,13 +52,13 @@ df_original = pd.read_csv(dataPath)
 df = df_original[[columnClass, columnText]].copy()
 
-preprocessor.remove_null_rows(df, columnText)
-preprocessor.remove_null_rows(df, columnClass)
+#preprocessor.remove_null_rows(df, columnText)
+#preprocessor.remove_null_rows(df, columnClass)
 #df = split_class(df, columnClass)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
-preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
+#preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
 
 # Read the configuration file to retrieve the feature extractors' parameters
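The helpers remove_weak_classes and resample_classes are imported from elsewhere in the repository, so their bodies are not part of this diff. As a reading aid only, here is a hypothetical pandas sketch of what calls with these signatures usually do (drop classes below a minimum count, cap classes at a maximum count); this is not the project's actual implementation:

    import pandas as pd

    def remove_weak_classes(df, columnClass, minOfInstancePerClass):
        # Hypothetical: keep only classes with enough examples.
        counts = df[columnClass].value_counts()
        kept = counts[counts >= minOfInstancePerClass].index
        return df[df[columnClass].isin(kept)]

    def resample_classes(df, columnClass, maxOfInstancePerClass):
        # Hypothetical: downsample every class to at most maxOfInstancePerClass rows.
        return (df.groupby(columnClass, group_keys=False)
                  .apply(lambda g: g.sample(min(len(g), maxOfInstancePerClass), random_state=42)))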
@@ -70,109 +75,53 @@ doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 
-extractor = feature_extractor(df, columnText, columnClass)
-extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)
-
-features_techniques = [
-    ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-
-features_techniques_paragraphe = [
-    ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-
-#prepare data
-df = df[df[columnClass] != 'unclassified']
-y = df[columnClass]
-
-#case of full text
-for feature_technique_name, features in features_techniques:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-        print(clf_name, clf, grid_param_name, grid_param)
-        if clf_name == 'bayes':
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
-                t_begin = time.time()
-                clf.fit(train_x, train_y)
-                t_end = time.time()
-                training_time = t_end - t_begin
-                y_pred = clf.predict(test_x)
-        else:
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end = time.time()
-            training_time = t_end - t_begin
-            y_pred = clf.predict(test_x)
-
-        #evaluate model
-        file_name_report = feature_technique_name + '_' + clf_name
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-        sys.stdout = sys.__stdout__
-
-for feature_technique_name, features in features_techniques_paragraphe:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-        if clf_name == 'bayes':
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
-                t_begin = time.time()
-                clf.fit(train_x, train_y)
-                t_end = time.time()
+for columnInput in [columnText, 'firstParagraph']:
+
+    extractor = feature_extractor(df, columnInput, columnClass)
+    #extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)
+
+    features_techniques = [
+        ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+
+    '''
+    features_techniques_paragraphe = [
+        ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    '''
+
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+
+    for feature_technique_name, features in features_techniques:
+        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
+        encoder = preprocessing.LabelEncoder()
+        train_y = encoder.fit_transform(train_y)
+        valid_y = encoder.fit_transform(test_y)
+
+        for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
+            clf_name, clf = tmp_clf
+            grid_param_name, grid_param = tmp_grid_params
+            print(clf_name, clf, grid_param_name, grid_param)
+            if clf_name == 'bayes':
+                if feature_technique_name == 'doc2vec':
+                    continue
+                else:
+                    t_begin = time.time()
+                    clf.fit(train_x, train_y)
+                    t_end = time.time()
+                    training_time = t_end - t_begin
+                    y_pred = clf.predict(test_x)
+            else:
+                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
+                t_begin = time.time()
+                clf.fit(train_x, train_y)
+                t_end = time.time()
@@ -180,34 +129,21 @@ for feature_technique_name, features in features_techniques_paragraphe:
-                training_time = t_end - t_begin
-                y_pred = clf.predict(test_x)
-        else:
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end = time.time()
-            training_time = t_end - t_begin
-            y_pred = clf.predict(test_x)
-
-        #evaluate model
-        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe+'.txt'), 'w') as f:
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-        #sys.stdout = sys.stdout # Reset the standard output to its original value
-        sys.stdout = sys.__stdout__
+                training_time = t_end - t_begin
+                y_pred = clf.predict(test_x)
+
+            #evaluate model
+            file_name_report = columnInput + '_' + feature_technique_name + '_' + clf_name
+            report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
+                sys.stdout = f # Change the standard output to the file we created.
+                print(report)
+                print('accuracy : {}'.format(accuracy))
+                print('weighted_Precision : {}'.format(weighted_avg['precision']))
+                print('weighted_Recall : {}'.format(weighted_avg['recall']))
+                print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
+                print('weighted_Support : {}'.format(weighted_avg['support']))
+                print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
+                print('training time : {}'.format(training_time))
+            sys.stdout = sys.__stdout__
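Both the old and the new report blocks redirect print() output by reassigning sys.stdout and restoring sys.__stdout__ by hand. The standard library's contextlib.redirect_stdout gives the same behavior and restores the stream even if a print raises; a minimal sketch, with report_path standing in for the os.path.join(...) expression above:

    import contextlib

    # print() output goes to the report file inside the block; sys.stdout
    # is restored automatically when the with-block exits, even on error.
    with open(report_path, 'w') as f, contextlib.redirect_stdout(f):
        print(report)
        print('accuracy : {}'.format(accuracy))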
requirements.txt
@@ -6,9 +6,9 @@ Keras==2.4.3
 Keras-Preprocessing==1.1.2
 sentence-transformers==0.4.1.2
 transformers==4.3.2
-torch==1.8.1
+torch==1.7.1
 torchvision==0.8.2
 tokenizers==0.10.1
 regex==2018.1.10
 tensorflow==2.2.0
-gensgensim==3.8.1
+gensim==3.8.1
(shell script that launches the experiments)
@@ -3,13 +3,13 @@ mkdir -p reports/ensemble_domaine_enccre
 mkdir -p reports/normClass_artfl
 pip install -r requirements.txt
 python tmp_preprocess_data.py
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 800
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 100 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 2000
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 500
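Each invocation passes five positional arguments: the data file, the text column (now contentWithoutClass), the class column, and the minimum and maximum instances per class. The parser itself sits above the lines shown in the experimentsClassicClassifiers.py hunks; a sketch of what these commands imply, assuming plain positional argparse arguments:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dataPath')      # e.g. data/EDdA_dataframe_withContent.tsv
    parser.add_argument('columnText')    # e.g. contentWithoutClass
    parser.add_argument('columnClass')   # e.g. ensemble_domaine_enccre
    parser.add_argument('minOfInstancePerClass', type=int)
    parser.add_argument('maxOfInstancePerClass', type=int)
    args = parser.parse_args()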
tmp_preprocess_data.py
@@ -22,65 +22,58 @@ from ClassPreprocessor import create_dict
-def removeMarkers(full_text, listOfMarkers):
-    if not pd.isna(full_text):
-        for m in listOfMarkers:
-            marker = str(m)
-            marker_with_brcts = '(' + marker + ')'
-            full_text = full_text.replace(marker_with_brcts, "")
-            full_text = full_text.replace(marker, "")
-            #full_text = row[textColumn]
-            i = unidecode(full_text).find(marker_with_brcts)
-            goOn = False
-            if i != -1:
-                goOn = True
-            while goOn:
-                full_text = "".join((full_text[:i], "", full_text[i+len(marker_with_brcts):]))
-                i = unidecode(full_text).find(marker_with_brcts)
-                if i == -1:
-                    goOn = False
-    #row[textColumn] = full_text
-    return full_text
-
-## removeMarkers() is vectorized so that it can be applied efficiently (in terms of computation time) to the dataframe
-vec_removeMarkers = np.vectorize(removeMarkers)
-
+print("Begin preprocess")
+
 # Reading data and preprocessing steps
 preprocessor = Preprocessor()
-df = pd.read_csv('corpus_tei.csv')
-listOfM = df['class'].unique()
-df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
-preprocessor.remove_null_rows(df_original, 'content')
-#df_original = removeMarkers(df_original, 'content', listOfM)
-df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)
-
-df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
-df_2 = df_original[['domaine_enccre','content']].copy()
-df_3 = df_original[['normClass_artfl','content']].copy()
-
-preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_3, 'normClass_artfl')
-
-df_1 = split_class(df_1, 'ensemble_domaine_enccre')
-df_2 = split_class(df_2, 'domaine_enccre')
-df_3 = split_class(df_3, 'normClass_artfl')
+#df = pd.read_csv('data/corpus_tei.csv')
+#listOfM = df['class'].unique()
+
+print("load dataset")
+df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
+df = df_original.copy()
+
+print("remove blank rows")
+df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
+df.reset_index(drop=True, inplace=True)
+#preprocessor.remove_null_rows(df_original, 'content')
+
+print("copy")
+df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
+df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
+df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
+
+############ shall we remove articles with fewer than n tokens ####### remove markers
+#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
+#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
+#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_3, 'normClass')
+
+print("split ensemble_domaine_enccre")
+df_1 = split_class(df_1, 'ensemble_domaine_enccre')
+print("save dataframe")
+df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv')
+
+print("split domaine_enccre")
+df_2 = split_class(df_2, 'domaine_enccre')
+print("save dataframe")
+df_2.to_csv('./data/dataframe_with_domaine_enccre.csv')
+
+print("split normClass")
+df_3 = split_class(df_3, 'normClass')
+print("save dataframe")
+df_3.to_csv('./data/dataframe_with_normClass_artfl.csv')
+
+print("some stats")
 d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
 tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=['Count'])
 tosave.to_excel("ensemble_domaine_enccre.xlsx")
@@ -93,10 +86,4 @@ d_3 = create_dict(df_3, 'normClass_artfl')
 tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=['Count'])
 tosave.to_excel("normClass_artfl.xlsx")
 
-df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
-df_2.to_csv('dataframe_with_domaine_enccre.csv')
-df_3.to_csv('dataframe_with_normClass_artfl.csv')
-
 print(df_original.shape)
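A side note on the statistics block this hunk leaves in place: the create_dict / DataFrame.from_dict / to_excel chain computes per-class counts, which pandas exposes directly; a one-line equivalent sketch:

    # Per-class instance counts written straight to Excel, matching the
    # 'Count' column produced by the create_dict chain above.
    df_1['ensemble_domaine_enccre'].value_counts().rename('Count').to_excel('ensemble_domaine_enccre.xlsx')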