From e659184ad250574cf37c29465398e9cfd7a3d864 Mon Sep 17 00:00:00 2001 From: Khalleud <ledk14@gmail.com> Date: Sat, 12 Jun 2021 17:13:29 +0200 Subject: [PATCH] [FIX] update classifiers in split and feature extraction order --- classifiers.py | 4 ++-- experimentsClassicClassifiers.py | 12 ++++++++---- settings.conf | 12 +++++++----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/classifiers.py b/classifiers.py index c061dac..5eb6173 100644 --- a/classifiers.py +++ b/classifiers.py @@ -24,7 +24,7 @@ classifiers = [ param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']} param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] } -param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} +param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]} param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]} param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } @@ -35,6 +35,6 @@ grid_params = [ ('svm', param_grid_svm), ('decisionTree', param_grid_decisionTree), ('rfc', param_grid_rfc ), - ('knn', param_grid_knn), + ('knn', param_grid_knn), ] diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py index 35da41c..a3f2af2 100644 --- a/experimentsClassicClassifiers.py +++ b/experimentsClassicClassifiers.py @@ -3,6 +3,7 @@ import os import time import argparse import pandas as pd +import numpy as np from data_preprocessing import Preprocessor from features_extractor import feature_extractor from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class @@ -64,9 +65,12 @@ config.read('settings.conf') vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df')) vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df')) vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None + doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size')) -doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs')) -doc2vec_lr = float(config.get('vectorizers','doc2vec_lr')) +max_epochs = int(config.get('vectorizers','max_epochs')) +doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count')) +doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed. +doc2vec_workers = int(config.get('vectorizers','doc2vec_workers')) for columnInput in [columnText, 'firstParagraph']: @@ -76,7 +80,7 @@ for columnInput in [columnText, 'firstParagraph']: df = df[df[columnClass] != 'unclassified'] y = df[columnClass] - train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y ) + train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y ) encoder = preprocessing.LabelEncoder() train_y = encoder.fit_transform(train_y) valid_y = encoder.fit_transform(test_y) @@ -87,7 +91,7 @@ for columnInput in [columnText, 'firstParagraph']: features_techniques = [ ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), - ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))] + ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))] diff --git a/settings.conf b/settings.conf index f1ef2be..eebf815 100644 --- a/settings.conf +++ b/settings.conf @@ -1,8 +1,10 @@ [vectorizers] vectorization_max_df= 1.0 -vectorization_min_df= 1 +vectorization_min_df= 4 vectorization_numberOfFeatures= None -doc2vec_vec_size = 300 -doc2vec_epochs = 10 -doc2vec_lr = 0.025 -min_word_per_article = 4 +doc2vec_vec_size = 700 +max_epochs = 10 +doc2vec_min_count = 12 +doc2vec_dm = 0 +doc2vec_workers = 4 +min_word_per_article = 25 -- GitLab