Skip to content
Snippets Groups Projects
Commit e659184a authored by Khalleud
Browse files

[FIX] update classifiers in split and feature extraction order

parent b98176ec
No related branches found
No related tags found
1 merge request: !4 "Branch dev vectorization feature"
...@@ -24,7 +24,7 @@ classifiers = [ ...@@ -24,7 +24,7 @@ classifiers = [
param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']} param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] } param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]} param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
...@@ -35,6 +35,6 @@ grid_params = [ ...@@ -35,6 +35,6 @@ grid_params = [
('svm', param_grid_svm), ('svm', param_grid_svm),
('decisionTree', param_grid_decisionTree), ('decisionTree', param_grid_decisionTree),
('rfc', param_grid_rfc ), ('rfc', param_grid_rfc ),
('knn', param_grid_knn), ('knn', param_grid_knn),
] ]
...@@ -3,6 +3,7 @@ import os ...@@ -3,6 +3,7 @@ import os
import time import time
import argparse import argparse
import pandas as pd import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor from data_preprocessing import Preprocessor
from features_extractor import feature_extractor from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
...@@ -64,9 +65,12 @@ config.read('settings.conf') ...@@ -64,9 +65,12 @@ config.read('settings.conf')
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df')) vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df')) vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size')) doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs')) max_epochs = int(config.get('vectorizers','max_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr')) doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
for columnInput in [columnText, 'firstParagraph']: for columnInput in [columnText, 'firstParagraph']:
...@@ -76,7 +80,7 @@ for columnInput in [columnText, 'firstParagraph']: ...@@ -76,7 +80,7 @@ for columnInput in [columnText, 'firstParagraph']:
df = df[df[columnClass] != 'unclassified'] df = df[df[columnClass] != 'unclassified']
y = df[columnClass] y = df[columnClass]
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y ) train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder() encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y) train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(test_y) valid_y = encoder.fit_transform(test_y)
...@@ -87,7 +91,7 @@ for columnInput in [columnText, 'firstParagraph']: ...@@ -87,7 +91,7 @@ for columnInput in [columnText, 'firstParagraph']:
features_techniques = [ features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))] ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]
......
[vectorizers] [vectorizers]
vectorization_max_df= 1.0 vectorization_max_df= 1.0
vectorization_min_df= 1 vectorization_min_df= 4
vectorization_numberOfFeatures= None vectorization_numberOfFeatures= None
doc2vec_vec_size = 300 doc2vec_vec_size = 700
doc2vec_epochs = 10 max_epochs = 10
doc2vec_lr = 0.025 doc2vec_min_count = 12
min_word_per_article = 4 doc2vec_dm = 0
doc2vec_workers = 4
min_word_per_article = 25
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment