Skip to content
Snippets Groups Projects
Commit a2ae5c96 authored by Ludovic Moncla
Browse files

[FIX] update number of jobs in GridSearchCV

parent 875518be
No related branches found
No related tags found
No related merge requests found
...@@ -14,26 +14,26 @@ classifiers = [ ...@@ -14,26 +14,26 @@ classifiers = [
('bayes', MultinomialNB()), ('bayes', MultinomialNB()),
('lr', LogisticRegression()), ('lr', LogisticRegression()),
('sgd', SGDClassifier()), ('sgd', SGDClassifier()),
('svm', SVC() ), #('decisionTree',DecisionTreeClassifier()),
('decisionTree',DecisionTreeClassifier()),
('rfc', RandomForestClassifier()), ('rfc', RandomForestClassifier()),
('knn', KNeighborsClassifier()) ('knn', KNeighborsClassifier()),
('svm', SVC() )
] ]
param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']} param_grid_svm = {'C':[1,10,100,1000],'gamma':[0.1,0.001,0.0001], 'kernel':['linear','rbf']}
param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } #param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] } param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]} param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1"], "max_iter" : [500]}
param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
grid_params = [ grid_params = [
('bayes', None), ('bayes', None),
('svm', param_grid_svm),
('decisionTree', param_grid_decisionTree),
('rfc', param_grid_rfc ),
('lr', param_grid_lr), ('lr', param_grid_lr),
('sgd', param_grid_sgd ), ('sgd', param_grid_sgd ),
#('decisionTree', param_grid_decisionTree),
('rfc', param_grid_rfc ),
('knn', param_grid_knn), ('knn', param_grid_knn),
('svm', param_grid_svm),
] ]
...@@ -68,8 +68,8 @@ doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size')) ...@@ -68,8 +68,8 @@ doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs')) doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr')) doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
for columnInput in [columnText, 'firstParagraph']:
for columnInput in ['firstParagraph',columnText]:
print('Process: ' + columnInput) print('Process: ' + columnInput)
extractor = feature_extractor(df, columnInput, columnClass) extractor = feature_extractor(df, columnInput, columnClass)
...@@ -77,7 +77,8 @@ for columnInput in [columnText, 'firstParagraph']: ...@@ -77,7 +77,8 @@ for columnInput in [columnText, 'firstParagraph']:
features_techniques = [ features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))] ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))
]
#prepare data #prepare data
df = df[df[columnClass] != 'unclassified'] df = df[df[columnClass] != 'unclassified']
...@@ -97,7 +98,7 @@ for columnInput in [columnText, 'firstParagraph']: ...@@ -97,7 +98,7 @@ for columnInput in [columnText, 'firstParagraph']:
model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl" model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
if clf_name != 'bayes' : if clf_name != 'bayes' :
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3) clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
elif feature_technique_name == 'doc2vec': elif feature_technique_name == 'doc2vec':
continue continue
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment