diff --git a/classifiers.py b/classifiers.py
index 5eb61732e265ba2179b631fec786f6c986a6753d..e8f13a89f9313514a68664e3092586c7032c9c46 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -14,15 +14,15 @@
 classifiers = [
     ('bayes', MultinomialNB()),
     ('lr', LogisticRegression()),
     ('sgd', SGDClassifier()),
-    ('svm', SVC() ),
-    ('decisionTree',DecisionTreeClassifier()),
+    #('decisionTree',DecisionTreeClassifier()),
     ('rfc', RandomForestClassifier()),
-    ('knn', KNeighborsClassifier())
+    ('knn', KNeighborsClassifier()),
+    ('svm', SVC() )
 ]
 
-param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
-param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
+param_grid_svm = {'C':[1,10,100,1000],'gamma':[0.1,0.001,0.0001], 'kernel':['linear','rbf']}
+#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
 param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index a3f2af24e759c6cd3fb911b025a1d8fa998cfbf6..d9fbc4869bd76388be6bde697c378d362eb4c311 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -72,8 +72,8 @@
 doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
 doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
 doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 
-for columnInput in [columnText, 'firstParagraph']:
+for columnInput in ['firstParagraph',columnText]:
 
     print('Process: ' + columnInput)
     #prepare data
@@ -108,7 +108,7 @@
         model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
 
         if clf_name != 'bayes' :
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
+            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
         elif feature_technique_name == 'doc2vec':
             continue
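
The sketch below illustrates how the `(name, estimator)` pairs and the `param_grid_*` dictionaries touched by this diff are assumed to feed `GridSearchCV` with `n_jobs=-1`. It is not the project's `experimentsClassicClassifiers.py` loop: the grid values are trimmed to version-safe subsets, and `X_train`/`y_train` are toy placeholders standing in for the vectorized text features.

```python
# Minimal sketch, assuming the classifier list and per-model grids are paired by name
# as in classifiers.py; data and grid values here are illustrative placeholders.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

classifiers = [
    ('bayes', MultinomialNB()),
    ('lr', LogisticRegression(max_iter=1000)),
    ('sgd', SGDClassifier()),
    ('rfc', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC()),
]

# Reduced, version-safe stand-ins for the param_grid_* dictionaries.
param_grids = {
    'lr':  {'C': np.logspace(-3, 3, 7), 'penalty': ['l2']},
    'sgd': {'alpha': [0.0001, 0.001, 0.01, 0.1], 'max_iter': [500]},
    'rfc': {'n_estimators': [200, 500], 'max_depth': [4, 6, 8], 'criterion': ['gini', 'entropy']},
    'knn': {'n_neighbors': [3, 5, 7]},
    'svm': {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']},
}

# Toy data standing in for the vectorized article text.
X_train, y_train = make_classification(n_samples=200, n_features=20, random_state=0)
X_train = np.abs(X_train)  # MultinomialNB requires non-negative features

for clf_name, clf in classifiers:
    if clf_name != 'bayes':
        # n_jobs=-1 runs the candidate fits in parallel on all available CPU cores.
        clf = GridSearchCV(clf, param_grids[clf_name], refit=True, verbose=3, n_jobs=-1)
    clf.fit(X_train, y_train)
    print(clf_name, getattr(clf, 'best_params_', 'no grid search'))
```

With `refit=True`, each `GridSearchCV` object ends up holding the best estimator refit on the full training split, so it can be pickled and reused directly; `n_jobs=-1` is the main speedup for the larger SVM and RandomForest grids.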