diff --git a/classifiers.py b/classifiers.py
index f68b2c6519a28447f62aa24426ef707455155f4f..e63763d77048288212348816b6d33062add88758 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -14,26 +14,26 @@
 classifiers = [
     ('bayes', MultinomialNB()),
     ('lr', LogisticRegression()),
     ('sgd', SGDClassifier()),
-    ('svm', SVC() ),
-    ('decisionTree',DecisionTreeClassifier()),
+    #('decisionTree',DecisionTreeClassifier()),
     ('rfc', RandomForestClassifier()),
-    ('knn', KNeighborsClassifier())
+    ('knn', KNeighborsClassifier()),
+    ('svm', SVC() )
 ]
 
-param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
-param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
+param_grid_svm = {'C':[1,10,100,1000],'gamma':[0.1,0.001,0.0001], 'kernel':['linear','rbf']}
+#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
 param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
-param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
+param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1"], "max_iter" : [500]}
 param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
 
 grid_params = [
     ('bayes', None),
-    ('svm', param_grid_svm),
-    ('decisionTree', param_grid_decisionTree),
-    ('rfc', param_grid_rfc ),
     ('lr', param_grid_lr),
     ('sgd', param_grid_sgd ),
+    #('decisionTree', param_grid_decisionTree),
+    ('rfc', param_grid_rfc ),
     ('knn', param_grid_knn),
+    ('svm', param_grid_svm),
 ]
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 1cc2f91dac3edb0d237da6605ce743a112c0e01c..85326e2e812a4e603c0ca1068d5da1452191c91c 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -68,8 +68,8 @@
 doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
 doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 
-for columnInput in [columnText, 'firstParagraph']:
+for columnInput in ['firstParagraph',columnText]:
 
     print('Process: ' + columnInput)
     extractor = feature_extractor(df, columnInput, columnClass)
@@ -77,7 +77,8 @@ for columnInput in [columnText, 'firstParagraph']:
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))
+    ]
 
     #prepare data
     df = df[df[columnClass] != 'unclassified']
@@ -97,7 +98,7 @@
             model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
 
             if clf_name != 'bayes' :
-                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
+                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
 
             elif feature_technique_name == 'doc2vec':
                 continue