diff --git a/classifiers.py b/classifiers.py
index f68b2c6519a28447f62aa24426ef707455155f4f..c061dac54e384c33fe4d9c33f23c977f7fddd3e5 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -30,10 +30,11 @@ param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'di
 
 grid_params = [
                ('bayes', None),
+               ('lr', param_grid_lr),
+               ('sgd', param_grid_sgd ),
                ('svm', param_grid_svm),
                ('decisionTree', param_grid_decisionTree),
                ('rfc', param_grid_rfc ),
-               ('lr', param_grid_lr),
-               ('sgd', param_grid_sgd ),
-               ('knn', param_grid_knn),
+               ('knn', param_grid_knn),
+              ]
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 1cc2f91dac3edb0d237da6605ce743a112c0e01c..35da41c4e80bf9e430df179bef1fb0a3e1f2f2d6 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -72,35 +72,42 @@ for columnInput in [columnText, 'firstParagraph']:
 
     print('Process: ' + columnInput)
 
-    extractor = feature_extractor(df, columnInput, columnClass)
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+    encoder = preprocessing.LabelEncoder()
+    train_y = encoder.fit_transform(train_y)
+    valid_y = encoder.transform(test_y)
+
+
+    extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
 
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
     ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
 
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
+
 
     #case of full text
     for feature_technique_name, features in features_techniques:
-        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-        encoder = preprocessing.LabelEncoder()
-        train_y = encoder.fit_transform(train_y)
-        valid_y = encoder.fit_transform(test_y)
+
+        # features holds the vectorized train_x and test_x
+        train_x, test_x = features
 
         for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
            clf_name, clf = tmp_clf
            grid_param_name, grid_param = tmp_grid_params
            print(clf_name, clf, grid_param_name, grid_param)
            model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
 
            if clf_name != 'bayes' :
                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
            elif feature_technique_name == 'doc2vec':
                continue
 
            t_begin = time.time()
 
            if os.path.isfile(os.path.join('./models', model_file_name)):
diff --git a/features_extractor.py b/features_extractor.py
index a0c99fe4cd018aff722527e727cb51a516b0f315..56d1944087d6153ebc0f810c2f16b20e83826c23 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -12,16 +12,17 @@ from nltk.tokenize import word_tokenize
 
 
 class feature_extractor:
 
-    def __init__(self, data, column, target):
+    def __init__(self, train_x, test_x, column, target):
         self.column = column
-        self.data = data
-        self.X = data[column]
-        self.y = data[target]
+        #self.data = data
+        #self.X = data[column]
+        #self.y = data[target]
 
-        self.docs = []
-        for index, row in data.iterrows():
-            self.docs.append(row[column])
+        self.docs_train = train_x[column].tolist()
+        self.docs_test = test_x[column].tolist()
+        #for index, row in data.iterrows():
+        #    self.docs.append(row[column])
 
 
     def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
@@ -36,9 +37,9 @@
 
         stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
 
-        stem_vectorizer_fr.fit(self.docs)
+        stem_vectorizer_fr.fit(self.docs_train)
 
-        return stem_vectorizer_fr.transform(self.docs)
+        return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
 
 
     def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
@@ -53,21 +54,26 @@
         def stemmed_words_fr(doc):
             return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
 
         tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
-        tfidf_vectorizer.fit(self.docs)
-        return tfidf_vectorizer.transform(self.docs)
+        tfidf_vectorizer.fit(self.docs_train)
+        return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
 
 
     def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
 
-        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
 
-        model.build_vocab(tagged_data)
+        model.build_vocab(tagged_tr)
 
         for epoch in range(max_epochs):
             print('iteration {0}'.format(epoch))
-            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
+            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
             # decrease the learning rate
             model.alpha -= 0.0002
             # fix the learning rate, no decay
@@ -78,12 +84,13 @@
         nb_docs_small = len(set_tags)
         doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
 
-        i = 0
-        for t in set_tags:
-            doc_vec_doc2vec[i] = model.docvecs[t]
-            i += 1
-
-        return doc_vec_doc2vec
+        #i = 0
+        #for t in set_tags:
+        #    doc_vec_doc2vec[i] = model.docvecs[t]
+        #    i += 1
+        X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
+        X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+        return X_train, X_test
 
 
     def text_based_features(self):
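
Note (not part of the patch): the point of the restructuring above is to split the corpus before any vectorizer is fitted, so the test split contributes neither vocabulary nor document frequencies to training. A minimal, self-contained sketch of this split-then-vectorize pattern, with hypothetical data and column names:

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split

    # Hypothetical toy corpus; the real project loads a labelled DataFrame.
    df = pd.DataFrame({
        'content': ['un chat noir', 'un chien brun', 'le chat dort', 'le chien court'],
        'label':   ['chat', 'chien', 'chat', 'chien'],
    })

    # Split first, stratified on the label, as experimentsClassicClassifiers.py now does.
    train_x, test_x, train_y, test_y = train_test_split(
        df, df['label'], test_size=0.5, random_state=42, stratify=df['label'])

    # Fit on the training split only; the test split is merely transformed.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_x['content'])
    X_test = vectorizer.transform(test_x['content'])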
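Note (not part of the patch): doc2vec needs the same discipline but has no transform(); unseen documents are instead embedded with infer_vector() against a model trained on the training split alone, which is what the rewritten doc2vec() does. A minimal sketch, assuming the gensim 4.x API (model.dv, model.epochs) rather than the older docvecs/iter attributes used in this repository:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    train_docs = ['le chat dort', 'le chien court']   # placeholder documents
    test_docs = ['un chat court']

    # Tag and train on the training documents only.
    tagged_tr = [TaggedDocument(words=d.split(), tags=[str(i)])
                 for i, d in enumerate(train_docs)]
    model = Doc2Vec(vector_size=50, min_count=1, epochs=20)
    model.build_vocab(tagged_tr)
    model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.epochs)

    # Training vectors are looked up by tag; test vectors are inferred,
    # leaving the trained model untouched.
    X_train = [model.dv[str(i)] for i in range(len(tagged_tr))]
    X_test = [model.infer_vector(d.split()) for d in test_docs]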