diff --git a/classifiers.py b/classifiers.py
index e63763d77048288212348816b6d33062add88758..e8f13a89f9313514a68664e3092586c7032c9c46 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -24,16 +24,17 @@ classifiers = [
 param_grid_svm = {'C':[1,10,100,1000],'gamma':[0.1,0.001,0.0001], 'kernel':['linear','rbf']}
-#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
+param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
-param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
-param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1"], "max_iter" : [500]}
+param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
+param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
 param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
 
 grid_params = [
                 ('bayes', None),
                 ('lr', param_grid_lr),
                 ('sgd', param_grid_sgd ),
-                #('decisionTree', param_grid_decisionTree),
+                ('svm', param_grid_svm),
+                ('decisionTree', param_grid_decisionTree),
                 ('rfc', param_grid_rfc ),
                 ('knn', param_grid_knn),
-                ('svm', param_grid_svm),
+
             ]
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 85326e2e812a4e603c0ca1068d5da1452191c91c..d9fbc4869bd76388be6bde697c378d362eb4c311 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -3,6 +3,7 @@ import os
 import time
 import argparse
 import pandas as pd
+import numpy as np
 from data_preprocessing import Preprocessor
 from features_extractor import feature_extractor
 from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
@@ -64,44 +65,53 @@ config.read('settings.conf')
 vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
 vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
 vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
+
 doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
-doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
-doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
+max_epochs = int(config.get('vectorizers','max_epochs'))
+doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
+doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
+doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 
 for columnInput in ['firstParagraph',columnText]:
     print('Process: ' + columnInput)
 
-    extractor = feature_extractor(df, columnInput, columnClass)
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+    encoder = preprocessing.LabelEncoder()
+    train_y = encoder.fit_transform(train_y)
+    valid_y = encoder.transform(test_y)
+
+
+    extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
 
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))
-    ]
+    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]
+
 
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
 
     #case of full text
     for feature_technique_name, features in features_techniques:
 
-        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-        encoder = preprocessing.LabelEncoder()
-        train_y = encoder.fit_transform(train_y)
-        valid_y = encoder.fit_transform(test_y)
+
+        # features has the train_x and the test_x after vectorization
+        train_x, test_x = features
 
         for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
             clf_name, clf = tmp_clf
             grid_param_name, grid_param = tmp_grid_params
             print(clf_name, clf, grid_param_name, grid_param)
             model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
-
+
             if clf_name != 'bayes' :
                 clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
             elif feature_technique_name == 'doc2vec':
                 continue
-
+
             t_begin = time.time()
 
             if os.path.isfile(os.path.join('./models', model_file_name)):
diff --git a/features_extractor.py b/features_extractor.py
index a0c99fe4cd018aff722527e727cb51a516b0f315..e807b0877e0ca15380c6da0477e592e215b787d7 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -8,20 +8,22 @@ import pandas as pd
 import numpy as np
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.tokenize import word_tokenize
+import spacy
 
 
 class feature_extractor:
 
-    def __init__(self, data, column, target):
+    def __init__(self, train_x, test_x, column, target):
         self.column = column
-        self.data = data
-        self.X = data[column]
-        self.y = data[target]
+        #self.data = data
+        #self.X = data[column]
+        #self.y = data[target]
 
-        self.docs = []
-        for index, row in data.iterrows():
-            self.docs.append(row[column])
+        self.docs_train = train_x[column].tolist()
+        self.docs_test = test_x[column].tolist()
+        #for index, row in data.iterrows():
+        #    self.docs.append(row[column])
 
 
 
     def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
@@ -36,9 +38,9 @@ class feature_extractor:
         stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
 
-        stem_vectorizer_fr.fit(self.docs)
+        stem_vectorizer_fr.fit(self.docs_train)
 
-        return stem_vectorizer_fr.transform(self.docs)
+        return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
 
 
 
     def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
@@ -53,37 +55,45 @@ class feature_extractor:
             return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
 
         tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
-        tfidf_vectorizer.fit(self.docs)
-        return tfidf_vectorizer.transform(self.docs)
+        tfidf_vectorizer.fit(self.docs_train)
+        return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
 
 
-    def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
-        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
-        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
+    def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm):
 
-        model.build_vocab(tagged_data)
+        nlp = spacy.load("fr_core_news_sm")
+        stopWords = set(stopwords.words('french'))
 
-        for epoch in range(max_epochs):
-            print('iteration {0}'.format(epoch))
-            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
-            # decrease the learning rate
-            model.alpha -= 0.0002
-            # fix the learning rate, no decay
-            model.min_alpha = model.alpha
-        set_tags = list(model.docvecs.doctags)
-        nb_docs_small = len(set_tags)
-        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
+        def tokenize_fr_text(sentence):
 
-        i = 0
-        for t in set_tags:
-            doc_vec_doc2vec[i] = model.docvecs[t]
-            i += 1
+            result = string.punctuation
 
-        return doc_vec_doc2vec
+
+            # Tokenize the sentence
+            doc = nlp(sentence)
+            # Return the text of each token
+            return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in result and not len(X.text) < 2]
+
+
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+        model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm)
+        model.build_vocab(tagged_tr)
+        model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)
+
+
+
+        X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
+        X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+
+        return X_train, X_test
 
 
     def text_based_features(self):
diff --git a/settings.conf b/settings.conf
index f1ef2be9fa2c8509e308b79fb8e8d137d295d93f..eebf815d6e33aa5c2c6ff040920a05e33b6a7139 100644
--- a/settings.conf
+++ b/settings.conf
@@ -1,8 +1,10 @@
 [vectorizers]
 vectorization_max_df= 1.0
-vectorization_min_df= 1
+vectorization_min_df= 4
 vectorization_numberOfFeatures= None
-doc2vec_vec_size = 300
-doc2vec_epochs = 10
-doc2vec_lr = 0.025
-min_word_per_article = 4
+doc2vec_vec_size = 700
+max_epochs = 10
+doc2vec_min_count = 12
+doc2vec_dm = 0
+doc2vec_workers = 4
+min_word_per_article = 25
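
For context, a minimal usage sketch of the refactored train/test-aware extractor API from this patch: the dataframe is split before vectorization, each vectorizer is fitted on the training documents only, and the returned (train, test) matrix pair is unpacked before classification. The CSV path and the 'contentText'/'class' column names below are placeholders, not part of the changeset.

# Minimal sketch of the new feature_extractor API (fit on train, transform train and test).
# "corpus.csv", "contentText" and "class" are placeholder names, not from the repository.
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from features_extractor import feature_extractor

df = pd.read_csv("corpus.csv")                    # placeholder dataset
df = df[df['class'] != 'unclassified']            # drop unlabeled rows, as in the experiment script
y = df['class']

train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)          # label mapping learned on the training split only
test_y = encoder.transform(test_y)

extractor = feature_extractor(train_x, test_x, 'contentText', 'class')
X_train, X_test = extractor.tf_idf(max_df=1.0, min_df=4)   # fitted on train_x, applied to both splits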