From 631df416a23fb088d61dd5586a0f3f5bb6de5832 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Wed, 9 Jun 2021 16:55:04 +0200
Subject: [PATCH 1/3] [FIX] update feature extractor with split

---
 classifiers.py                   |  7 +++--
 experimentsClassicClassifiers.py | 27 +++++++++++-------
 features_extractor.py            | 47 ++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/classifiers.py b/classifiers.py
index f68b2c6..c061dac 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -30,10 +30,11 @@ param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'di
 
 grid_params = [ ('bayes', None),
+                ('lr', param_grid_lr),
+                ('sgd', param_grid_sgd ),
                 ('svm', param_grid_svm),
                 ('decisionTree', param_grid_decisionTree),
                 ('rfc', param_grid_rfc ),
-                ('lr', param_grid_lr),
-                ('sgd', param_grid_sgd ),
-                ('knn', param_grid_knn),
+                ('knn', param_grid_knn),
+              ]
 
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 1cc2f91..35da41c 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -72,35 +72,42 @@ for columnInput in [columnText, 'firstParagraph']:
 
     print('Process: ' + columnInput)
 
-    extractor = feature_extractor(df, columnInput, columnClass)
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+
+    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
+    encoder = preprocessing.LabelEncoder()
+    train_y = encoder.fit_transform(train_y)
+    valid_y = encoder.fit_transform(test_y)
+
+
+    extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
 
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
    ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
 
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
+
+    #case of full text
     for feature_technique_name, features in features_techniques:
 
-        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
-        encoder = preprocessing.LabelEncoder()
-        train_y = encoder.fit_transform(train_y)
-        valid_y = encoder.fit_transform(test_y)
+
+        # features has the train_x and the test_x after vectorization
+        train_x, test_x = features
 
         for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
             clf_name, clf = tmp_clf
             grid_param_name, grid_param = tmp_grid_params
             print(clf_name, clf, grid_param_name, grid_param)
             model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
-
+
             if clf_name != 'bayes' :
                 clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
             elif feature_technique_name == 'doc2vec':
                 continue
-
+
             t_begin = time.time()
 
             if os.path.isfile(os.path.join('./models', model_file_name)):
diff --git a/features_extractor.py b/features_extractor.py
index a0c99fe..56d1944 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -12,16 +12,17 @@ from nltk.tokenize import word_tokenize
 
 class feature_extractor:
 
-    def __init__(self, data, column, target):
+    def __init__(self, train_x, test_x, column, target):
 
         self.column = column
-        self.data = data
-        self.X = data[column]
-        self.y = data[target]
+        #self.data = data
+        #self.X = data[column]
+        #self.y = data[target]
 
-        self.docs = []
-        for index, row in data.iterrows():
-            self.docs.append(row[column])
+        self.docs_train = train_x[column].tolist()
+        self.docs_test = test_x[column].tolist()
+        #for index, row in data.iterrows():
+        #    self.docs.append(row[column])
 
 
     def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
@@ -36,9 +37,9 @@ class feature_extractor:
 
         stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
 
-        stem_vectorizer_fr.fit(self.docs)
+        stem_vectorizer_fr.fit(self.docs_train)
 
-        return stem_vectorizer_fr.transform(self.docs)
+        return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
 
 
     def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
@@ -53,21 +54,26 @@ class feature_extractor:
             return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
 
         tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
-        tfidf_vectorizer.fit(self.docs)
-        return tfidf_vectorizer.transform(self.docs)
+        tfidf_vectorizer.fit(self.docs_train)
+        return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
 
 
 
     def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
 
-        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+
         model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
 
-        model.build_vocab(tagged_data)
+        model.build_vocab(tagged_tr)
 
         for epoch in range(max_epochs):
             print('iteration {0}'.format(epoch))
-            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
+            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
             # decrease the learning rate
             model.alpha -= 0.0002
             # fix the learning rate, no decay
@@ -78,12 +84,13 @@ class feature_extractor:
         nb_docs_small = len(set_tags)
         doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
 
-        i = 0
-        for t in set_tags:
-            doc_vec_doc2vec[i] = model.docvecs[t]
-            i += 1
-
-        return doc_vec_doc2vec
+        #i = 0
+        #for t in set_tags:
+        #    doc_vec_doc2vec[i] = model.docvecs[t]
+        #    i += 1
+        X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
+        X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+        return X_train, X_test
 
 
     def text_based_features(self):
-- 
GitLab

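Note (not part of the patch): after this change every vectorizer in feature_extractor is fit on the training documents only and returns a (train, test) pair of matrices, so the held-out split no longer influences the vocabulary or the IDF statistics. A minimal usage sketch of the intended call order, splitting the raw dataframe first as the series settles on in PATCH 3/3; the dataframe and the 'content'/'class' column names below are placeholders, not part of the patch:

    from sklearn.model_selection import train_test_split
    from features_extractor import feature_extractor

    # split the raw dataframe before any vectorization
    df = df[df['class'] != 'unclassified']
    train_x, test_x, train_y, test_y = train_test_split(
        df, df['class'], test_size=0.33, random_state=42, stratify=df['class'])

    extractor = feature_extractor(train_x, test_x, 'content', 'class')
    # each method fits on the train split and transforms both splits
    train_tfidf, test_tfidf = extractor.tf_idf(max_df=1.0, min_df=1, numberOfFeatures=None)
    train_counts, test_counts = extractor.count_vect(max_df=1.0, min_df=1, numberOfFeatures=None)
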
From b98176ec8e6147467fe282ddbe2011bbcd93878a Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sat, 12 Jun 2021 16:20:08 +0200
Subject: [PATCH 2/3] [FIX] update doc2vec in feature extractor

---
 features_extractor.py | 45 +++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/features_extractor.py b/features_extractor.py
index 56d1944..e807b08 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.tokenize import word_tokenize
+import spacy
 
 
 class feature_extractor:
@@ -60,36 +61,38 @@ class feature_extractor:
 
 
 
-    def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
-
-        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
-        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
-        #Tag test set
-        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+    def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm):
+        nlp = spacy.load("fr_core_news_sm")
+        stopWords = set(stopwords.words('french'))
 
-        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
-        model.build_vocab(tagged_tr)
-        for epoch in range(max_epochs):
-            print('iteration {0}'.format(epoch))
-            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
-            # decrease the learning rate
-            model.alpha -= 0.0002
-            # fix the learning rate, no decay
-            model.min_alpha = model.alpha
+        def tokenize_fr_text(sentence):
+
+            result = string.punctuation
+
+
+            # Tokenize the sentence
+            doc = nlp(sentence)
+            # Return the text of each token
+            return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in result and not len(X.text) < 2]
+
+
+        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
+        #Tag test set
+        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+
+        model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm)
+        model.build_vocab(tagged_tr)
+        model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)
 
-        set_tags = list(model.docvecs.doctags)
-        nb_docs_small = len(set_tags)
-        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
-        #i = 0
-        #for t in set_tags:
-        #    doc_vec_doc2vec[i] = model.docvecs[t]
-        #    i += 1
         X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
         X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+
         return X_train, X_test
-- 
GitLab

From e659184ad250574cf37c29465398e9cfd7a3d864 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sat, 12 Jun 2021 17:13:29 +0200
Subject: [PATCH 3/3] [FIX] update classifiers in split and feature extraction
 order

---
 classifiers.py                   |  4 ++--
 experimentsClassicClassifiers.py | 12 ++++++++----
 settings.conf                    | 12 +++++++-----
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/classifiers.py b/classifiers.py
index c061dac..5eb6173 100644
--- a/classifiers.py
+++ b/classifiers.py
@@ -24,7 +24,7 @@ classifiers = [
 
 param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
 param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
 param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
-param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
+param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
 param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
 param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
 
@@ -35,6 +35,6 @@ grid_params = [ ('bayes', None),
                 ('svm', param_grid_svm),
                 ('decisionTree', param_grid_decisionTree),
                 ('rfc', param_grid_rfc ),
-                ('knn', param_grid_knn),
+                ('knn', param_grid_knn),
               ]
 
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 35da41c..a3f2af2 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -3,6 +3,7 @@ import os
 import time
 import argparse
 import pandas as pd
+import numpy as np
 from data_preprocessing import Preprocessor
 from features_extractor import feature_extractor
 from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
@@ -64,9 +65,12 @@ config.read('settings.conf')
 vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
 vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
 vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
+
 doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
-doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
-doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
+max_epochs = int(config.get('vectorizers','max_epochs'))
+doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
+doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
+doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 
 
 for columnInput in [columnText, 'firstParagraph']:
@@ -76,7 +80,7 @@ for columnInput in [columnText, 'firstParagraph']:
     df = df[df[columnClass] != 'unclassified']
     y = df[columnClass]
 
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
     encoder = preprocessing.LabelEncoder()
     train_y = encoder.fit_transform(train_y)
     valid_y = encoder.fit_transform(test_y)
@@ -87,7 +91,7 @@ for columnInput in [columnText, 'firstParagraph']:
 
     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
    ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]
 
 
 
diff --git a/settings.conf b/settings.conf
index f1ef2be..eebf815 100644
--- a/settings.conf
+++ b/settings.conf
@@ -1,8 +1,10 @@
 [vectorizers]
 vectorization_max_df= 1.0
-vectorization_min_df= 1
+vectorization_min_df= 4
 vectorization_numberOfFeatures= None
-doc2vec_vec_size = 300
-doc2vec_epochs = 10
-doc2vec_lr = 0.025
-min_word_per_article = 4
+doc2vec_vec_size = 700
+max_epochs = 10
+doc2vec_min_count = 12
+doc2vec_dm = 0
+doc2vec_workers = 4
+min_word_per_article = 25
-- 
GitLab
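
Note (not part of the patches): the reworked doc2vec() relies on the usual train/infer asymmetry, vectors for training documents are read back by tag from model.docvecs, while held-out documents are embedded with infer_vector. A small self-contained sketch of that asymmetry, using invented toy documents and the pre-4.0 gensim API that the patches use (in gensim 4 and later, model.docvecs becomes model.dv):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    # toy training corpus, tags are the string indices used in the patch
    train_docs = [TaggedDocument(words=['bonjour', 'le', 'monde'], tags=['0']),
                  TaggedDocument(words=['un', 'autre', 'texte'], tags=['1'])]

    model = Doc2Vec(vector_size=50, min_count=1, dm=0)   # dm=0 -> PV-DBOW, as in settings.conf
    model.build_vocab(train_docs)
    model.train(train_docs, total_examples=model.corpus_count, epochs=10)

    train_vec = model.docvecs['0']                        # vector learned during training
    test_vec = model.infer_vector(['texte', 'inconnu'])   # vector inferred for an unseen document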