Commit 8882c956 authored by Ludovic Moncla

Merge branch 'branch_dev_vectorizationFeature' into 'master'

Branch dev vectorization feature

See merge request !4
parents a2ae5c96 c2f5f604
@@ -24,16 +24,17 @@ classifiers = [
param_grid_svm = {'C':[1,10,100,1000],'gamma':[0.1,0.001,0.0001], 'kernel':['linear','rbf']}
#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1"], "max_iter" : [500]}
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":['none',"l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
grid_params = [
('bayes', None),
('lr', param_grid_lr),
('sgd', param_grid_sgd ),
#('decisionTree', param_grid_decisionTree),
('svm', param_grid_svm),
('decisionTree', param_grid_decisionTree),
('rfc', param_grid_rfc ),
('knn', param_grid_knn),
('svm', param_grid_svm),
]
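For context, each (name, grid) pair above lines up positionally with a (name, estimator) entry in the `classifiers` list and is wrapped in a grid search by the training script further down. A minimal sketch of that pairing for the SGD entry, assuming an SGDClassifier estimator and the script's train_x/train_y names (illustrative only, not part of the commit):

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustively search the SGD grid with cross-validation and refit the best combination.
clf = GridSearchCV(SGDClassifier(), param_grid_sgd, refit=True, verbose=3, n_jobs=-1)
# clf.fit(train_x, train_y)
# clf.best_params_   # e.g. {'alpha': 0.001, 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2'}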
@@ -3,6 +3,7 @@ import os
import time
import argparse
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
@@ -64,44 +65,53 @@ config.read('settings.conf')
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
max_epochs = int(config.get('vectorizers','max_epochs'))
doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
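# Editorial note (not part of the commit): the config values arrive as strings, so the ternaries
# above parse "4" (vectorization_min_df) as an int, "1.0" (vectorization_max_df) fails isdigit()
# and falls through to float(), and "None" (vectorization_numberOfFeatures) fails isdigit() and
# resolves to None, i.e. no cap on the vocabulary size.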
for columnInput in ['firstParagraph',columnText]:
print('Process: ' + columnInput)
extractor = feature_extractor(df, columnInput, columnClass)
#prepare data
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(test_y)
extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))
]
('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]
#prepare data
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
#case of full text
for feature_technique_name, features in features_techniques:
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(test_y)
# features has the train_x and the test_x after vectorization
train_x, test_x = features
for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
clf_name, clf = tmp_clf
grid_param_name, grid_param = tmp_grid_params
print(clf_name, clf, grid_param_name, grid_param)
model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
if clf_name != 'bayes' :
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
elif feature_technique_name == 'doc2vec':
continue
t_begin = time.time()
if os.path.isfile(os.path.join('./models', model_file_name)):
......
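The remainder of the training loop is truncated in this view. Purely as an illustrative sketch (not the repository's code), a fitted grid search could be scored on the held-out split using the names already defined in this script:

from sklearn.metrics import classification_report

clf.fit(train_x, train_y)       # train_x: vectorized training split, train_y: label-encoded classes
pred_y = clf.predict(test_x)    # test_x was transformed with the vectorizer fitted on the training split
print(classification_report(valid_y, pred_y, target_names=encoder.classes_))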
@@ -8,20 +8,22 @@ import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import spacy
class feature_extractor:
def __init__(self, data, column, target):
def __init__(self, train_x, test_x, column, target):
self.column = column
self.data = data
self.X = data[column]
self.y = data[target]
#self.data = data
#self.X = data[column]
#self.y = data[target]
self.docs = []
for index, row in data.iterrows():
self.docs.append(row[column])
self.docs_train = train_x[column].tolist()
self.docs_test = test_x[column].tolist()
#for index, row in data.iterrows():
# self.docs.append(row[column])
def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
@@ -36,9 +38,9 @@ class feature_extractor:
stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
stem_vectorizer_fr.fit(self.docs)
stem_vectorizer_fr.fit(self.docs_train)
return stem_vectorizer_fr.transform(self.docs)
return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
@@ -53,37 +55,45 @@ class feature_extractor:
return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
tfidf_vectorizer.fit(self.docs)
return tfidf_vectorizer.transform(self.docs)
tfidf_vectorizer.fit(self.docs_train)
return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
# Previous implementation (removed): trains on self.docs with a manual learning-rate schedule
# and returns a single matrix of training-document vectors.
def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
    model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
    set_tags = list(model.docvecs.doctags)
    nb_docs_small = len(set_tags)
    doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
    i = 0
    for t in set_tags:
        doc_vec_doc2vec[i] = model.docvecs[t]
        i += 1
    return doc_vec_doc2vec

# New implementation (added): tokenizes with spaCy, trains only on the training documents and
# infers vectors for the test documents, returning an (X_train, X_test) pair.
def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm):
    nlp = spacy.load("fr_core_news_sm")
    stopWords = set(stopwords.words('french'))  # nltk stopwords, presumably imported earlier in the file (not shown in this hunk)

    def tokenize_fr_text(sentence):
        result = string.punctuation
        # Tokenize the sentence
        doc = nlp(sentence)
        # Return the text of each token (lower-cased, dropping stop words, punctuation and single characters)
        return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in result and not len(X.text) < 2]

    #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
    tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
    # Tag test set
    tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
    model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm)
    model.build_vocab(tagged_tr)
    model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)
    X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
    X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
    return X_train, X_test
def text_based_features(self):
......
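To illustrate how the refactored class is meant to be called, a sketch using the argument values from settings.conf below and the train_x/test_x, columnInput and columnClass names from the training script above (illustrative, not repository code):

extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
# Each vectorizer is now fitted on the training split only and returns a (train, test) pair,
# so test-set vocabulary statistics do not leak into the features.
train_counts, test_counts = extractor.count_vect(max_df=1.0, min_df=4)
train_tfidf, test_tfidf = extractor.tf_idf(max_df=1.0, min_df=4)
train_d2v, test_d2v = extractor.doc2vec(max_epochs=10, doc2vec_vec_size=700, doc2vec_min_count=12, doc2vec_dm=0)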
[vectorizers]
vectorization_max_df= 1.0
vectorization_min_df= 1
vectorization_min_df= 4
vectorization_numberOfFeatures= None
doc2vec_vec_size = 300
doc2vec_epochs = 10
doc2vec_lr = 0.025
min_word_per_article = 4
doc2vec_vec_size = 700
max_epochs = 10
doc2vec_min_count = 12
doc2vec_dm = 0
doc2vec_workers = 4
min_word_per_article = 25
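With these settings, the Doc2Vec model constructed in features_extractor.py amounts to the call below (dm=0 selects PV-DBOW rather than PV-DM, as noted in the training script; doc2vec_workers is read from the config but not passed to the constructor in the hunk shown):

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(vector_size=700, min_count=12, dm=0)  # PV-DBOW, 700-dimensional vectors, words seen fewer than 12 times ignored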