diff --git a/features_extractor.py b/features_extractor.py
index 56d1944087d6153ebc0f810c2f16b20e83826c23..e807b0877e0ca15380c6da0477e592e215b787d7 100644
--- a/features_extractor.py
+++ b/features_extractor.py
@@ -8,6 +8,9 @@ import pandas as pd
 import numpy as np
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.tokenize import word_tokenize
+import spacy
+import string
+from nltk.corpus import stopwords
 
 
 class feature_extractor:
@@ -60,36 +61,38 @@ class feature_extractor:
-    def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1):
-        #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
-        tagged_tr = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
-        #Tag test set
-        tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
+    def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count, doc2vec_dm):
+        nlp = spacy.load("fr_core_news_sm")
+        stopWords = set(stopwords.words('french'))
 
-        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
-        model.build_vocab(tagged_tr)
-        for epoch in range(max_epochs):
-            print('iteration {0}'.format(epoch))
-            model.train(tagged_tr, total_examples=model.corpus_count, epochs=model.iter)
-            # decrease the learning rate
-            model.alpha -= 0.0002
-            # fix the learning rate, no decay
-            model.min_alpha = model.alpha
+        def tokenize_fr_text(sentence):
+            punctuation = set(string.punctuation)
+            # Tokenize the sentence with the French spaCy pipeline
+            doc = nlp(sentence)
+            # Keep lowercased tokens that are not stop words, punctuation, or single characters
+            return [token.text.lower() for token in doc
+                    if token.text not in stopWords
+                    and token.text not in punctuation
+                    and len(token.text) >= 2]
+
+        # Tag the training set
+        tagged_tr = [TaggedDocument(words=tokenize_fr_text(_d), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
+        # Tag the test set
+        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags=[str(i)]) for i, _d in enumerate(self.docs_test)]
+
+        # Build the vocabulary and train for max_epochs in a single call
+        model = Doc2Vec(vector_size=doc2vec_vec_size, min_count=doc2vec_min_count, dm=doc2vec_dm)
+        model.build_vocab(tagged_tr)
+        model.train(tagged_tr, total_examples=model.corpus_count, epochs=max_epochs)
 
-        set_tags = list(model.docvecs.doctags)
-        nb_docs_small = len(set_tags)
-        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
-        #i = 0
-        #for t in set_tags:
-        #    doc_vec_doc2vec[i] = model.docvecs[t]
-        #    i += 1
         X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
         X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
+
         return X_train, X_test
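
Usage note (not part of the diff): a minimal sketch of how the revised doc2vec() method could be called. The no-argument constructor and assigning docs_train/docs_test directly are assumptions for illustration; only the method name and its four parameters come from the change above. The sketch also assumes the spaCy French model and the NLTK resources have been downloaded (python -m spacy download fr_core_news_sm; nltk.download('stopwords')).

    from features_extractor import feature_extractor

    fe = feature_extractor()  # hypothetical constructor call
    fe.docs_train = ["Le chat dort sur le canapé.", "Il pleut beaucoup à Paris."]
    fe.docs_test = ["Le chien court dans le jardin."]

    # PV-DM (dm=1), 50-dimensional vectors, 30 epochs, keep every token (min_count=1)
    X_train, X_test = fe.doc2vec(max_epochs=30, doc2vec_vec_size=50,
                                 doc2vec_min_count=1, doc2vec_dm=1)
    print(X_train.shape, X_test.shape)  # expected: (2, 50) (1, 50)

Compatibility caveat: the unchanged line X_train = np.array([model.docvecs[str(i)] ...]) uses the gensim 3.x API; in gensim 4.0 model.docvecs was renamed to model.dv, so this line may need updating on newer gensim releases.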