diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py index 3208fb0a2ae70477e4d54ed1e5e420d88267c55b..530e60d672ef8cdeea668622359582a3eb76e428 100644 --- a/experimentsClassicClassifiers.py +++ b/experimentsClassicClassifiers.py @@ -91,7 +91,7 @@ for columnInput in [columnText]: features_techniques = [ ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), - ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))] + ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers))] diff --git a/features_extractor.py b/features_extractor.py index e807b0877e0ca15380c6da0477e592e215b787d7..18bf313a8433dbab9cea27b0e66ea81ad537e626 100644 --- a/features_extractor.py +++ b/features_extractor.py @@ -61,7 +61,7 @@ class feature_extractor: - def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm): + def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers): nlp = spacy.load("fr_core_news_sm") stopWords = set(stopwords.words('french')) @@ -84,7 +84,7 @@ class feature_extractor: #Tag test set tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)] - model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm) + model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm, workers = doc2vec_workers) model.build_vocab(tagged_tr) model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs) diff --git a/settings.conf b/settings.conf index eebf815d6e33aa5c2c6ff040920a05e33b6a7139..023215eb39cba2e6bc69adec1c05dbb117b55ac9 100644 --- a/settings.conf +++ b/settings.conf @@ -6,5 +6,5 @@ doc2vec_vec_size = 700 max_epochs = 10 doc2vec_min_count = 12 doc2vec_dm = 0 -doc2vec_workers = 4 +doc2vec_workers = 8 min_word_per_article = 25