from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class feature_extractor:
    """Extracts document features (bag-of-words counts, TF-IDF, Doc2Vec and
    surface text statistics) from a text column of a pandas DataFrame."""

    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        # Keep the raw documents as a plain list for the vectorizers below.
        self.docs = data[column].tolist()

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        """Bag-of-words counts with French stop-word removal and stemming."""
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            # Tokenize with the default analyzer, drop stop words, then stem.
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        # Stop words are already removed inside the custom analyzer, so the
        # vectorizer's stop_words option is left unset (scikit-learn has no
        # built-in French list and ignores stop_words when a callable
        # analyzer is given anyway).
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr,
                                             max_df=max_df,
                                             min_df=min_df,
                                             max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        """TF-IDF weights with French stop-word removal and stemming."""
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr,
                                           max_df=max_df,
                                           min_df=min_df,
                                           max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha=0.025, dm=1):
        """Trains a Doc2Vec model and returns one dense vector per document."""
        # Each document is tagged with its index so the learned vectors can be
        # mapped back to the original row order afterwards.
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
                       for i, _d in enumerate(self.docs)]

        model = Doc2Vec(vector_size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=1,
                        dm=dm)
        model.build_vocab(tagged_data)

        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)
            # Manually decrease the learning rate between passes...
            model.alpha -= 0.0002
            # ...and pin min_alpha to it so gensim applies no extra decay.
            model.min_alpha = model.alpha

        # Collect the vectors in document order (the tags are the stringified
        # document indices assigned above).
        doc_vec_doc2vec = np.zeros(shape=(len(tagged_data), vec_size))
        for i in range(len(tagged_data)):
            doc_vec_doc2vec[i] = model.dv[str(i)]  # model.docvecs[str(i)] on gensim < 4.0
        return doc_vec_doc2vec

    def text_based_features(self):
        """Classical surface statistics computed from the raw text."""
        text = self.data[self.column]
        df = pd.DataFrame(index=self.data.index)
        df['char_count'] = text.apply(len)
        df['word_count'] = text.apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = text.apply(
            lambda x: sum(1 for c in x if c in string.punctuation))
        df['title_word_count'] = text.apply(
            lambda x: len([w for w in x.split() if w.istitle()]))
        df['upper_case_word_count'] = text.apply(
            lambda x: len([w for w in x.split() if w.isupper()]))
        return df
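

# Example usage (a minimal sketch): the DataFrame below, its "text"/"label"
# column names, and the parameter values are illustrative assumptions, not
# part of the class above. Running it requires the NLTK 'stopwords' and
# 'punkt' data, e.g. nltk.download('stopwords') and nltk.download('punkt').
if __name__ == "__main__":
    sample = pd.DataFrame({
        "text": [
            "Le chat dort sur le canapé.",
            "Les chats dorment souvent pendant la journée.",
            "Un chien court dans le parc.",
        ],
        "label": [0, 0, 1],
    })

    fe = feature_extractor(sample, column="text", target="label")

    counts = fe.count_vect()                          # sparse bag-of-words matrix
    tfidf = fe.tf_idf(max_df=1.0, min_df=1)           # sparse TF-IDF matrix
    vectors = fe.doc2vec(max_epochs=5, vec_size=20)   # dense Doc2Vec matrix
    stats = fe.text_based_features()                  # DataFrame of surface statistics

    print(counts.shape, tfidf.shape, vectors.shape, stats.shape)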