from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class feature_extractor:
    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        # Keep the raw documents as a plain list for the vectorizers below.
        self.docs = data[column].tolist()

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        """Bag-of-words counts over stemmed, stopword-filtered French tokens."""
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        # Stopword removal happens inside the custom analyzer; passing
        # stop_words='french' to the vectorizer would be ignored ('french' is
        # not a built-in sklearn stop-word list), so it is omitted here.
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr,
                                             max_df=max_df, min_df=min_df,
                                             max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        """TF-IDF weights over stemmed, stopword-filtered French tokens."""
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr,
                                           max_df=max_df, min_df=min_df,
                                           max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha=0.025, dm=1):
        """Train a Doc2Vec model on the corpus and return one vector per document."""
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
                       for i, _d in enumerate(self.docs)]
        # dm=1 is the distributed-memory variant, dm=0 is DBOW; use the
        # method argument instead of a hard-coded value.
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025,
                        min_count=1, dm=dm)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count,
                        epochs=model.epochs)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        # Collect the learned document vectors (model.dv in gensim >= 4;
        # older releases exposed them as model.docvecs).
        set_tags = list(model.dv.index_to_key)
        nb_docs_small = len(set_tags)
        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
        for i, t in enumerate(set_tags):
            doc_vec_doc2vec[i] = model.dv[t]
        return doc_vec_doc2vec

    def text_based_features(self):
        # Classical measures computed directly from the raw text.
        df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density',
                                   'punctuation_count', 'title_word_count',
                                   'upper_case_word_count'])
        df['char_count'] = self.data[self.column].apply(len)
        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = self.data[self.column].apply(
            lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
        df['title_word_count'] = self.data[self.column].apply(
            lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        df['upper_case_word_count'] = self.data[self.column].apply(
            lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
        return df