from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy
import string
import pandas as pd
import numpy as np
class FeatureExtraction:  # class wrapper added here; the name is assumed
    """Builds train/test feature matrices from the text column of two DataFrames."""

    def __init__(self, train_x, test_x, column, target):
        # Keep the raw documents of each split; `column` is the text field, `target` the label.
        self.docs_train = train_x[column].tolist()
        self.docs_test = test_x[column].tolist()
    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # Bag-of-words counts over stemmed, stop-word-filtered French tokens.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        # The custom analyzer already removes stop words, so no stop_words argument is
        # passed (scikit-learn has no built-in French stop-word list).
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr, max_df=max_df,
                                             min_df=min_df, max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs_train)
        return (stem_vectorizer_fr.transform(self.docs_train),
                stem_vectorizer_fr.transform(self.docs_test))
    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # TF-IDF weights over the same stemmed, stop-word-filtered French tokens.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df=max_df,
                                           min_df=min_df, max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs_train)
        return (tfidf_vectorizer.transform(self.docs_train),
                tfidf_vectorizer.transform(self.docs_test))
    def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count, doc2vec_dm, doc2vec_workers):
        # Doc2Vec embeddings: the model is trained on the train split and vectors for
        # the test split are inferred from the trained model.
        nlp = spacy.load("fr_core_news_sm")
        stop_words = set(stopwords.words('french'))
        punctuation = string.punctuation

        def tokenize_fr_text(sentence):
            # Tokenise the sentence with spaCy and return the lower-cased text of each
            # token, dropping stop words, punctuation and one-character tokens.
            doc = nlp(sentence)
            return [token.text.lower() for token in doc
                    if token.text.lower() not in stop_words
                    and token.text not in punctuation
                    and len(token.text) >= 2]

        tagged_tr = [TaggedDocument(words=tokenize_fr_text(_d), tags=[str(i)])
                     for i, _d in enumerate(self.docs_train)]
        # Tag the test set
        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags=[str(i)])
                       for i, _d in enumerate(self.docs_test)]

        model = Doc2Vec(vector_size=doc2vec_vec_size, min_count=doc2vec_min_count,
                        dm=doc2vec_dm, workers=doc2vec_workers)
        model.build_vocab(tagged_tr)
        model.train(tagged_tr, total_examples=model.corpus_count, epochs=max_epochs)

        X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
        X_test = np.array([model.infer_vector(tagged_test[i].words) for i in range(len(tagged_test))])
        return X_train, X_test
    def text_based_features(self):
        # Classical surface measures (character/word counts, punctuation and casing
        # statistics), computed separately for the train and the test documents.
        def build(docs):
            texts = pd.Series(docs)
            df = pd.DataFrame()
            df['char_count'] = texts.apply(len)
            df['word_count'] = texts.apply(lambda x: len(x.split()))
            # +1 avoids a division by zero on empty documents
            df['word_density'] = df['char_count'] / (df['word_count'] + 1)
            df['punctuation_count'] = texts.apply(
                lambda x: len("".join(c for c in x if c in string.punctuation)))
            df['title_word_count'] = texts.apply(
                lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
            df['upper_case_word_count'] = texts.apply(
                lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
            return df

        return build(self.docs_train), build(self.docs_test)
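
# --- Usage sketch (not from the original): the DataFrame contents, column names and
# hyperparameter values below are illustrative assumptions only.
# Requires the NLTK French stop words (nltk.download('stopwords')) and the spaCy model
# installed via `python -m spacy download fr_core_news_sm`.
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    data = pd.DataFrame({
        "texte": ["Le produit est excellent.",
                  "Livraison trop lente.",
                  "Service client très réactif.",
                  "Je ne recommande pas cet article."],
        "label": [1, 0, 1, 0],
    })
    train_x, test_x = train_test_split(data, test_size=0.25, random_state=42)

    fe = FeatureExtraction(train_x, test_x, column="texte", target="label")

    X_train_counts, X_test_counts = fe.count_vect()
    X_train_tfidf, X_test_tfidf = fe.tf_idf()
    X_train_d2v, X_test_d2v = fe.doc2vec(max_epochs=20, doc2vec_vec_size=50,
                                         doc2vec_min_count=1, doc2vec_dm=1,
                                         doc2vec_workers=2)
    train_stats, test_stats = fe.text_based_features()

    print(X_train_counts.shape, X_train_tfidf.shape, X_train_d2v.shape, train_stats.shape)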