# data_preprocessing.py
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk


class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        # drop rows whose columnName value is missing, then rebuild a 0..n-1 index
        # df = df[df[columnName].notna()]
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return
    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # remove null values first, or add a condition if needed
        # self.remove_null_rows(df, markerColumn)
        # self.remove_null_rows(df, textColumn)
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                # remove exact occurrences of the marker, with and without brackets
                full_text = row[textColumn].replace(marker_with_brcts, "")
                full_text = full_text.replace(marker, "")
                # remove remaining occurrences whose accents differ: search on the
                # unidecoded text and cut the match out of the original text
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # write back through the DataFrame: assigning to the iterrows row
                # only modifies a copy
                df.at[index, textColumn] = full_text
        return df
    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            # tokenize and drop French stop words
            return (w for w in analyzer(doc) if w not in stop_words)

        # the custom analyzer already filters French stop words, so stop_words is
        # not passed to CountVectorizer ('french' is not a value it accepts)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr, max_df=max_word_occurence,
                                             min_df=min_word_occurence, max_features=None)
        docs = df[textColumn].tolist()
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join the kept tokens to recreate the text (word order and counts are not preserved)
            new_text = ' '.join(tokens)
            df.at[index, textColumn] = new_text
        return
    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)

        # stop words are filtered by the custom analyzer (see removeWordsByFrequency)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = df[textColumn].tolist()
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        # collect the indices of articles that keep too few tokens
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        # drop in place: drop(..., inplace=True) returns None, so df must not be rebound
        df.drop(index=concerned_article_index, inplace=True)
        return
    def getFirstParagraph(self, df, textColumn, columnName):
        # store the first paragraph of each text in a new column
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        # store the first sentence of each text in a new column
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return
    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
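

# Minimal usage sketch: the input/output paths and the column names 'content'
# and 'class' below are illustrative assumptions, not names taken from the project.
if __name__ == "__main__":
    nltk.download('stopwords')  # French stop-word lists used above
    nltk.download('punkt')      # tokenizer models used by nltk.sent_tokenize

    preprocessor = Preprocessor()
    df = pd.read_csv('articles.csv')  # hypothetical input file
    preprocessor.remove_null_rows(df, 'content')
    preprocessor.removeMarkers(df, 'content', markerColumn='class')
    preprocessor.getFirstSentence(df, 'content', 'first_sentence')
    preprocessor.removeWordsByFrequency(df, 'content', min_word_occurence=2, max_word_occurence=0.8)
    preprocessor.saveDataFrametoCSV(df, 'articles_preprocessed.csv')  # hypothetical output file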