import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk


class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        # Drop rows where columnName is null and reindex the DataFrame in place.
        # df = df[df[columnName].notna()]
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove every occurrence of the row's marker, with and without
        # parentheses, from the text column.
        # remove null values or add condition if exist
        # self.remove_null_rows(df, markerColumn)
        # self.remove_null_rows(df, textColumn)
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn].replace(marker_with_brcts, "")
                full_text = full_text.replace(marker, "")
                # Also remove matches found in the accent-stripped (unidecode) text.
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # Write back through the DataFrame: assigning to the iterrows
                # row would only modify a copy.
                df.at[index, textColumn] = full_text
        return df

    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        # Keep only tokens whose document frequency lies between
        # min_word_occurence and max_word_occurence, then rebuild each text
        # from the remaining vocabulary (token order and duplicates are lost).
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)

        # Stop words are filtered inside the custom analyzer; CountVectorizer
        # has no built-in 'french' stop list, so stop_words is not passed here.
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr,
                                             max_df=max_word_occurence,
                                             min_df=min_word_occurence,
                                             max_features=None)
        docs = [row[textColumn] for index, row in df.iterrows()]
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join tokens to recreate the text with the filtered vocabulary
            new_text = ' '.join(tokens)
            df.loc[index, textColumn] = new_text
        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        # Drop articles containing at most min_word_per_article distinct tokens
        # (stop words excluded).
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)

        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = [row[textColumn] for index, row in df.iterrows()]
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        # drop() with inplace=True returns None, so do not reassign the result to df
        df.drop(index=concerned_article_index, inplace=True)
        return

    def getFirstParagraph(self, df, textColumn, columnName):
        # Store the first paragraph of each article in a new column.
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        # Store the first sentence of each article in a new column.
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
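

if __name__ == "__main__":
    # Minimal usage sketch. The CSV paths and the column names 'text' and
    # 'class' are assumptions for illustration; adapt them to your data.
    nltk.download('stopwords')  # required by the French stop-word filtering above
    nltk.download('punkt')      # required by getFirstSentence (sentence tokenizer)

    prep = Preprocessor()
    df = pd.read_csv('articles.csv')                    # hypothetical input file
    prep.remove_null_rows(df, 'text')                   # drop rows without text
    df = prep.removeMarkers(df, 'text', markerColumn='class')
    prep.removeWordsByFrequency(df, 'text', min_word_occurence=2, max_word_occurence=0.9)
    prep.getFirstSentence(df, 'text', 'first_sentence')
    prep.saveDataFrametoCSV(df, 'articles_preprocessed.csv')  # hypothetical output path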