# data_preprocessing.py
import pandas as pd
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
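
# NOTE: this class relies on NLTK corpora that must be downloaded once
# (an assumption about the runtime environment, not handled by this module):
#   nltk.download('stopwords')   # French stop word list used below
#   nltk.download('punkt')       # sentence tokenizer used by getFirstSentence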

class Preprocessor:

    def __init__(self):
        pass


    def remove_null_rows(self, df, columnName):
        # Drop rows whose columnName is null and reindex the frame in place.
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)

        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove every occurrence of the row's marker, with and without
        # parentheses, from the text column, including accent-insensitive matches.
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):

                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn]
                full_text = full_text.replace(marker_with_brcts, "")
                full_text = full_text.replace(marker, "")

                # Accent-insensitive pass: search the unidecoded text for the
                # unidecoded marker and cut the matching span from the original
                # text (assumes unidecode maps characters one-to-one, which holds
                # for typical French accented characters).
                target = unidecode(marker_with_brcts)
                i = unidecode(full_text).find(target)
                while i != -1:
                    full_text = full_text[:i] + full_text[i + len(marker_with_brcts):]
                    i = unidecode(full_text).find(target)

                # Write the cleaned text back to the DataFrame; assigning to the
                # row returned by iterrows() would only modify a copy.
                df.at[index, textColumn] = full_text

        return df


    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        # Rebuild each document keeping only tokens whose document frequency lies
        # between min_word_occurence and max_word_occurence (stop words excluded).
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)

        # Stop words are already filtered in token_fr; scikit-learn has no
        # built-in 'french' stop word list, so none is passed to CountVectorizer.
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr,
                                             max_df=max_word_occurence,
                                             min_df=min_word_occurence,
                                             max_features=None)

        docs = df[textColumn].tolist()

        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)

        for index, tokens in enumerate(tokens_per_docs):
            # Join the retained tokens to recreate the text; note that
            # inverse_transform returns each token once, without its original order.
            new_text = ' '.join(tokens)
            df.at[index, textColumn] = new_text

        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        # Drop articles whose number of distinct non-stop-word tokens is at most
        # min_word_per_article.
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)

        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)

        docs = df[textColumn].tolist()

        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)

        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)

        # drop(..., inplace=True) returns None, so its result must not be assigned
        # back to df; reindex afterwards, as in remove_null_rows.
        df.drop(index=concerned_article_index, inplace=True)
        df.reset_index(drop=True, inplace=True)

        return


    def getFirstParagraph(self, df, textColumn, columnName):
        # Store the first paragraph of each article in a new column.
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        # Store the first sentence of each article in a new column.
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
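
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): shows how the class might be
# driven end to end. The CSV paths and the 'content'/'class' column names are
# hypothetical and must be adapted to the real dataset.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    preprocessor = Preprocessor()
    df = pd.read_csv('articles.csv')  # hypothetical input file

    preprocessor.remove_null_rows(df, 'content')
    preprocessor.remove_null_rows(df, 'class')
    df = preprocessor.removeMarkers(df, 'content', markerColumn='class')
    preprocessor.removeArticlesByTokensNumbers(df, 'content', min_word_per_article=10)

    # Extract paragraph/sentence columns before the text is rewritten, since
    # removeWordsByFrequency discards paragraph breaks and word order.
    preprocessor.getFirstParagraph(df, 'content', 'firstParagraph')
    preprocessor.getFirstSentence(df, 'content', 'firstSentence')
    preprocessor.removeWordsByFrequency(df, 'content', min_word_occurence=2, max_word_occurence=0.9)

    preprocessor.saveDataFrametoCSV(df, 'articles_preprocessed.csv')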