Commit db8395ca authored by Khalleud

[ADD] add scripts

parent 1c9ce02c
import pandas as pd
import numpy as np
import statistics


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())

def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the rows whose class has at least `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    return df[df[classColumnName].isin(keys)]

def split_class(df, columnProcessed):
    # An article can carry several ';'-separated classes: duplicate the row,
    # one class per copy.
    i = 0
    new_df = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        cls = list(filter(None, row[columnProcessed].split(';')))
        r = row.copy()
        for categ in cls:
            r[columnProcessed] = categ
            new_df.loc[i] = r
            i = i + 1
    return new_df

def get_median_dict(d):
    return statistics.median(d.values())

def resample_classes(df, classColumnName, numberOfInstances):
    # Draw at most numberOfInstances random rows per class (without replacement).
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
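
A minimal usage sketch for the helpers above (imported elsewhere in this commit as ClassPreprocessor). The toy dataframe and the threshold values are illustrative assumptions, not part of the original scripts:

import pandas as pd

toy = pd.DataFrame({
    'class': ['Geographie', 'Geographie', 'Droit', 'Geographie;Droit'],
    'content': ['texte a', 'texte b', 'texte c', 'texte d'],
})
toy = split_class(toy, 'class')              # one class per row
print(create_dict(toy, 'class'))             # e.g. {'Geographie': 3, 'Droit': 2}
toy = remove_weak_classes(toy, 'class', 2)   # drop classes with fewer than 2 instances
toy = resample_classes(toy, 'class', 2)      # cap every class at 2 rows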
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

classifiers = [
    ('bayes', MultinomialNB()),
    ('svm', SVC()),
    ('decisionTree', DecisionTreeClassifier()),
    ('rfc', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('sgd', SGDClassifier()),
    ('knn', KNeighborsClassifier())
]
param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth': range(1,10), 'min_samples_split': range(2,10), 'min_samples_leaf': range(1,5) }  # min_samples_split must be >= 2
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [1000]}  # grid values must be lists
param_grid_knn = {'n_neighbors' : list(range(1,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }

grid_params = [
    ('bayes', None),
    ('svm', param_grid_svm),
    ('decisionTree', param_grid_decisionTree),
    ('rfc', param_grid_rfc),
    ('lr', param_grid_lr),
    ('sgd', param_grid_sgd),
    ('knn', param_grid_knn),
]
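
How these two lists are meant to be consumed is not shown in this file; a plausible sketch is below, where train_x and train_y are assumed to come from the feature-extraction step elsewhere in this commit:

from sklearn.model_selection import GridSearchCV

# classifiers and grid_params are defined above, in the same order
for (name, model), (_, grid) in zip(classifiers, grid_params):
    if grid is None:                      # e.g. the naive Bayes entry has no grid
        model.fit(train_x, train_y)
        print(name, model.score(train_x, train_y))
    else:
        search = GridSearchCV(model, param_grid=grid, cv=5, n_jobs=-1)
        search.fit(train_x, train_y)
        print(name, search.best_params_, search.best_score_)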
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile


def basename_without_ext(path):
    base_name = basename(path)
    stem, ext = splitext(base_name)
    if stem.endswith('.tei'):
        # Drop the trailing '.tei' left over after splitting off the extension
        return stem[0:-4]
    else:
        return stem

def tei_to_csv_entry(tei_file, txt_file):
    print(f"Going on {tei_file}")
    tei = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution

input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

df = pd.DataFrame(columns=column_names)
marge = 0
for tome in os.listdir(input_path):
    volume = tome[1:]
    for index, article in enumerate(os.listdir(input_path + tome + "/")):
        filepath = os.path.join(input_path, tome, article)
        base_name = basename_without_ext(filepath)
        df.loc[index + marge] = tei_to_csv_entry(filepath, ' ')
        # use .loc[row, column] rather than chained indexing so the assignment sticks
        df.loc[index + marge, 'articleName'] = volume + '_' + base_name
    marge += index + 1
df.to_csv(output_name, index=False)
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        # Drop rows where columnName is missing, in place, and reindex.
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove the class marker, with and without brackets, from the article text.
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn].replace(marker_with_brcts, "").replace(marker, "")
                # also remove occurrences that only match once accents are stripped
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # iterrows() yields copies, so write the cleaned text back to the dataframe
                df.at[index, textColumn] = full_text
        return df

    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        # Keep only tokens whose document frequency lies between the two bounds.
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()
        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)
        # stop words are handled by the custom analyzer ('french' is not a valid
        # stop_words value for scikit-learn vectorizers)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr, max_df=max_word_occurence, min_df=min_word_occurence, max_features=None)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join the kept tokens to recreate the text with the filtered vocabulary
            df.loc[index, textColumn] = ' '.join(tokens)
        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        # Drop articles whose token count (stop words excluded) is too small.
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()
        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        # drop(inplace=True) returns None, so do not assign its result back to df
        df.drop(index=concerned_article_index, inplace=True)
        return

    def getFirstParagraph(self, df, textColumn, columnName):
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
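
A short usage sketch of the class above. The input path mirrors the one used by the training scripts further down; the token threshold and the output filename are illustrative assumptions:

import nltk
import pandas as pd

nltk.download('punkt')        # sentence tokenizer used by getFirstSentence
nltk.download('stopwords')    # French stop words used by the frequency filters

df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep='\t')
prep = Preprocessor()
prep.remove_null_rows(df, 'content')
prep.getFirstParagraph(df, 'content', 'firstParagraph')
prep.getFirstSentence(df, 'content', 'firstSentence')
prep.removeArticlesByTokensNumbers(df, 'content', 15)   # threshold chosen arbitrarily
prep.saveDataFrametoCSV(df, 'EDdA_preprocessed.csv')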
from data_process.data_functions import read_tei

class TEIFile(object):
    def __init__(self, filename, textfilename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._Head = ''
        self._Objecttype = ''
        self._attribution = ''
        self._Class = ''
        self._normclass = ''
        self._englishclass = ''
        self._generatedclass = ''
        self._author = ''
        # Each metadata field is read from an <index type="..."> element of the TEI document.
        if self.soup.find('index', type='head'):
            self._Head = self.soup.find('index', type='head')['value']
        if self.soup.find('index', type='objecttype'):
            self._Objecttype = self.soup.find('index', type='objecttype')['value']
        if self.soup.find('index', type='attribution'):
            self._attribution = self.soup.find('index', type='attribution')['value']
        if self.soup.find('index', type='class') and self.soup.find('index', type='class').has_attr('value'):
            self._Class = self.soup.find('index', type='class')['value']
        if self.soup.find('index', type='normclass'):
            self._normclass = self.soup.find('index', type='normclass')['value']
        if self.soup.find('index', type='englishclass'):
            self._englishclass = self.soup.find('index', type='englishclass')['value']
        if self.soup.find('index', type='generatedclass'):
            self._generatedclass = self.soup.find('index', type='generatedclass')['value']
        if self.soup.find('index', type='author'):
            self._author = self.soup.find('index', type='author')['value']
        # The article body: every <p> element except the first one.
        ps = self.soup.find_all('p')
        Texts = []
        for p in ps[1:]:
            Texts.append(p.getText())
        self._text = ' '.join(Texts)
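
A minimal sketch of how the class is instantiated; the file path is hypothetical, and the second argument mirrors the ' ' placeholder that the corpus-building script above passes:

tei = TEIFile('./data/EDdA/T1/article1.tei.xml', ' ')   # hypothetical path under data/EDdA/
print(tei._Head, tei._Class)      # head word and class marker, empty string if absent
print(tei._text[:200])            # start of the reconstructed article body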

from bs4 import BeautifulSoup


def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')


def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_model(y_pred, valid_y, classes, classesName):
    precision = []
    recall = []
    f1 = []
    support = []
    weighted_avg = None
    accuracy = None
    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred) in that order
    report = classification_report(valid_y, y_pred, output_dict=True)
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])
    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']
    cnf_matrix = confusion_matrix(valid_y, y_pred)
    # Per-class false positives, false negatives, true positives and true negatives
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    df['className'] = classesName
    df['precision'] = precision
    df['recall'] = recall
    df['f1-score'] = f1
    df['support'] = support
    df['FP'] = FP
    df['FN'] = FN
    df['TP'] = TP
    df['TN'] = TN
    return df, accuracy, weighted_avg
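
A small self-contained check of the function above on toy labels; the arrays and class names are made up for illustration:

import numpy as np

valid_y = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
classes = ['0', '1', '2']                      # keys as they appear in the report dict
names = ['Geographie', 'Droit', 'Histoire']    # human-readable labels, illustrative

report_df, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, classes, names)
print(report_df)
print(accuracy, weighted_avg['f1-score'])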
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
#y_true = [2, 0, 2, 2, 0, 1]
#y_pred = [0, 0, 2, 2, 0, 2]
#cf_matrix = confusion_matrix(y_true, y_pred)
#sns.heatmap(cf_matrix, annot=True)
#import matplotlib.pyplot as plt
#plt.show()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class feature_extractor:

    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        self.docs = []
        for index, row in data.iterrows():
            self.docs.append(row[column])

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # Bag-of-words counts over stemmed French tokens, stop words removed.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()
        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)
        # stop words are handled by the custom analyzer ('french' is not a valid
        # stop_words value for scikit-learn vectorizers)
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # tf-idf weights over stemmed French tokens, stop words removed.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()
        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)
        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha, dm=1):
        # Train a gensim Doc2Vec model (gensim < 4.0 API) and return one vector per document.
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=dm)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        set_tags = list(model.docvecs.doctags)
        nb_docs_small = len(set_tags)
        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
        i = 0
        for t in set_tags:
            doc_vec_doc2vec[i] = model.docvecs[t]
            i += 1
        return doc_vec_doc2vec

    def text_based_features(self):
        # Classical surface measures computed directly from the raw text.
        df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
        df['char_count'] = self.data[self.column].apply(len)
        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
        df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
        return df
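
A short sketch of the class above on a toy corpus; the two rows are purely illustrative:

import pandas as pd

# assumes the nltk French stop-word list has been downloaded (nltk.download('stopwords'))
toy = pd.DataFrame({
    'content': ["Ville de France sur la Seine.", "Terme de droit, se dit d'un contrat."],
    'class': ['Geographie', 'Droit'],
})
extractor = feature_extractor(toy, 'content', 'class')
X_counts = extractor.count_vect()             # sparse bag-of-words matrix
X_tfidf = extractor.tf_idf()                  # sparse tf-idf matrix
X_surface = extractor.text_based_features()   # dense dataframe of surface statistics
print(X_counts.shape, X_tfidf.shape, X_surface.shape)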
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_normClass_artfl = df[['normClass_artfl','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_normClass_artfl, 'content')
preprocessor.remove_null_rows(df_normClass_artfl, 'normClass_artfl')
df_normClass_artfl = split_class(df_normClass_artfl, 'normClass_artfl')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
df_normClass_artfl = remove_weak_classes(df_normClass_artfl, 'normClass_artfl', minOfInstancePerClass )
df_normClass_artfl = resample_classes(df_normClass_artfl, 'normClass_artfl', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_normClass_artfl,'df_normClass_artfl.csv')
#features extraction step
#df_normClass_artfl = pd.read_csv('df_normClass_artfl.csv')
extractor = feature_extractor(df_normClass_artfl,'content', 'normClass_artfl')
X_count_vect = extractor.count_vect()
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_normClass_artfl = df_normClass_artfl[df_normClass_artfl['normClass_artfl'] != 'unclassified']
y = df_normClass_artfl['normClass_artfl']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_domaine_enccre = df[['_domaine_enccre','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_domaine_enccre, '_domaine_enccre')
df_domaine_enccre = split_class(df_domaine_enccre, '_domaine_enccre')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
df_domaine_enccre = remove_weak_classes(df_domaine_enccre, '_domaine_enccre', minOfInstancePerClass )
df_domaine_enccre = resample_classes(df_domaine_enccre, '_domaine_enccre', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_domaine_enccre,'df_domaine_enccre.csv')
#features extraction step
#df_domaine_enccre = pd.read_csv('df_domaine_enccre.csv')
extractor = feature_extractor(df_domaine_enccre,'content', '_domaine_enccre')
X_count_vect = extractor.count_vect()
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_domaine_enccre = df_domaine_enccre[df_domaine_enccre['_domaine_enccre'] != 'unclassified']
y = df_domaine_enccre['_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_ensemble_domaine_enccre = df[['ensemble_domaine_enccre','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')
#df_ensemble_domaine_enccre = split_class(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
df_ensemble_domaine_enccre = remove_weak_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', minOfInstancePerClass )
df_ensemble_domaine_enccre = resample_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', maxOfInstancePerClass)
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
#preprocessor.saveDataFrametoCSV(df_ensemble_domaine_enccre,'df_ensemble_domaine_enccre.csv')
#features extraction step
#df_ensemble_domaine_enccre = pd.read_csv('df_ensemble_domaine_enccre.csv')
extractor = feature_extractor(df_ensemble_domaine_enccre,'content', 'ensemble_domaine_enccre')
X_count_vect = extractor.count_vect()
#X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_ensemble_domaine_enccre = df_ensemble_domaine_enccre[df_ensemble_domaine_enccre['ensemble_domaine_enccre'] != 'unclassified']
y = df_ensemble_domaine_enccre['ensemble_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = LogisticRegression() #MultinomialNB()
#m.fit(train_x, train_y)
param_grid_lr = {"C":np.logspace(-3,3,7)}
clf = GridSearchCV(m, param_grid = param_grid_lr, cv = 5, verbose=True, n_jobs=-1)
# Fit on data
best_clf = clf.fit(train_x, train_y)
y_pred = clf.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
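
The script prints evaluation metrics but never inspects the grid search itself; if that is wanted, the fitted search object exposes the selected hyper-parameter and cross-validation score (a small illustrative addition, not in the original):

print('best C : {}'.format(best_clf.best_params_['C']))
print('best CV score : {}'.format(best_clf.best_score_))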
beautifulsoup4
lxml
Unidecode==1.2.0
# other packages imported by the scripts in this commit
pandas
numpy
scikit-learn
nltk
gensim
matplotlib
seaborn