Commit edd2e616 authored by Ludovic Moncla

Merge branch 'branch_v1' into 'master'

Branch v1

See merge request !1
parents 1c9ce02c 39ca3e0a
Showing 871 additions and 0 deletions
import pandas as pd
import numpy as np
import statistics
def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())

def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the classes that have at least `threshold` instances
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df = df[df[classColumnName].isin(keys)].reset_index(drop=True)
    return df

def split_class(df, columnProcessed):
    # Duplicate each row carrying several ';'-separated classes into one row per class
    i = 0
    new_df = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        cls = list(filter(None, row[columnProcessed].split(';')))
        r = row.copy()
        for categ in cls:
            r[columnProcessed] = categ
            new_df.loc[i] = r
            i = i + 1
    return new_df

def get_median_dict(d):
    return statistics.median(d.values())

def resample_classes(df, classColumnName, numberOfInstances):
    # Randomly sample at most `numberOfInstances` rows per class, without replacement
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
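
# Illustrative usage sketch (not part of the original module): 'corpus.csv',
# the column name 'class' and the thresholds below are placeholder values.
if __name__ == '__main__':
    sample = pd.read_csv('corpus.csv')
    # drop classes with fewer than 50 articles, then cap every class at 500 rows
    sample = remove_weak_classes(sample, 'class', threshold=50)
    sample = resample_classes(sample, 'class', numberOfInstances=500)
    print(create_dict(sample, 'class'))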
from data_process.data_functions import read_tei, elem_to_text, basename_without_ext, tei_to_csv_entry
from data_process.TEIFile import TEIFile
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
classifiers = [
    ('bayes', MultinomialNB()),
    ('svm', SVC()),
    ('decisionTree', DecisionTreeClassifier()),
    ('rfc', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('sgd', SGDClassifier()),
    ('knn', KNeighborsClassifier())
]

param_grid_svm = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
param_grid_decisionTree = {'criterion': ['gini', 'entropy'], 'max_depth': range(5, 10), 'min_samples_split': range(5, 10), 'min_samples_leaf': range(1, 5)}
param_grid_rfc = {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}
param_grid_lr = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
param_grid_sgd = {"loss": ["hinge", "log", "squared_hinge", "modified_huber"], "alpha": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l2", "l1", "none"], "max_iter": [500]}
param_grid_knn = {'n_neighbors': list(range(1, 20)), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}

grid_params = [
    ('bayes', None),
    ('svm', param_grid_svm),
    ('decisionTree', param_grid_decisionTree),
    ('rfc', param_grid_rfc),
    ('lr', param_grid_lr),
    ('sgd', param_grid_sgd),
    ('knn', param_grid_knn),
]
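
# Illustrative sketch (not part of the original module) of how the experiment script
# pairs `classifiers` with `grid_params`; the synthetic data below is a placeholder.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV

    X_demo, y_demo = make_classification(n_samples=120, n_features=10, n_classes=3, n_informative=5, random_state=0)
    for (clf_name, clf), (grid_name, grid) in zip(classifiers, grid_params):
        assert clf_name == grid_name  # the two lists are kept in the same order
        if clf_name != 'svm':
            continue  # fit just one model here to keep the demo short
        search = GridSearchCV(clf, grid, refit=True, cv=3)
        search.fit(X_demo, y_demo)
        print(clf_name, search.best_params_)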
from os import path
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile

def basename_without_ext(file_path):
    base_name = basename(file_path)
    stem, ext = splitext(base_name)
    if stem.endswith('.tei'):
        # drop the '.tei' part of a '.tei.xml' file name
        return stem[0:-4]
    else:
        return stem

def tei_to_csv_entry(tei_file, txt_file):
    print(f"Going on {tei_file}")
    tei = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution

input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

df = pd.DataFrame(columns=column_names)
marge = 0
for tome in os.listdir(input_path):
    volume = tome[1:]
    for index, article in enumerate(os.listdir(input_path + tome + "/")):
        filepath = os.path.join(input_path, tome, article)
        base_name = basename_without_ext(filepath)
        df.loc[index + marge] = tei_to_csv_entry(filepath, ' ')
        # use .loc with both labels so the assignment is not done on a copy
        df.loc[index + marge, 'articleName'] = volume + '_' + base_name
    marge += index + 1

df.to_csv(output_name, index=False)
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove the class marker (with and without brackets) from each article text
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn].replace(marker_with_brcts, "").replace(marker, "")
                # also remove accented variants of the marker (compare on unidecoded text)
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # write back through the dataframe: mutating `row` would only change a copy
                df.at[index, textColumn] = full_text
        return df

    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if not w in stop_words)

        # stop words are filtered inside the custom analyzer
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr, max_df=max_word_occurence, min_df=min_word_occurence, max_features=None)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join tokens to recreate the text with the remaining vocabulary
            new_text = ' '.join(tokens)
            df.loc[index, textColumn] = new_text
        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if not w in stop_words)

        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        df.drop(index=concerned_article_index, inplace=True)
        return

    def getFirstParagraph(self, df, textColumn, columnName):
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
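
# Illustrative usage sketch (not part of the original module): the file names and
# column names below mirror those used elsewhere in this commit but are placeholders.
if __name__ == '__main__':
    p = Preprocessor()
    articles = pd.read_csv('corpus_tei.csv')
    p.remove_null_rows(articles, 'text')
    p.remove_null_rows(articles, 'class')
    articles = p.removeMarkers(articles, 'text', markerColumn='class')
    p.getFirstParagraph(articles, 'text', 'paragraphe')
    p.saveDataFrametoCSV(articles, 'corpus_tei_preprocessed.csv')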
from data_process.data_functions import read_tei

class TEIFile(object):

    def __init__(self, filename, textfilename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._Head = ''
        self._Objecttype = ''
        self._attribution = ''
        self._Class = ''
        self._normclass = ''
        self._englishclass = ''
        self._generatedclass = ''
        self._author = ''
        if self.soup.find('index', type='head'):
            self._Head = self.soup.find('index', type='head')['value']
        if self.soup.find('index', type='objecttype'):
            self._Objecttype = self.soup.find('index', type='objecttype')['value']
        if self.soup.find('index', type='attribution'):
            self._attribution = self.soup.find('index', type='attribution')['value']
        if self.soup.find('index', type='class') and self.soup.find('index', type='class').has_attr('value'):
            self._Class = self.soup.find('index', type='class')['value']
        if self.soup.find('index', type='normclass'):
            self._normclass = self.soup.find('index', type='normclass')['value']
        if self.soup.find('index', type='englishclass'):
            self._englishclass = self.soup.find('index', type='englishclass')['value']
        if self.soup.find('index', type='generatedclass'):
            self._generatedclass = self.soup.find('index', type='generatedclass')['value']
        if self.soup.find('index', type='author'):
            self._author = self.soup.find('index', type='author')['value']
        ps = self.soup.find_all('p')
        Texts = []
        for p in ps[1:]:
            Texts.append(p.getText())
        self._text = ' '.join(Texts)
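
# Illustrative usage sketch (not part of the original module): the file path below is
# a placeholder; in this commit the TEI-to-CSV conversion script builds its rows from
# instances of this class.
if __name__ == '__main__':
    tei = TEIFile('data/EDdA/T1/article0001.tei', ' ')
    print(tei._Head, tei._author, tei._Class)
    print(tei._text[:200])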
from bs4 import BeautifulSoup

def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
    if soup is not None:
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default
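
# Illustrative usage sketch (not part of the original module): the file path is a
# placeholder; it shows how read_tei() and elem_to_text() are meant to be combined.
if __name__ == '__main__':
    soup = read_tei('data/EDdA/T1/article0001.tei')
    first_p = soup.find('p')
    print(elem_to_text(first_p, default='<no paragraph found>'))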
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import seaborn as sns

def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
    precision = []
    recall = []
    f1 = []
    support = []
    weighted_avg = None
    accuracy = None
    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred)
    report = classification_report(valid_y, y_pred, output_dict=True)
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])
    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']
    cnf_matrix = confusion_matrix(valid_y, y_pred)
    # per-class counts derived from the confusion matrix
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    df['className'] = classesName
    df['precision'] = precision
    df['recall'] = recall
    df['f1-score'] = f1
    df['support'] = support
    df['FP'] = FP
    df['FN'] = FN
    df['TP'] = TP
    df['TN'] = TN
    plt.rcParams["font.size"] = 3
    plot_confusion_matrix(clf, X_test, y_test)
    plt.savefig(pathSave)
    return df, accuracy, weighted_avg
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
#y_true = [2, 0, 2, 2, 0, 1]
#y_pred = [0, 0, 2, 2, 0, 2]
#cf_matrix = confusion_matrix(y_true, y_pred)
#sns.heatmap(cf_matrix, annot=True)
#import matplotlib.pyplot as plt
#plt.show()
import sys
import os
import time
import argparse
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
parser = argparse.ArgumentParser()
parser.add_argument("dataPath", help="path of the dataframe")
parser.add_argument("columnText", help="name of the column containing the text to preprocess", default='content')
parser.add_argument("columnClass", help="name of the column containing the classes")
parser.add_argument("minOfInstancePerClass", help="minimum number of instances required for a class to be kept", type=int)
parser.add_argument("maxOfInstancePerClass", help="maximum number of instances kept per class when resampling", type=int)
args = parser.parse_args()
dataPath = args.dataPath
columnText = args.columnText
columnClass = args.columnClass
minOfInstancePerClass = args.minOfInstancePerClass
maxOfInstancePerClass = args.maxOfInstancePerClass
# create a directory under reports/ to store the classification results
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
    os.makedirs(os.path.join('reports', columnClass, dir_name_report))
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df_original = pd.read_csv(dataPath)
df = df_original[[columnClass,columnText]].copy()
preprocessor.remove_null_rows(df, columnText)
preprocessor.remove_null_rows(df, columnClass)
#df = split_class(df, columnClass)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
df = resample_classes(df, columnClass, maxOfInstancePerClass)
preprocessor.getFirstParagraph(df, columnText, 'paragraphe')  # extract the first paragraph of each text
# Read the configuration file to retrieve the feature extractor parameters
config = configparser.ConfigParser()
config.read('settings.conf')
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
# prepare data: drop unclassified articles before feature extraction so that the
# feature matrices and the target vector stay aligned
df = df[df[columnClass] != 'unclassified'].reset_index(drop=True)
y = df[columnClass]

extractor = feature_extractor(df, columnText, columnClass)
extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)

features_techniques = [
    ('counter', extractor.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]

features_techniques_paragraphe = [
    ('counter', extractor_paragraphe.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor_paragraphe.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
#case of full text
for feature_technique_name, features in features_techniques:
    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    # use transform (not fit_transform) so the test labels share the training encoding
    valid_y = encoder.transform(test_y)

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params
        print(clf_name, clf, grid_param_name, grid_param)
        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                # MultinomialNB cannot handle the negative values produced by doc2vec
                continue
            else:
                t_begin = time.time()
                clf.fit(train_x, train_y)
                t_end = time.time()
                training_time = t_end - t_begin
                y_pred = clf.predict(test_x)
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
            training_time = t_end - t_begin
            y_pred = clf.predict(test_x)

        # evaluate the model
        file_name_report = feature_technique_name + '_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output to the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
            sys.stdout = sys.__stdout__  # reset the standard output to its original value
# case of the first paragraph only
for feature_technique_name, features in features_techniques_paragraphe:
    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(test_y)

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params
        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                continue
            else:
                t_begin = time.time()
                clf.fit(train_x, train_y)
                t_end = time.time()
                training_time = t_end - t_begin
                y_pred = clf.predict(test_x)
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
            training_time = t_end - t_begin
            y_pred = clf.predict(test_x)

        # evaluate the model
        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output to the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
            sys.stdout = sys.__stdout__  # reset the standard output to its original value
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class feature_extractor:

    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        self.docs = []
        for index, row in data.iterrows():
            self.docs.append(row[column])

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)

        # stop words are filtered inside the custom analyzer
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)

        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha=0.025, dm=1):
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=dm)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        set_tags = list(model.docvecs.doctags)
        nb_docs_small = len(set_tags)
        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
        i = 0
        for t in set_tags:
            doc_vec_doc2vec[i] = model.docvecs[t]
            i += 1
        return doc_vec_doc2vec

    def text_based_features(self):
        # Classical surface measures computed on the raw text
        df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
        df['char_count'] = self.data[self.column].apply(len)
        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
        df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
        return df
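
# Illustrative usage sketch (not part of the original module): 'corpus.csv', 'content'
# and 'class' are placeholder names for a dataframe with a text column and a class column.
if __name__ == '__main__':
    corpus = pd.read_csv('corpus.csv')
    extractor = feature_extractor(corpus, 'content', 'class')
    X_counts = extractor.count_vect(max_df=1.0, min_df=1)    # sparse bag-of-words matrix
    X_tfidf = extractor.tf_idf(max_df=1.0, min_df=1)         # sparse tf-idf matrix
    X_d2v = extractor.doc2vec(max_epochs=10, vec_size=300)   # dense doc2vec matrix
    print(X_counts.shape, X_tfidf.shape, X_d2v.shape)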
beautifulsoup4
lxml
Unidecode
Unidecode==1.2.0
Keras==2.4.3
Keras-Preprocessing==1.1.2
sentence-transformers==0.4.1.2
transformers==4.3.2
torch==1.8.1
torchvision==0.8.2
tokenizers==0.10.1
regex==2018.1.10
tensorflow==2.2.0
gensim==3.8.1
mkdir -p reports/domaine_enccre
mkdir -p reports/ensemble_domaine_enccre
mkdir -p reports/normClass_artfl
pip install -r requirements.txt
python tmp_preprocess_data.py
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
[vectorizers]
vectorization_max_df= 1.0
vectorization_min_df= 1
vectorization_numberOfFeatures= None
doc2vec_vec_size = 300
doc2vec_epochs = 10
doc2vec_lr = 0.025
min_word_per_article = 4
import sys
import os
import time
import argparse
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
from re import search
import math
from unidecode import unidecode
import re
import nltk
def removeMarkers(df, textColumn, listOfMarkers):
    # Remove every known class marker (with and without brackets) from the article texts
    tmp = 0
    for index, row in df.iterrows():
        tmp += 1
        print(tmp)
        if not pd.isna(row[textColumn]):
            full_text = row[textColumn]
            for m in listOfMarkers:
                if pd.isna(m):
                    continue
                marker = str(m)
                marker_with_brcts = '(' + marker + ')'
                full_text = full_text.replace(marker_with_brcts, "").replace(marker, "")
                # also remove accented variants of the marker (compare on unidecoded text)
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
            # write back through the dataframe: mutating `row` would only change a copy
            df.at[index, textColumn] = full_text
    return df
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df = pd.read_csv('corpus_tei.csv')
listOfM = df['class'].unique()
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
preprocessor.remove_null_rows(df_original, 'content')
df_original = removeMarkers(df_original, 'content', listOfM)
df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
df_2 = df_original[['domaine_enccre','content']].copy()
df_3 = df_original[['normClass_artfl','content']].copy()
# TODO: should articles with fewer than n tokens be removed? (markers are removed above)
preprocessor.remove_null_rows(df_1, 'content')
preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
preprocessor.remove_null_rows(df_2, 'content')
preprocessor.remove_null_rows(df_2, 'domaine_enccre')
preprocessor.remove_null_rows(df_3, 'content')
preprocessor.remove_null_rows(df_3, 'normClass_artfl')
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
df_2 = split_class(df_2, 'domaine_enccre')
df_3 = split_class(df_3, 'normClass_artfl')
d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx")
d_2 = create_dict(df_2, 'domaine_enccre')
tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count'])
tosave.to_excel("domaine_enccre.xlsx")
d_3 = create_dict(df_3, 'normClass_artfl')
tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
tosave.to_excel("normClass_artfl.xlsx")
df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
df_2.to_csv('dataframe_with_domaine_enccre.csv')
df_3.to_csv('dataframe_with_normClass_artfl.csv')
print(df_original.shape)