Commit 9eb58e8a authored by Ludovic Moncla

deleted

parent 65a6182a
import pandas as pd
import numpy as np
import statistics
def create_dict(df, classColumnName):
return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
dictOfClassInstances = create_dict(df,classColumnName)
dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
keys = [*dictionary]
df_tmp = df[~ df[classColumnName].isin(keys)]
#df = df[df[columnTarget] not in keys]
#df = df.merge(df_tmp, how = 'outer' ,indicator=True)
df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
return df
def split_class(df, columnProcessed):
i = 0
new_df = pd.DataFrame(columns= df.columns)
for index, row in df.iterrows():
#cls = re.split(';', row[columnProcessed])
cls = filter(None, row[columnProcessed].split(';'))
cls = list(cls)
#cls = re.findall(r"[\w']+", row [columnProcessed])
r = row
for categ in cls:
r[columnProcessed] = categ
#new_df.append(r, ignore_index = True)
new_df.loc[i] = r
i = i + 1
return new_df
def get_median_dict(dict):
return statistics.median(dict.values())
def resample_classes(df, classColumnName, numberOfInstances):
# numberOfInstances first elements
#return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns])
#random numberOfInstances elements
replace = False # sample without replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
return df.groupby(classColumnName, as_index=False).apply(fn)
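# Illustrative usage of the helpers above (not part of the pipeline): the corpus
# file name is hypothetical, the column name follows the project configuration.
if __name__ == "__main__":
    example_df = pd.read_csv("corpus.tsv", sep="\t")
    print(create_dict(example_df, "ensemble_domaine_enccre"))                    # class -> number of articles
    example_df = remove_weak_classes(example_df, "ensemble_domaine_enccre", 50)  # drop classes with < 50 articles
    example_df = resample_classes(example_df, "ensemble_domaine_enccre", 1500)   # keep at most 1500 articles per class
    print(get_median_dict(create_dict(example_df, "ensemble_domaine_enccre")))   # median class size after filtering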
[general]
columnText = contentWithoutClass
columnClass = ensemble_domaine_enccre
minOfInstancePerClass = 50
maxOfInstancePerClass = 1500
[model]
model = bert
path = bert-base-multilingual-cased
#model = camembert
#path = camembert-base
max_len_sequences = 256
batch_size = 32
epochs = 4
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
classifiers = [
('bayes', MultinomialNB()),
('lr', LogisticRegression()),
('sgd', SGDClassifier()),
('svm', SVC() ),
#('decisionTree',DecisionTreeClassifier()),
('rfc', RandomForestClassifier()),
#('knn', KNeighborsClassifier())
]
param_grid_svm = {'kernel':['linear','rbf']}
#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
param_grid_rfc = { 'max_features': ['sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]}
param_grid_lr = {"C":np.logspace(-3,3,7)}
param_grid_sgd = { "loss" : ["log", "modified_huber"]}
#param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
grid_params = [
('bayes', None),
('lr', param_grid_lr),
('sgd', param_grid_sgd ),
('svm', param_grid_svm),
#('decisionTree', param_grid_decisionTree),
('rfc', param_grid_rfc ),
#('knn', param_grid_knn),
]
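# Sketch of how the two lists above are meant to be consumed (mirrors the training
# script further down): they share the same names and order, so they can be zipped
# and each non-empty grid wrapped in a GridSearchCV. The helper name is illustrative.
from sklearn.model_selection import GridSearchCV

def build_searches(cv=3):
    searches = []
    for (clf_name, clf), (grid_name, grid) in zip(classifiers, grid_params):
        assert clf_name == grid_name  # both lists are kept aligned by name
        if grid is None:
            searches.append((clf_name, clf))  # e.g. MultinomialNB is fitted as-is
        else:
            searches.append((clf_name, GridSearchCV(clf, grid, refit=True, cv=cv, n_jobs=-1)))
    return searches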
from os import path
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile
def basename_without_ext(path):
base_name = basename(path)
stem, ext = splitext(base_name)
if stem.endswith('.tei'):
# Return the base name without the '.tei' extension
return stem[0:-4]
else:
return stem
def tei_to_csv_entry(tei_file, txt_file):
print(f"Going on {tei_file}")
tei = TEIFile(tei_file, txt_file)
print(f"Handled {tei_file}")
base_name = basename_without_ext(tei_file)
return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution
input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]
df = pd.DataFrame(columns = column_names)
marge = 0
for tome in os.listdir(input_path):
volume = tome[1:]
for index, article in enumerate(os.listdir(input_path + tome +"/")):
filepath = os.path.join(input_path, tome, article)
base_name = basename_without_ext(filepath)
df.loc[index+marge] = tei_to_csv_entry(filepath, ' ')
df.loc[index+marge, 'articleName'] = volume + '_' + base_name
marge += index +1
df.to_csv(output_name, index=False)
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
class Preprocessor():
def __init__(self):
pass
def remove_null_rows(self, df, columnName):
#df = df[df[columnName].notna()]
df.dropna(subset = [columnName], inplace = True)
df.reset_index(drop=True, inplace=True)
return
def removeMarkers(self, df, textColumn, markerColumn = 'class'):
#remove null values or add condition if exist
#self.remove_null_rows(df, markerColumn)
#self.remove_null_rows(df, textColumn)
for index, row in df.iterrows():
if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
marker = row[markerColumn]
marker_with_brcts = '(' + marker + ')'
full_text = row[textColumn].replace(marker_with_brcts, "").replace(marker, "")
# remove remaining occurrences whose accents differ by searching the unaccented text
i = unidecode(full_text).find(marker_with_brcts)
while i != -1:
full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
i = unidecode(full_text).find(marker_with_brcts)
# iterrows() yields copies, so write the cleaned text back to the dataframe explicitly
df.at[index, textColumn] = full_text
return df
def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
stop_words = set(stopwords.words('french'))
stemmer_fr = SnowballStemmer("french")
analyzer = CountVectorizer().build_analyzer()
def token_fr(doc):
return (w for w in analyzer(doc) if not w in stop_words)
# French stop words are already filtered inside the custom analyzer, so stop_words is not passed
stem_vectorizer_fr = CountVectorizer(analyzer= token_fr, max_df= max_word_occurence , min_df= min_word_occurence, max_features=None)
docs = []
for index, row in df.iterrows():
docs.append(row[textColumn])
stem_vectorizer_fr.fit(docs)
featured_docs = stem_vectorizer_fr.transform(docs)
tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
for index, tokens in enumerate(tokens_per_docs):
# join token to recreate text with new tokens
new_text = ' '.join(tokens)
df.loc[index, textColumn] = new_text
return
def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
stop_words = set(stopwords.words('french'))
stemmer_fr = SnowballStemmer("french")
analyzer = CountVectorizer().build_analyzer()
def token_fr(doc):
return (w for w in analyzer(doc) if not w in stop_words)
stem_vectorizer_fr = CountVectorizer(analyzer= token_fr)
docs = []
for index, row in df.iterrows():
docs.append(row[textColumn])
stem_vectorizer_fr.fit(docs)
featured_docs = stem_vectorizer_fr.transform(docs)
tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
concerned_article_index = []
for index, tokens in enumerate(tokens_per_docs):
if len(tokens) <= min_word_per_article:
concerned_article_index.append(index)
df.drop(index = concerned_article_index, inplace = True)
return
def getFirstParagraph(self, df, textColumn, columnName):
new_column = []
for index, row in df.iterrows():
paragraphs = row[textColumn].split('\n \n')
new_column.append(paragraphs[0])
df[columnName] = new_column
return
def getFirstSentence(self, df, textColumn, columnName):
sent = []
for index, row in df.iterrows():
sentences = nltk.sent_tokenize(row[textColumn])
sent.append(sentences[0])
df[columnName] = sent
return
def saveDataFrametoCSV(self, df, pathName):
df.to_csv(pathName)
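# Illustrative usage of the Preprocessor (assumes the project corpus with 'content'
# and 'class' columns, and that the nltk 'stopwords' and 'punkt' resources are
# already downloaded; the output path is hypothetical).
if __name__ == "__main__":
    corpus = pd.read_csv("data/EDdA_dataframe_withContent.tsv", sep="\t")
    prep = Preprocessor()
    prep.remove_null_rows(corpus, "content")
    prep.removeMarkers(corpus, "content", markerColumn="class")
    prep.getFirstParagraph(corpus, "content", "firstParagraph")
    prep.getFirstSentence(corpus, "content", "firstSentence")
    prep.saveDataFrametoCSV(corpus, "data/preprocessed_corpus.csv")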
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
report = classification_report(true_labels_, pred_labels_, output_dict = True) # classification_report expects (y_true, y_pred)
classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in classes:
precision.append(report[c]['precision'])
recall.append(report[c]['recall'])
f1.append(report[c]['f1-score'])
support.append(report[c]['support'])
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN
return dff, accuracy, weighted_avg
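# Small worked example (illustration only) of the per-class counts computed above:
# FP are column sums minus the diagonal, FN are row sums minus the diagonal,
# TP is the diagonal and TN is everything else.
if __name__ == "__main__":
    cnf = np.array([[5, 1, 0],
                    [2, 3, 1],
                    [0, 0, 4]])
    TP = np.diag(cnf)                # [5 3 4]
    FP = cnf.sum(axis=0) - TP        # [2 1 1]
    FN = cnf.sum(axis=1) - TP        # [1 3 0]
    TN = cnf.sum() - (TP + FP + FN)  # [8 9 11]
    print(TP, FP, FN, TN)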
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import seaborn as sns
def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
#classifier, label_list, test_x, valid_y, title = "Confusion matrix"):
precision = []
recall = []
f1 = []
support = []
weighted_avg = None
accuracy = None
df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
report = classification_report(valid_y, y_pred, output_dict = True) # classification_report expects (y_true, y_pred)
for c in classes:
precision.append(report[c]['precision'])
recall.append(report[c]['recall'])
f1.append(report[c]['f1-score'])
support.append(report[c]['support'])
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
cnf_matrix = confusion_matrix(valid_y, y_pred)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
df['className'] = classesName
df['precision'] = precision
df['recall'] = recall
df['f1-score'] = f1
df['support'] = support
df['FP'] = FP
df['FN'] = FN
df['TP'] = TP
df['TN'] = TN
#disp = plot_confusion_matrix(classifier, test_x, valid_y,
# display_labels= label_list,
# cmap=plt.cm.Blues,
# normalize=None)
#disp.ax_.set_title(title)
#print(title)
#print(disp.confusion_matrix)
#plt.show()
plt.rcParams["font.size"] = 3
plot_confusion_matrix(clf, X_test, y_test)
plt.savefig(pathSave)
return df, accuracy, weighted_avg
import sys
import os
import time
import argparse
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
import pickle
import nltk
nltk.download('stopwords')
nltk.download('punkt')
parser = argparse.ArgumentParser()
parser.add_argument("dataPath", help="Path of the dataframe")
parser.add_argument("columnText", help="the column name of the text that should preproceed", default = 'content')
parser.add_argument("columnClass", help="ColumnClass the column name of the classes")
parser.add_argument("minOfInstancePerClass", help="minOfInstancePerClass the minimum of instance required for each class", type=int)
parser.add_argument("maxOfInstancePerClass", help="maxOfInstancePerClass the maximum of instance required resamling classes", type=int)
args = parser.parse_args()
dataPath = args.dataPath
columnText = args.columnText
columnClass = args.columnClass
minOfInstancePerClass = args.minOfInstancePerClass
maxOfInstancePerClass = args.maxOfInstancePerClass
if not os.path.exists('reports'):
os.makedirs('reports')
if not os.path.exists(os.path.join('reports', columnClass)):
os.makedirs(os.path.join('reports', columnClass))
# create a directory inside the reports directory to save the classification results
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
os.makedirs(os.path.join('reports', columnClass, dir_name_report))
# create directories to save and load models
if not os.path.exists(os.path.join('models', columnClass)):
os.makedirs(os.path.join('models', columnClass))
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df = pd.read_csv(dataPath, sep="\t")
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
#Read configuration file for retrieving parameters of the feature extractors
config = configparser.ConfigParser()
config.read('settings.conf')
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
max_epochs = int(config.get('vectorizers','max_epochs'))
doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
print("size after resampling, ",len(df))
#prepare data
#df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
print(df.head())
print(df[columnClass].head())
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y) # reuse the encoder fitted on the training labels
print("size training set, ",len(train_x))
print("size validation set, ",len(test_x))
for columnInput in [columnText, 'firstParagraph']:
print('Process: ' + columnInput)
extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers))]
#case of full text
for feature_technique_name, features in features_techniques:
print("**** Classifier :", feature_technique_name)
# features contains the vectorized train and test matrices for this technique;
# keep them under separate names so the original dataframes remain available for the next pass
train_features, test_features = features
for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
clf_name, clf = tmp_clf
grid_param_name, grid_param = tmp_grid_params
print(clf_name, clf, grid_param_name, grid_param)
model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl"
if clf_name != 'bayes' :
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1)
elif feature_technique_name == 'doc2vec':
continue
t_begin = time.time()
if os.path.isfile(os.path.join('./models', columnClass, model_file_name)):
# a trained model is already on disk: load it instead of re-fitting
with open(os.path.join('./models', columnClass, model_file_name), 'rb') as file:
clf = pickle.load(file)
else:
# otherwise fit the (grid-searched) classifier and save it
clf.fit(train_features, train_y)
with open(os.path.join('./models', columnClass, model_file_name), 'wb') as file:
pickle.dump(clf, file)
t_end =time.time()
training_time = t_end - t_begin
y_pred = clf.predict(test_features)
#evaluate model
file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name
report, accuracy, weighted_avg = evaluate_model(clf, test_features, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
report.to_csv(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.csv'))
with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
sys.stdout = f # Change the standard output to the file we created.
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
print('training time : {}'.format(training_time))
try:
print('best parameters : {}'.format(clf.best_params_))
except AttributeError:
pass
#sys.stdout = sys.stdout # Reset the standard output to its original value
sys.stdout = sys.__stdout__
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import spacy
class feature_extractor:
def __init__(self, train_x, test_x, column, target):
self.column = column
#self.data = data
#self.X = data[column]
#self.y = data[target]
self.docs_train = train_x[column].tolist()
self.docs_test = test_x[column].tolist()
#for index, row in data.iterrows():
# self.docs.append(row[column])
def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
stop_words = set(stopwords.words('french'))
stemmer_fr = SnowballStemmer("french")
analyzer = CountVectorizer().build_analyzer()
def stemmed_words_fr(doc):
return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
# French stop words are already filtered inside the custom analyzer, so stop_words is not passed
stem_vectorizer_fr = CountVectorizer(analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures)
stem_vectorizer_fr.fit(self.docs_train)
return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test)
def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
stop_words = set(stopwords.words('french'))
stemmer_fr = SnowballStemmer("french")
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words_fr(doc):
return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)
# French stop words are already filtered inside the custom analyzer, so stop_words is not passed
tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures)
tfidf_vectorizer.fit(self.docs_train)
return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test)
def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers):
nlp = spacy.load("fr_core_news_sm")
stopWords = set(stopwords.words('french'))
def tokenize_fr_text(sentence):
result = string.punctuation
# Tokenize the sentence
doc = nlp(sentence)
# Return the text of each token
return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in result and not len(X.text) < 2]
#tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)]
tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d),tags = [str(i)]) for i, _d in enumerate(self.docs_train)]
#Tag test set
tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)]
model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm, workers = doc2vec_workers)
model.build_vocab(tagged_tr)
model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)
X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])
X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
return X_train, X_test
def text_based_features(self):
# Classical measures
df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
df['char_count'] = self.data[self.column].apply(len)
df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
return df
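# Illustrative usage of feature_extractor (column names and hyper-parameters follow
# the project configuration; the input file is the training split produced by the
# preprocessing script; doc2vec additionally requires the spacy model fr_core_news_sm).
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split
    df = pd.read_csv("data/train_dataframe.tsv", sep="\t")
    train_x, test_x = train_test_split(df, test_size=0.33, random_state=42)
    extractor = feature_extractor(train_x, test_x, "contentWithoutClass", "ensemble_domaine_enccre")
    X_train_tfidf, X_test_tfidf = extractor.tf_idf(max_df=1.0, min_df=4, numberOfFeatures=None)
    X_train_d2v, X_test_d2v = extractor.doc2vec(10, 700, 12, 0, 8)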
import os
import torch
import pandas as pd
import numpy as np
import configparser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning
def create_dict(df, classColumnName):
return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
dictOfClassInstances = create_dict(df,classColumnName)
dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
keys = [*dictionary]
df_tmp = df[~ df[classColumnName].isin(keys)]
df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
return df
def resample_classes(df, classColumnName, numberOfInstances):
#random numberOfInstances elements
replace = False # sample without replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
return df.groupby(classColumnName, as_index=False).apply(fn)
def main():
config = configparser.ConfigParser()
config.read('bert_settings.conf')
dataPath = config.get('general','dataPath')
columnText = config.get('general','columnText')
columnClass = config.get('general','columnClass')
minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
chosen_tokeniser = config.get('model','tokeniser')
chosen_model = config.get('model','model')
max_len = int(config.get('model','max_len_sequences'))
batch_size = int(config.get('model','batch_size'))
epochs = int(config.get('model','epochs'))
df = pd.read_csv(dataPath)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
sentences = train_x[columnText].values
labels = train_y.tolist()
#call train method (the training function also needs the pretrained model path)
path = config.get('model','path')
model = training_bertFineTuning(chosen_model, path, sentences, labels, max_len, batch_size, epochs)
#save the model
model_save_name = config.get('model','modelName')
torch.save(model, os.path.join(path, model_save_name))
#print the model parameters
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
#call predict method on the held-out split
sentences_to_predict = test_x[columnText].values
test_labels = test_y.tolist()
prediction_dataloader = generate_prediction_dataloader(path, sentences_to_predict, test_labels, max_len, batch_size = 32)
predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
#call Evaluate
result_df, accuracy , weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
print(result_df)
print(accuracy)
print(weighted_avg)
if __name__ == "__main__":
main()
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size = 32):
if chosen_model == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
elif chosen_model == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids_test = []
# For every sentence...
for sent in sentences_to_predict:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
)
input_ids_test.append(encoded_sent)
# Pad our input tokens
padded_test = []
for i in input_ids_test:
if len(i) > max_len:
padded_test.extend([i[:max_len]])
else:
padded_test.extend([i + [0] * (max_len - len(i))])
input_ids_test = np.array(padded_test)
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_test:
seq_mask = [float(i>0) for i in seq]
attention_masks.append(seq_mask)
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids_test)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
return prediction_dataloader
def predict_class_bertFineTuning(model, prediction_dataloader):
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
# Put model in evaluation mode
model.eval()
# Tracking variables
predictions_test , true_labels = [], []
# Predict
for batch in prediction_dataloader:
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from the dataloader
b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and
# speeding up prediction
with torch.no_grad():
# Forward pass, calculate logit predictions
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask)
logits = outputs[0]
#print(logits)
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
#print(logits)
# Store predictions and true labels
predictions_test.append(logits)
true_labels.append(label_ids)
print(' DONE.')
pred_labels = []
for i in range(len(true_labels)):
# The predictions for this batch are a (batch_size x num_classes) array of logits.
# Take the argmax over the class dimension to get the predicted label ids.
pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
pred_labels.append(pred_labels_i)
pred_labels_ = [item for sublist in pred_labels for item in sublist]
true_labels_ = [item for sublist in true_labels for item in sublist]
return pred_labels_, true_labels_
def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
if chosen_model == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
elif chosen_model == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids_test = []
# For every sentence...
for sent in sentences_to_predict:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
)
input_ids_test.append(encoded_sent)
# Run each encoded sentence through the model individually and collect the logits
# (minimal completion of this function: single-sentence batches, so no padding is needed).
device = next(model.parameters()).device
model.eval()
logits_list = []
with torch.no_grad():
for encoded_sent in input_ids_test:
b_input_ids = torch.tensor([encoded_sent]).to(device)
b_input_mask = torch.tensor([[1] * len(encoded_sent)]).to(device)
# Forward pass, calculate logit predictions
outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
logits_list.append(outputs[0].detach().cpu())
return logits_list
[vectorizers]
vectorization_max_df= 1.0
vectorization_min_df= 4
vectorization_numberOfFeatures= None
doc2vec_vec_size = 700
max_epochs = 10
doc2vec_min_count = 12
doc2vec_dm = 0
doc2vec_workers = 8
min_word_per_article = 25
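# Sketch of how these [vectorizers] values can be read: max_df / min_df may be an
# integer (absolute document count) or a float (proportion of documents), so the
# training script parses them with an int-or-float fallback. Helper name is illustrative.
import configparser

def read_count_or_proportion(config, section, option):
    raw = config.get(section, option)
    return int(raw) if raw.isdigit() else float(raw)

# config = configparser.ConfigParser()
# config.read('settings.conf')
# vectorization_max_df = read_count_or_proportion(config, 'vectorizers', 'vectorization_max_df')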
#import sys
#import os
#import time
#import argparse
import pandas as pd
#import numpy as np
#from data_preprocessing import Preprocessor
#from features_extractor import feature_extractor
#from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
#from classifiers import classifiers, grid_params
#from sklearn.model_selection import train_test_split
#from sklearn import preprocessing
#from evaluate_model import evaluate_model
#from sklearn.model_selection import GridSearchCV
#import configparser
#from re import search
#import math
#import re
#import nltk
#from ClassPreprocessor import create_dict
#print("Begin preprocess")
# Reading data and preprocessings steps
print("load dataset")
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
#df = df_original.copy()
print("len(df)",len(df))
print("remove blank rows")
df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
print("len(df)",len(df))
print("remove small articles < 15 words")
#preprocessor = Preprocessor()
#preprocessor.removeArticlesByTokensNumbers(df, 'content', 25)
df = df.loc[(df['nb_word']>=15)]
print("len(df)",len(df))
df.reset_index(drop=True, inplace=True)
print("filter unclassified rows")
# filter the articles not classified by ARTFL but classified by ENCCRE (test set)
df_unclassified = df.loc[(df['normClass']=="unclassified")]
df_classified = df.loc[(df['normClass']!="unclassified")]
print("save dataframe")
df_classified.to_csv('./data/train_dataframe.tsv', sep="\t")
df_unclassified.to_csv('./data/test_dataframe.tsv', sep="\t")
print("some stats")
print("len(df_unclassified)",len(df_unclassified))
print("len(df_classified)",len(df_classified))
'''
#preprocessor.remove_null_rows(df_original, 'content')
print("copy")
df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
print("split ensemble domaine enccre")
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
print("save dataframe")
df_1.to_csv('./data/train_dataframe_with_ensemble_domaine_enccre.csv')
print("split domaine enccre")
df_2 = split_class(df_2, 'domaine_enccre')
print("save dataframe")
df_2.to_csv('./data/train_dataframe_with_domaine_enccre.csv')
print("split normclass")
df_3 = split_class(df_3, 'normClass')
print("save dataframe")
df_3.to_csv('./data/train_dataframe_with_normClass_artfl.csv')
d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx")
d_2 = create_dict(df_2, 'domaine_enccre')
tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count'])
tosave.to_excel("domaine_enccre.xlsx")
d_3 = create_dict(df_3, 'normClass_artfl')
tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
tosave.to_excel("normClass_artfl.xlsx")
print(df_original.shape)
'''
import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import os
import argparse
import configparser
import csv
def create_dict(df, classColumnName):
return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
dictOfClassInstances = create_dict(df,classColumnName)
dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
keys = [*dictionary]
df_tmp = df[~ df[classColumnName].isin(keys)]
df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
return df
def resample_classes(df, classColumnName, numberOfInstances):
#random numberOfInstances elements
replace = False # sample without replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
return df.groupby(classColumnName, as_index=False).apply(fn)
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def training_bertFineTuning(chosen_model, model_path, sentences, labels, max_len, batch_size, epochs = 4):
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################
if chosen_model == 'bert' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
elif chosen_model == 'camembert':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(model_path , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
str(sent), # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
# This function also supports truncation and conversion
# to pytorch tensors, but I need to do padding, so I
# can't use these features.
#max_length = 128, # Truncate all sentences.
#return_tensors = 'pt', # Return pytorch tensors.
)
# Add the encoded sentence to the list.
input_ids.append(encoded_sent)
padded = []
for i in input_ids:
if len(i) > max_len:
padded.extend([i[:max_len]])
else:
padded.extend([i + [0] * (max_len - len(i))])
padded = np.array(padded)
# Create attention masks
attention_masks = []
# For each sentence...
for sent in padded:
# Create the attention mask.
# - If a token ID is 0, then it's padding, set the mask to 0.
# - If a token ID is > 0, then it's a real token, set the mask to 1.
att_mask = [int(token_id > 0) for token_id in sent]
# Store the attention mask for this sentence.
attention_masks.append(att_mask)
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.3, stratify = labels )
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.3, stratify = labels)
# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
print(' Selecting a model .....')
numberOfClasses = len(set(labels))
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
if chosen_model == 'bert':
model = BertForSequenceClassification.from_pretrained(
model_path, # the pretrained BERT checkpoint given in the configuration
num_labels = numberOfClasses, # one output label per class
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
elif chosen_model == 'camembert':
model = CamembertForSequenceClassification.from_pretrained(
model_path, # the pretrained CamemBERT checkpoint given in the configuration
num_labels = numberOfClasses, # one output label per class
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Run the model on the selected device (GPU if available, otherwise CPU).
model.to(device)
#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# The 'W' stands for 'weight decay fix'.
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store the average loss after each epoch so I can plot them.
loss_values = []
# For each epoch...
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
# Reset the total loss for this epoch.
total_loss = 0
# Put the model into training mode.
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 40 batches.
if step % 40 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from the dataloader.
#
# As I unpack the batch, I'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# This will return the loss (rather than the model output) because I
# have provided the `labels`.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
# The call to `model` always returns a tuple, so I need to pull the
# loss value out of the tuple.
loss = outputs[0]
# Accumulate the training loss over all of the batches so that I can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over the training data.
avg_train_loss = total_loss / len(train_dataloader)
# Store the loss value for plotting the learning curve.
loss_values.append(avg_train_loss)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(format_time(time.time() - t0)))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure the performance on
# the validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from dataloader
b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and
# speeding up validation
with torch.no_grad():
# Forward pass, calculate logit predictions.
# This will return the logits rather than the loss because we have
# not provided labels.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask)
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
logits = outputs[0]
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences.
tmp_eval_accuracy = flat_accuracy(logits, label_ids)
# Accumulate the total accuracy.
eval_accuracy += tmp_eval_accuracy
# Track the number of batches
nb_eval_steps += 1
# Report the final accuracy for this validation run.
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")
return model
'''print('Saving Model....')
model_save_name = config.get('model','modelName')
path = config.get('model','path')
#torch.save(model.state_dict(), os.path.join(path,model_save_name))
torch.save(model, os.path.join(path,model_save_name))'''
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("input_dataset")
parser.add_argument("conf_file")
parser.add_argument("output_path")
args = parser.parse_args()
INPUT_DATASET = args.input_dataset
CONF_FILE = args.conf_file
OUTPUT_PATH = args.output_path
config = configparser.ConfigParser()
config.read(CONF_FILE)
#dataPath = config.get('general','dataPath')
columnText = config.get('general','columnText')
columnClass = config.get('general','columnClass')
minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
model_path = config.get('model','path')
chosen_model = config.get('model','model')
max_len = int(config.get('model','max_len_sequences'))
batch_size = int(config.get('model','batch_size'))
epochs = int(config.get('model','epochs'))
df = pd.read_csv(INPUT_DATASET, sep="\t")
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
#df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)
#train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
#sentences = train_x[columnText].values
sentences = df[columnText].values
#labels = train_y.tolist()
labels = y.tolist()
#call train method
model = training_bertFineTuning(chosen_model,model_path, sentences, labels, max_len, batch_size, epochs)
#save the model
model_save_name = chosen_model+"_b"+batch_size+"_e"+epochs
torch.save(model, os.path.join(OUTPUT_PATH,model_save_name))
#print the model parameters
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))