From 1ce8c1901885620994872d08e026dcb0f794e19a Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Thu, 5 Aug 2021 19:46:45 +0200
Subject: [PATCH 1/6] [Feature] ADD bert vectorization for classification

---
 experiments/bert_experiments.py | 349 ++++++++++++++++++++++++++++++++
 experiments/parameters.conf     |  10 +
 experiments/requierements.txt   |   7 +
 3 files changed, 366 insertions(+)
 create mode 100644 experiments/bert_experiments.py
 create mode 100644 experiments/parameters.conf
 create mode 100644 experiments/requierements.txt

diff --git a/experiments/bert_experiments.py b/experiments/bert_experiments.py
new file mode 100644
index 0000000..b4e4bcd
--- /dev/null
+++ b/experiments/bert_experiments.py
@@ -0,0 +1,349 @@
+import pandas as pd
+import numpy as np
+import torch
+import transformers as ppb
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import statistics
+import os
+import sys
+import argparse
+import configparser
+from transformers import CamembertModel, CamembertTokenizer
+from transformers import FlaubertModel, FlaubertTokenizer
+
+
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import SGDClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.model_selection import GridSearchCV
+
+
+import matplotlib.pyplot as plt
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+import seaborn as sns
+
+
+
+
+
+
+def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
+    """Build a per-class evaluation table and save the confusion-matrix plot.
+
+    `y_pred` are the predictions for the ground truth `valid_y`; `classes` are
+    the classification-report keys and `classesName` the display names, in the
+    same order.  `clf`, `X_test` and `y_test` are only used to draw the figure
+    written to `pathSave`.
+
+    Returns (per-class DataFrame, accuracy, weighted-average report entry).
+    """
+    precision = []
+    recall = []
+    f1 = []
+    support = []
+
+    df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+    # classification_report expects (y_true, y_pred).  The two arguments used to
+    # be swapped, which exchanged precision with recall and counted predictions
+    # instead of true samples in `support`.
+    report = classification_report(valid_y, y_pred, output_dict = True)
+    for c in classes:
+        precision.append(report[c]['precision'])
+        recall.append(report[c]['recall'])
+        f1.append(report[c]['f1-score'])
+        support.append(report[c]['support'])
+
+    accuracy = report['accuracy']
+    weighted_avg = report['weighted avg']
+    # Rows of the confusion matrix are true classes, columns are predictions.
+    cnf_matrix = confusion_matrix(valid_y, y_pred)
+    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+    TP = np.diag(cnf_matrix)
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+
+    df['className'] = classesName
+    df['precision'] = precision
+    df['recall'] = recall
+    df['f1-score'] = f1
+    df['support'] = support
+    df['FP'] = FP
+    df['FN'] = FN
+    df['TP'] = TP
+    df['TN'] = TN
+
+    plt.rcParams["font.size"] = 3
+    plot_confusion_matrix(clf, X_test, y_test)
+    plt.savefig(pathSave)
+    return df, accuracy, weighted_avg
+
+
+
+def create_dict(df, classColumnName):
+    """Return a {class label: row count} dict for column `classColumnName`."""
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+    """Keep only the rows whose class has at least `threshold` instances.
+
+    Rows are selected with a boolean mask, so exact duplicate rows that belong
+    to a kept class survive (the former concat/drop_duplicates anti-join
+    silently dropped every copy of them).
+    """
+    counts = create_dict(df, classColumnName)
+    kept = [label for label, count in counts.items() if count >= threshold]
+    return df[df[classColumnName].isin(kept)]
+
+
+def split_class(df, columnProcessed):
+    """Return a new frame with one row per ';'-separated value of `columnProcessed`."""
+    i = 0
+    new_df = pd.DataFrame(columns= df.columns)
+    for index, row in df.iterrows():
+        # filter(None, ...) drops the empty strings produced by stray ';'
+        cls = filter(None, row[columnProcessed].split(';'))
+        cls = list(cls)
+        r = row
+        for categ in cls:
+            r[columnProcessed] = categ
+            # store the row's current values under the next integer label
+            new_df.loc[i] = r
+            i = i + 1
+
+    return new_df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+    """Randomly keep at most `numberOfInstances` rows of each class."""
+    # A class with fewer rows than the cap keeps all of them, in shuffled order.
+    # np.random is not seeded in this function.
+    replace = False  # sample WITHOUT replacement: no row is drawn twice
+
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+def select_classifier(argument):
+    """Return (estimator, parameter grid) for key `argument`; (None, None) if the key is unknown."""
+    classifiers = {
+                # lookup key -> estimator built with its default settings
+                'lr' :LogisticRegression(),
+                'sgd' :SGDClassifier(),
+                'svm' :SVC() ,
+                'decisionTree' :DecisionTreeClassifier(),
+                'rfc' :RandomForestClassifier(),
+                'knn' : KNeighborsClassifier()
+                }
+    # Candidate hyper-parameter values returned alongside each estimator.
+    param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
+    param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
+    param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
+    param_grid_lr = { "penalty":['none',"l2"]}
+    param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
+    param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
+
+    grid_params = {
+
+                'lr': param_grid_lr,
+                'sgd': param_grid_sgd ,
+                'svm': param_grid_svm,
+                'decisionTree': param_grid_decisionTree,
+                'rfc': param_grid_rfc ,
+                'knn': param_grid_knn,
+
+                }
+
+    return classifiers.get(argument), grid_params.get(argument)
+
+
+if __name__ == "__main__":
+
+    # Script entry point: embed every text with a pre-trained transformer, then
+    # grid-search a scikit-learn classifier on the embeddings and save reports.
+
+    # ------------------------------------------------------------------
+    # Supported pre-trained models
+    # ------------------------------------------------------------------
+    # name -> (model class, tokenizer class, checkpoint, tokenizer kwargs).
+    # Only the requested entry is loaded further down, so the three unused
+    # checkpoints are no longer downloaded and instantiated on every run.
+    pretrained = {
+        'bert': (
+            ppb.BertModel,
+            ppb.BertTokenizer,
+            'bert-base-uncased',
+            {},
+        ),
+        'distilbert': (
+            ppb.DistilBertModel,
+            ppb.DistilBertTokenizer,
+            'distilbert-base-uncased',
+            {},
+        ),
+        'camembert': (
+            CamembertModel,
+            CamembertTokenizer,
+            'camembert/camembert-base',
+            {},
+        ),
+        'flaubert': (
+            FlaubertModel,
+            FlaubertTokenizer,
+            'flaubert/flaubert_base_cased',
+            {'do_lowercase': False},
+        ),
+    }
+
+    # ------------------------------------------------------------------
+    # Command line and configuration
+    # ------------------------------------------------------------------
+    print('ok')
+    parser = argparse.ArgumentParser()
+    # `choices` rejects unknown names up front.  Before, a value such as the
+    # advertised "distilBert" matched no branch and crashed with a NameError.
+    parser.add_argument(
+        "modelName",
+        choices=sorted(pretrained),
+        help="bert or distilbert or camembert or flaubert",
+    )
+    parser.add_argument(
+        "classifier",
+        choices=['lr', 'knn', 'rfc', 'decisionTree', 'sgd', 'svm'],
+        help="lr or knn or rfc or decisionTree or sgd or svm",
+    )
+
+    args = parser.parse_args()
+    arg = args.modelName
+    classifier = args.classifier
+
+    config = configparser.ConfigParser()
+    config.read('parameters.conf')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    dataPath = config.get('data','dataPath')
+    columnText = config.get('data','columnText')
+    columnClass = config.get('data','columnClass')
+
+    # Every output of this run goes to reports/<columnClass>/<min>_<max>/.
+    dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
+    report_dir = os.path.join('reports', columnClass, dir_name_report)
+    os.makedirs(report_dir, exist_ok=True)
+    report_stem = os.path.join(report_dir, arg + '_' + classifier)
+
+    # ------------------------------------------------------------------
+    # Data
+    # ------------------------------------------------------------------
+    # read data
+    print(dataPath)
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+
+    print(df.head())
+    print(df.shape)
+
+    #encode labels
+    df = df[df[columnClass] != 'unclassified']
+    y  = df[columnClass]
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+    # Read the text column named in parameters.conf; `columnText` used to be
+    # parsed and then ignored in favour of a hard-coded 'firstParagraph'.
+    sentences = df[columnText]
+
+    # ------------------------------------------------------------------
+    # Features Extraction
+    # ------------------------------------------------------------------
+    model_class, tokenizer_class, weights, tokenizer_kwargs = pretrained[arg]
+    tokenizer = tokenizer_class.from_pretrained(weights, **tokenizer_kwargs)
+    model = model_class.from_pretrained(weights)
+
+    tokenized = sentences.apply(
+        lambda x: tokenizer.encode(
+            x, add_special_tokens=True, max_length=512, truncation=True
+        )
+    )
+
+    # padding the sequences
+    lengths = [len(ids) for ids in tokenized.values]
+    max_len = max(lengths, default=0)
+    # Pad with the tokenizer's own padding id: 0 is not the padding id of every
+    # tokenizer, and it can be the id of a real (special) token.
+    pad_id = tokenizer.pad_token_id
+    padded = np.array(
+        [ids + [pad_id] * (max_len - len(ids)) for ids in tokenized.values]
+    )
+
+    # attention mask
+    # Derived from the true lengths instead of `padded != 0`, so a genuine token
+    # whose id is 0 is never masked out.
+    attention_mask = np.array([[1] * n + [0] * (max_len - n) for n in lengths])
+
+    # get features
+    input_ids = torch.tensor(padded)
+    attention_mask = torch.tensor(attention_mask)
+
+    # Inference only: no gradients are needed for feature extraction.
+    with torch.no_grad():
+        last_hidden_states = model(input_ids, attention_mask=attention_mask)
+
+    # The hidden state of the first token of each sequence is its feature vector.
+    features = last_hidden_states[0][:,0,:].numpy()
+    print(features.shape)
+
+    train_x, test_x, train_y, test_y = train_test_split(
+        features, y, test_size=0.33, random_state=42, stratify=y
+    )
+
+    # ------------------------------------------------------------------
+    # classification
+    # ------------------------------------------------------------------
+    clf, grid_param = select_classifier(classifier)
+
+    print(features)
+
+    # refit=True retrains the best parameter set on the whole training split.
+    clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
+
+    clf.fit(train_x, train_y)
+
+    # ------------------------------------------------------------------
+    # evaluation
+    # ------------------------------------------------------------------
+    y_pred = clf.predict(test_x)
+
+    # Report keys are the encoded class ids rendered as strings.
+    class_ids = encoder.transform(encoder.classes_)
+    # test_y is passed twice: once for the figure, once as the ground truth.
+    report, accuracy, weighted_avg = evaluate_model(
+        clf,
+        test_x,
+        test_y,
+        y_pred,
+        test_y,
+        [str(e) for e in class_ids],
+        encoder.classes_,
+        report_stem + '.pdf',
+    )
+
+    report.to_csv(report_stem + '.csv')
+
+    # Write the summary with print(..., file=f) rather than rebinding sys.stdout,
+    # so the console stream cannot be left redirected if a line below raises.
+    with open(report_stem + '.txt', 'w') as f:
+        print('accuracy : {}'.format(accuracy), file=f)
+        print('weighted_Precision : {}'.format(weighted_avg['precision']), file=f)
+        print('weighted_Recall    : {}'.format(weighted_avg['recall']), file=f)
+        print('weighted_F-score   : {}'.format(weighted_avg['f1-score']), file=f)
+        print('weighted_Support   : {}'.format(weighted_avg['support']), file=f)
+        print(dict(zip(encoder.classes_, class_ids)), file=f)
diff --git a/experiments/parameters.conf b/experiments/parameters.conf
new file mode 100644
index 0000000..df584e4
--- /dev/null
+++ b/experiments/parameters.conf
@@ -0,0 +1,10 @@
+[general]
+
+minOfInstancePerClass = 1200
+maxOfInstancePerClass = 7
+
+[data]
+
+dataPath = ../Data/dataframe_with_ensemble_domaine_enccre.csv
+columnText = contentWithoutClass
+columnClass = ensemble_domaine_enccre
diff --git a/experiments/requierements.txt b/experiments/requierements.txt
new file mode 100644
index 0000000..076fef8
--- /dev/null
+++ b/experiments/requierements.txt
@@ -0,0 +1,7 @@
+transformers==4.3.2
+sentencepiece
+scikit-learn
+pandas
+numpy
+torch==1.8.1
+
-- 
GitLab


From 7217823686c4ef8dcbeb7a05fc3156e2512445b0 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Thu, 5 Aug 2021 20:04:02 +0200
Subject: [PATCH 2/6] [UPDATE] updating readme

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b518c4c..6535fd5 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,13 @@ In order to run the classifiers, use the following command :
 
     python experimentsClassicClassifiers.py <dataset_tsv_file> <content_column_name> <labels_column_name> <min_sample_per_class> <max_sample_per_class>
 
+In order to run Classification with pre-trained models, use the following command :
+
+   
+    cd experiments/
+    
+    python bert_experiments.py  <model_Name> <classifier> 
 
 # Acknowledgment
 
-The authors are grateful to the ASLAN project (ANR-10-LABX-0081) of the UniversitÃ© de Lyon, for its financial support within the French program "Investments for the Future" operated by the National Research Agency (ANR).
\ No newline at end of file
+The authors are grateful to the ASLAN project (ANR-10-LABX-0081) of the Université de Lyon, for its financial support within the French program "Investments for the Future" operated by the National Research Agency (ANR).
-- 
GitLab


From 21df7dd30824fa80778a6eb1144e64c858aeea6c Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sat, 11 Sep 2021 14:26:24 +0200
Subject: [PATCH 3/6] [ADD] Notebook Bert Fine Tuning

---
 BertFineTuning_.ipynb | 1098 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1098 insertions(+)
 create mode 100644 BertFineTuning_.ipynb

diff --git a/BertFineTuning_.ipynb b/BertFineTuning_.ipynb
new file mode 100644
index 0000000..6216aeb
--- /dev/null
+++ b/BertFineTuning_.ipynb
@@ -0,0 +1,1098 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "name": "BertFineTuning_.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8hzEGHl7gmzk"
+      },
+      "source": [
+        "## Setup GPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "dPOU-Efhf4ui",
+        "outputId": "1e3142a8-6351-43f3-9147-68406520b7ee"
+      },
+      "source": [
+        "import torch\n",
+        "\n",
+        "# If there's a GPU available...\n",
+        "if torch.cuda.is_available():    \n",
+        "\n",
+        "    # Tell PyTorch to use the GPU.    \n",
+        "    device = torch.device(\"cuda\")\n",
+        "\n",
+        "    print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
+        "\n",
+        "    print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
+        "\n",
+        "# If not...\n",
+        "else:\n",
+        "    print('No GPU available, using the CPU instead.')\n",
+        "    device = torch.device(\"cpu\")"
+      ],
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "No GPU available, using the CPU instead.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Jr-S9yYIgGkA"
+      },
+      "source": [
+        "## Install packages"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "pwmZ5bBvgGNh",
+        "outputId": "79c5fb08-a9f4-41bc-eb4d-ab448c5fb4a7"
+      },
+      "source": [
+        "pip install transformers"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.10.0)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
+            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n",
+            "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n",
+            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.0)\n",
+            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.4)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (5.4.1)\n",
+            "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n",
+            "Requirement already satisfied: huggingface-hub>=0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.16)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n",
+            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers) (3.7.4.3)\n",
+            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n",
+            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.5.0)\n",
+            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
+            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
+            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
+            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "VFXEpG00gXkL",
+        "outputId": "2336f39a-78b7-4118-e754-508d876c51f9"
+      },
+      "source": [
+        "pip install sentencepiece"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "12SA-qPFgsVo"
+      },
+      "source": [
+        "## Utils functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "WkIVcabUgxIl"
+      },
+      "source": [
+        "def create_dict(df, classColumnName):\n",
+        "    return dict(df[classColumnName].value_counts())\n",
+        "\n",
+        "def remove_weak_classes(df, classColumnName, threshold):\n",
+        "\n",
+        "    dictOfClassInstances = create_dict(df,classColumnName)\n",
+        "\n",
+        "\n",
+        "    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }\n",
+        "    keys = [*dictionary]\n",
+        "    df_tmp = df[~ df[classColumnName].isin(keys)]\n",
+        "    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)\n",
+        "    return df\n",
+        "\n",
+        "\n",
+        "def resample_classes(df, classColumnName, numberOfInstances):\n",
+        "    \n",
+        "    #random numberOfInstances elements\n",
+        "    replace = False  # sample without replacement\n",
+        "\n",
+        "    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n",
+        "    return df.groupby(classColumnName, as_index=False).apply(fn)\n",
+        "    "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "c5QKcXulhNJ-"
+      },
+      "source": [
+        "## Load Data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vonJ-d4Qg1g5"
+      },
+      "source": [
+        "import pandas as pd \n",
+        "import numpy as np\n",
+        "from sklearn import preprocessing\n",
+        "from sklearn.model_selection import train_test_split"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ouU5usvXg4PA"
+      },
+      "source": [
+        "dataPath = 'dataframe_with_ensemble_domaine_enccre.csv'\n",
+        "columnText = 'contentWithoutClass'\n",
+        "columnClass = 'ensemble_domaine_enccre'\n",
+        "minOfInstancePerClass = 200\n",
+        "maxOfInstancePerClass = 1500"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "5u1acjunhoxe"
+      },
+      "source": [
+        "df = pd.read_csv(dataPath)\n",
+        "df = remove_weak_classes(df, columnClass, minOfInstancePerClass)\n",
+        "df = resample_classes(df, columnClass, maxOfInstancePerClass)\n",
+        "df = df[df[columnClass] != 'unclassified']"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zrjZvs2dhzAy"
+      },
+      "source": [
+        "y  = df[columnClass]\n",
+        "numberOfClasses = y.nunique()\n",
+        "encoder = preprocessing.LabelEncoder()\n",
+        "y = encoder.fit_transform(y)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "u9AxxaA_h1CM"
+      },
+      "source": [
+        "train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Xt_PhH_6h1_3"
+      },
+      "source": [
+        "sentences = train_x[columnText].values\n",
+        "labels = train_y.tolist()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Gs4Agx_5h43M"
+      },
+      "source": [
+        "# Model\n",
+        "## Tokenisation & Input Formatting"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YZ5PhEYZiCEA"
+      },
+      "source": [
+        "tokeniser_bert = 'bert-base-multilingual-cased'\n",
+        "tokeniser_camembert = 'camembert-base'\n",
+        "\n",
+        "model_bert =  \"bert-base-multilingual-cased\"\n",
+        "model_camembert = 'camembert-base'"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "C4bigx_3ibuN",
+        "outputId": "9d54db26-9920-4a92-bb1e-4534f287140f"
+      },
+      "source": [
+        "from transformers import BertTokenizer, CamembertTokenizer\n",
+        "\n",
+        "# Load the BERT tokenizer.\n",
+        "print('Loading BERT tokenizer...')\n",
+        "tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Loading BERT tokenizer...\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "5hNod5X9jDZN",
+        "outputId": "1166b782-d384-4388-de21-21091dc9f925"
+      },
+      "source": [
+        " # Tokenize all of the sentences and map the tokens to their word IDs.\n",
+        "input_ids = []\n",
+        "\n",
+        "# For every sentence...\n",
+        "for sent in sentences:\n",
+        "    # `encode` will:\n",
+        "    #   (1) Tokenize the sentence.\n",
+        "    #   (2) Prepend the `[CLS]` token to the start.\n",
+        "    #   (3) Append the `[SEP]` token to the end.\n",
+        "    #   (4) Map tokens to their IDs.\n",
+        "    encoded_sent = tokenizer.encode(\n",
+        "                        sent,                      # Sentence to encode.\n",
+        "                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
+        "\n",
+        "                        # This function also supports truncation and conversion\n",
+        "                        # to pytorch tensors, but I need to do padding, so I\n",
+        "                        # can't use these features.\n",
+        "                        #max_length = 128,          # Truncate all sentences.\n",
+        "                        #return_tensors = 'pt',     # Return pytorch tensors.\n",
+        "                   )\n",
+        "    \n",
+        "    # Add the encoded sentence to the list.\n",
+        "    input_ids.append(encoded_sent)\n",
+        "\n"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "W9EWv5JvjGH3",
+        "outputId": "9072122d-3586-40fe-9d75-5b6e9035d6d2"
+      },
+      "source": [
+        "print('Max sentence length: ', max([len(sen) for sen in input_ids])) "
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Max sentence length:  3462\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xh1TQJyvjOx5"
+      },
+      "source": [
+        "max_len = 180\n",
+        "padded = []\n",
+        "for i in input_ids:\n",
+        "\n",
+        "  if len(i) > max_len:\n",
+        "    padded.extend([i[:max_len]])\n",
+        "  else:\n",
+        "    padded.extend([i + [0] * (max_len - len(i))])\n",
+        "\n",
+        "\n",
+        "padded = input_ids = np.array(padded)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZiwY6gn0jUkD"
+      },
+      "source": [
+        " # Create attention masks\n",
+        "attention_masks = []\n",
+        "\n",
+        "# For each sentence...\n",
+        "for sent in padded:\n",
+        "    \n",
+        "    # Create the attention mask.\n",
+        "    #   - If a token ID is 0, then it's padding, set the mask to 0.\n",
+        "    #   - If a token ID is > 0, then it's a real token, set the mask to 1.\n",
+        "    att_mask = [int(token_id > 0) for token_id in sent]\n",
+        "    \n",
+        "    # Store the attention mask for this sentence.\n",
+        "    attention_masks.append(att_mask)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "oBTR5AfAjXJe"
+      },
+      "source": [
+        "# Use 90% for training and 10% for validation.\n",
+        "train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, \n",
+        "                                                            random_state=2018, test_size=0.1, stratify = labels )\n",
+        "# Do the same for the masks.\n",
+        "train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,\n",
+        "                                             random_state=2018, test_size=0.1, stratify = labels)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "b9Mw5kq3jhTb"
+      },
+      "source": [
+        "# Convert all inputs and labels into torch tensors, the required datatype \n",
+        "# for my model.\n",
+        "train_inputs = torch.tensor(train_inputs)\n",
+        "validation_inputs = torch.tensor(validation_inputs)\n",
+        "\n",
+        "train_labels = torch.tensor(train_labels)\n",
+        "validation_labels = torch.tensor(validation_labels)\n",
+        "\n",
+        "train_masks = torch.tensor(train_masks)\n",
+        "validation_masks = torch.tensor(validation_masks)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "UfFWzbENjnkw"
+      },
+      "source": [
+        "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
+        "\n",
+        "# The DataLoader needs to know the batch size for training, so I specify it here.\n",
+        "# For fine-tuning BERT on a specific task, the authors recommend a batch size of\n",
+        "# 16 or 32.\n",
+        "\n",
+        "batch_size = 32\n",
+        "\n",
+        "# Create the DataLoader for training set.\n",
+        "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n",
+        "train_sampler = RandomSampler(train_data)\n",
+        "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n",
+        "\n",
+        "# Create the DataLoader for validation set.\n",
+        "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n",
+        "validation_sampler = SequentialSampler(validation_data)\n",
+        "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "x45JNGqhkUn2"
+      },
+      "source": [
+        "## Training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 463
+        },
+        "id": "C7M2Er1ajsTf",
+        "outputId": "fe4c13b7-5157-49b4-e878-6d7676d4d1a3"
+      },
+      "source": [
+        "from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification\n",
+        "\n",
+        "# Load BertForSequenceClassification, the pretrained BERT model with a single \n",
+        "# linear classification layer on top.\n",
+        "\n",
+        "model = BertForSequenceClassification.from_pretrained(\n",
+        "    model_bert, # Use the 12-layer BERT model, with an uncased vocab.\n",
+        "    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.\n",
+        "                    # You can increase this for multi-class tasks.   \n",
+        "    output_attentions = False, # Whether the model returns attentions weights.\n",
+        "    output_hidden_states = False, # Whether the model returns all hidden-states.\n",
+        ")\n",
+        "\n",
+        "# Tell pytorch to run this model on the GPU.\n",
+        "model.cuda()"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
+            "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+          ]
+        },
+        {
+          "output_type": "error",
+          "ename": "RuntimeError",
+          "evalue": "ignored",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+            "\u001b[0;32m<ipython-input-120-80c23ac5f353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;31m# Tell pytorch to run this model on the GPU.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mcuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m    635\u001b[0m             \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    636\u001b[0m         \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    639\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    550\u001b[0m                 \u001b[0;31m# `with torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    551\u001b[0m                 \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m                     \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    553\u001b[0m                 \u001b[0mshould_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    554\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mshould_use_set_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m    635\u001b[0m             \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    636\u001b[0m         \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    639\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 91.81 MiB free; 10.63 GiB reserved in total by PyTorch)"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xd_cG-8pj4Iw"
+      },
+      "source": [
+        "#Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n",
+        "# I believe the 'W' stands for 'Weight Decay fix'.\n",
+        "optimizer = AdamW(model.parameters(),\n",
+        "                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n",
+        "                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.\n",
+        "                )"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "65G-uHuLj4_6"
+      },
+      "source": [
+        "from transformers import get_linear_schedule_with_warmup\n",
+        "\n",
+        "# Number of training epochs (authors recommend between 2 and 4)\n",
+        "epochs = 4\n",
+        "\n",
+        "# Total number of training steps is number of batches * number of epochs.\n",
+        "total_steps = len(train_dataloader) * epochs\n",
+        "\n",
+        "# Create the learning rate scheduler.\n",
+        "scheduler = get_linear_schedule_with_warmup(optimizer, \n",
+        "                                            num_warmup_steps = 0, # Default value in run_glue.py\n",
+        "                                            num_training_steps = total_steps)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lHSOuwcMj9jf"
+      },
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "# Function to calculate the accuracy of our predictions vs labels\n",
+        "def flat_accuracy(preds, labels):\n",
+        "    pred_flat = np.argmax(preds, axis=1).flatten()\n",
+        "    labels_flat = labels.flatten()\n",
+        "    return np.sum(pred_flat == labels_flat) / len(labels_flat) "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Z0S3br-7kASm"
+      },
+      "source": [
+        "import time\n",
+        "import datetime\n",
+        "\n",
+        "def format_time(elapsed):\n",
+        "    '''\n",
+        "    Takes a time in seconds and returns a string hh:mm:ss\n",
+        "    '''\n",
+        "    # Round to the nearest second.\n",
+        "    elapsed_rounded = int(round((elapsed)))\n",
+        "    \n",
+        "    # Format as hh:mm:ss\n",
+        "    return str(datetime.timedelta(seconds=elapsed_rounded))"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SbHBbYpwkKaA"
+      },
+      "source": [
+        "import random\n",
+        "\n",
+        "# This training code is based on the `run_glue.py` script here:\n",
+        "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
+        "\n",
+        "\n",
+        "# Set the seed value all over the place to make this reproducible.\n",
+        "seed_val = 42\n",
+        "\n",
+        "random.seed(seed_val)\n",
+        "np.random.seed(seed_val)\n",
+        "torch.manual_seed(seed_val)\n",
+        "torch.cuda.manual_seed_all(seed_val)\n",
+        "\n",
+        "# Store the average loss after each epoch so I can plot them.\n",
+        "loss_values = []\n",
+        "\n",
+        "# For each epoch...\n",
+        "for epoch_i in range(0, epochs):\n",
+        "    \n",
+        "    # ========================================\n",
+        "    #               Training\n",
+        "    # ========================================\n",
+        "    \n",
+        "    # Perform one full pass over the training set.\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
+        "    print('Training...')\n",
+        "\n",
+        "    # Measure how long the training epoch takes.\n",
+        "    t0 = time.time()\n",
+        "\n",
+        "    # Reset the total loss for this epoch.\n",
+        "    total_loss = 0\n",
+        "\n",
+        "    # Put the model into training mode.\n",
+        "    model.train()\n",
+        "\n",
+        "    # For each batch of training data...\n",
+        "    for step, batch in enumerate(train_dataloader):\n",
+        "\n",
+        "        # Progress update every 40 batches.\n",
+        "        if step % 40 == 0 and not step == 0:\n",
+        "            # Format the elapsed time as hh:mm:ss.\n",
+        "            elapsed = format_time(time.time() - t0)\n",
+        "            \n",
+        "            # Report progress.\n",
+        "            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n",
+        "\n",
+        "        # Unpack this training batch from the dataloader. \n",
+        "        #\n",
+        "        # As I unpack the batch, I'll also copy each tensor to the GPU using the \n",
+        "        # `to` method.\n",
+        "        #\n",
+        "        # `batch` contains three pytorch tensors:\n",
+        "        #   [0]: input ids \n",
+        "        #   [1]: attention masks\n",
+        "        #   [2]: labels \n",
+        "        b_input_ids = batch[0].to(device)\n",
+        "        b_input_mask = batch[1].to(device)\n",
+        "        b_labels = batch[2].to(device)\n",
+        "\n",
+        "        # Always clear any previously calculated gradients before performing a\n",
+        "        # backward pass. PyTorch doesn't do this automatically because \n",
+        "        # accumulating the gradients is \"convenient while training RNNs\". \n",
+        "        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n",
+        "        model.zero_grad()        \n",
+        "\n",
+        "        # Perform a forward pass (evaluate the model on this training batch).\n",
+        "        # This will return the loss (rather than the model output) because I\n",
+        "        # have provided the `labels`.\n",
+        "        # The documentation for this `model` function is here: \n",
+        "        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n",
+        "        outputs = model(b_input_ids, \n",
+        "                    token_type_ids=None, \n",
+        "                    attention_mask=b_input_mask, \n",
+        "                    labels=b_labels)\n",
+        "        \n",
+        "        # The call to `model` always returns a tuple, so I need to pull the \n",
+        "        # loss value out of the tuple.\n",
+        "        loss = outputs[0]\n",
+        "\n",
+        "        # Accumulate the training loss over all of the batches so that I can\n",
+        "        # calculate the average loss at the end. `loss` is a Tensor containing a\n",
+        "        # single value; the `.item()` function just returns the Python value \n",
+        "        # from the tensor.\n",
+        "        total_loss += loss.item()\n",
+        "\n",
+        "        # Perform a backward pass to calculate the gradients.\n",
+        "        loss.backward()\n",
+        "\n",
+        "        # Clip the norm of the gradients to 1.0.\n",
+        "        # This is to help prevent the \"exploding gradients\" problem.\n",
+        "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
+        "\n",
+        "        # Update parameters and take a step using the computed gradient.\n",
+        "        # The optimizer dictates the \"update rule\"--how the parameters are\n",
+        "        # modified based on their gradients, the learning rate, etc.\n",
+        "        optimizer.step()\n",
+        "\n",
+        "        # Update the learning rate.\n",
+        "        scheduler.step()\n",
+        "\n",
+        "    # Calculate the average loss over the training data.\n",
+        "    avg_train_loss = total_loss / len(train_dataloader)            \n",
+        "    \n",
+        "    # Store the loss value for plotting the learning curve.\n",
+        "    loss_values.append(avg_train_loss)\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print(\"  Average training loss: {0:.2f}\".format(avg_train_loss))\n",
+        "    print(\"  Training epoch took: {:}\".format(format_time(time.time() - t0)))\n",
+        "        \n",
+        "    # ========================================\n",
+        "    #               Validation\n",
+        "    # ========================================\n",
+        "    # After the completion of each training epoch, measure the performance on\n",
+        "    # the validation set.\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print(\"Running Validation...\")\n",
+        "\n",
+        "    t0 = time.time()\n",
+        "\n",
+        "    # Put the model in evaluation mode--the dropout layers behave differently\n",
+        "    # during evaluation.\n",
+        "    model.eval()\n",
+        "\n",
+        "    # Tracking variables \n",
+        "    eval_loss, eval_accuracy = 0, 0\n",
+        "    nb_eval_steps, nb_eval_examples = 0, 0\n",
+        "\n",
+        "    # Evaluate data for one epoch\n",
+        "    for batch in validation_dataloader:\n",
+        "        \n",
+        "        # Add batch to GPU\n",
+        "        batch = tuple(t.to(device) for t in batch)\n",
+        "        \n",
+        "        # Unpack the inputs from dataloader\n",
+        "        b_input_ids, b_input_mask, b_labels = batch\n",
+        "        \n",
+        "        # Telling the model not to compute or store gradients, saving memory and\n",
+        "        # speeding up validation\n",
+        "        with torch.no_grad():        \n",
+        "\n",
+        "            # Forward pass, calculate logit predictions.\n",
+        "            # This will return the logits rather than the loss because we have\n",
+        "            # not provided labels.\n",
+        "            # token_type_ids is the same as the \"segment ids\", which \n",
+        "            # differentiates sentence 1 and 2 in 2-sentence tasks.\n",
+        "            # The documentation for this `model` function is here: \n",
+        "            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n",
+        "            outputs = model(b_input_ids, \n",
+        "                            token_type_ids=None, \n",
+        "                            attention_mask=b_input_mask)\n",
+        "        \n",
+        "        # Get the \"logits\" output by the model. The \"logits\" are the output\n",
+        "        # values prior to applying an activation function like the softmax.\n",
+        "        logits = outputs[0]\n",
+        "\n",
+        "        # Move logits and labels to CPU\n",
+        "        logits = logits.detach().cpu().numpy()\n",
+        "        label_ids = b_labels.to('cpu').numpy()\n",
+        "        \n",
+        "        # Calculate the accuracy for this batch of test sentences.\n",
+        "        tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n",
+        "        \n",
+        "        # Accumulate the total accuracy.\n",
+        "        eval_accuracy += tmp_eval_accuracy\n",
+        "\n",
+        "        # Track the number of batches\n",
+        "        nb_eval_steps += 1\n",
+        "\n",
+        "    # Report the final accuracy for this validation run.\n",
+        "    print(\"  Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n",
+        "    print(\"  Validation took: {:}\".format(format_time(time.time() - t0)))\n",
+        "\n",
+        "print(\"\")\n",
+        "print(\"Training complete!\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "VJwyfmakkQyj"
+      },
+      "source": [
+        "## Test"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "VAyzmfhZCGZo"
+      },
+      "source": [
+        "sentences_test = test_x[columnText].values\n",
+        "labels_test = test_y.tolist()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lZFXr_sdCJcb"
+      },
+      "source": [
+        "# Tokenize all of the sentences and map the tokens to their word IDs.\n",
+        "input_ids_test = []\n",
+        "# For every sentence...\n",
+        "for sent in sentences_test:\n",
+        "    # `encode` will:\n",
+        "    #   (1) Tokenize the sentence.\n",
+        "    #   (2) Prepend the `[CLS]` token to the start.\n",
+        "    #   (3) Append the `[SEP]` token to the end.\n",
+        "    #   (4) Map tokens to their IDs.\n",
+        "    encoded_sent = tokenizer.encode(\n",
+        "                        sent,                      # Sentence to encode.\n",
+        "                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
+        "                )\n",
+        "    \n",
+        "    input_ids_test.append(encoded_sent)\n",
+        "\n",
+        "# Pad our input tokens\n",
+        "padded_test = []\n",
+        "for i in input_ids_test:\n",
+        "\n",
+        "  if len(i) > max_len:\n",
+        "    padded_test.extend([i[:max_len]])\n",
+        "  else:\n",
+        "    padded_test.extend([i + [0] * (max_len - len(i))])\n",
+        "input_ids_test = np.array(padded_test)\n",
+        "\n",
+        "# Create attention masks\n",
+        "attention_masks = []\n",
+        "\n",
+        "# Create a mask of 1s for each token followed by 0s for padding\n",
+        "for seq in input_ids_test:\n",
+        "    seq_mask = [float(i>0) for i in seq]\n",
+        "    attention_masks.append(seq_mask) \n",
+        "\n",
+        "# Convert to tensors.\n",
+        "prediction_inputs = torch.tensor(input_ids_test)\n",
+        "prediction_masks = torch.tensor(attention_masks)\n",
+        "prediction_labels = torch.tensor(labels_test)\n",
+        "\n",
+        "# Set the batch size.  \n",
+        "batch_size = 32  \n",
+        "\n",
+        "# Create the DataLoader.\n",
+        "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n",
+        "prediction_sampler = SequentialSampler(prediction_data)\n",
+        "prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SUDcxi03Cmf-"
+      },
+      "source": [
+        "print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))\n",
+        "\n",
+        "# Put model in evaluation mode\n",
+        "model.eval()\n",
+        "\n",
+        "# Tracking variables \n",
+        "predictions_test , true_labels = [], []\n",
+        "\n",
+        "# Predict \n",
+        "for batch in prediction_dataloader:\n",
+        "# Add batch to GPU\n",
+        "    batch = tuple(t.to(device) for t in batch)\n",
+        "    \n",
+        "    # Unpack the inputs from the dataloader\n",
+        "    b_input_ids, b_input_mask, b_labels = batch\n",
+        "    \n",
+        "    # Telling the model not to compute or store gradients, saving memory and \n",
+        "    # speeding up prediction\n",
+        "    with torch.no_grad():\n",
+        "        # Forward pass, calculate logit predictions\n",
+        "        outputs = model(b_input_ids, token_type_ids=None, \n",
+        "                        attention_mask=b_input_mask)\n",
+        "\n",
+        "    logits = outputs[0]\n",
+        "    #print(logits)\n",
+        "\n",
+        "    # Move logits and labels to CPU\n",
+        "    logits = logits.detach().cpu().numpy()\n",
+        "    label_ids = b_labels.to('cpu').numpy()\n",
+        "    #print(logits)\n",
+        "    \n",
+        "    # Store predictions and true labels\n",
+        "    predictions_test.append(logits)\n",
+        "    true_labels.append(label_ids)\n",
+        "\n",
+        "print('    DONE.')"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "c3i7szp3Cn5u"
+      },
+      "source": [
+        "from sklearn.metrics import *\n",
+        "\n",
+        "pred_labels = []\n",
+        "\n",
+        "# Evaluate each test batch using many metrics\n",
+        "print('Calculating the matrics for each batch...')\n",
+        "\n",
+        "for i in range(len(true_labels)):\n",
+        "  \n",
+        "  # The predictions for this batch are an ndarray of logits with one column per \n",
+        "  # class. Pick the column with the highest value for each row and turn this\n",
+        "  # into an array of predicted class indices.\n",
+        "  pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n",
+        "  pred_labels.append(pred_labels_i)\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "0bU9-DsBCxSO"
+      },
+      "source": [
+        "pred_labels_ = [item for sublist in pred_labels for item in sublist]\n",
+        "true_labels_ = [item for sublist in true_labels for item in sublist]\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZUM_U2QlC4K5"
+      },
+      "source": [
+        "### Report & Evaluation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "d5n84N0xCfcU"
+      },
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "from sklearn.metrics import plot_confusion_matrix\n",
+        "from sklearn.metrics import confusion_matrix\n",
+        "from sklearn.metrics import classification_report\n",
+        "import seaborn as sns"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "v4hXk-KjC-nq"
+      },
+      "source": [
+        "# classification_report expects (y_true, y_pred); the order decides which is precision and which is recall.\n",
+        "report = classification_report(true_labels_, pred_labels_, output_dict = True)\n",
+        "accuracy = report['accuracy']\n",
+        "weighted_avg = report['weighted avg']"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xETMy1L6DAa5"
+      },
+      "source": [
+        "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "classesName = encoder.classes_"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dPjV_5g8DDQy"
+      },
+      "source": [
+        "precision = []\n",
+        "recall = []\n",
+        "f1 = []\n",
+        "support = []\n",
+        "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+        "for c in classes:\n",
+        "  precision.append(report[c]['precision'])\n",
+        "  recall.append(report[c]['recall'])\n",
+        "  f1.append(report[c]['f1-score'])\n",
+        "  support.append(report[c]['support'])\n",
+        "\n",
+        "accuracy = report['accuracy']\n",
+        "weighted_avg = report['weighted avg']\n",
+        "cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
+        "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+        "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+        "TP = np.diag(cnf_matrix)\n",
+        "TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "\n",
+        "dff['className'] = classesName\n",
+        "dff['precision'] = precision\n",
+        "dff['recall'] = recall\n",
+        "dff['f1-score'] = f1\n",
+        "dff['support'] = support\n",
+        "dff['FP'] = FP\n",
+        "dff['FN'] = FN\n",
+        "dff['TP'] = TP\n",
+        "dff['TN'] = TN\n",
+        "  \n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vslzi9bHDKcv"
+      },
+      "source": [
+        "print(weighted_avg)\n",
+        "print(accuracy)\n",
+        "print(dff)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
-- 
GitLab


From e7f6f159fa4b99afbcec38a6c14a8745765f8775 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Sun, 12 Sep 2021 00:45:06 +0200
Subject: [PATCH 4/6] [ADD] Training mode Bert Fine Tuning

---
 training_bertFineTuning.py | 465 +++++++++++++++++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 training_bertFineTuning.py

diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
new file mode 100644
index 0000000..72a5929
--- /dev/null
+++ b/training_bertFineTuning.py
@@ -0,0 +1,465 @@
+import torch
+import pandas as pd
+import numpy as np
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizer, CamembertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
+from transformers import get_linear_schedule_with_warmup
+import time
+import datetime
+import random
+
+
+
+
+
+###########################################################################
+########################## Utils Functions ################################
+###########################################################################
+
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+
+def remove_weak_classes(df, classColumnName, threshold):
+
+    dictOfClassInstances = create_dict(df,classColumnName)
+
+
+    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
+    keys = [*dictionary]
+    df_tmp = df[~ df[classColumnName].isin(keys)]
+    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)
+    return df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+
+    #random numberOfInstances elements
+    replace = False  # with replacement
+
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+##############################################################################################################
+########################## Setup GPU #########################################################################
+##############################################################################################################
+
+# If there's a GPU available...
+if torch.cuda.is_available():
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+
+
+#############################################################################################################
+########################## parameters ###################################################################
+###########################################################################################################
+
+config = configparser.ConfigParser()
+config.read('settings.conf')
+
+dataPath = config.get('general','dataPath')
+columnText = config.get('general','columnText')
+columnClass = config.get('general','columnClass')
+
+minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+chosen_tokeniser = config.get('model','tokeniser')
+chosen_model = config.get('model','model')
+
+max_len = int(config.get('model','max_len_sequences'))
+
+
+#############################################################################################################
+########################## Load Data ###################################################################
+###########################################################################################################
+
+
+
+
+df = pd.read_csv(dataPath)
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+df = resample_classes(df, columnClass, maxOfInstancePerClass)
+df = df[df[columnClass] != 'unclassified']
+
+
+
+
+
+y  = df[columnClass]
+numberOfClasses = y.nunique()
+encoder = preprocessing.LabelEncoder()
+y = encoder.fit_transform(y)
+
+
+
+sentences = train_x[columnText].values
+labels = train_y.tolist()
+
+
+
+############################################################################################################
+########################## Model: Tokenization & Input Formatting ###################################################################
+###########################################################################################################
+
+
+# Load the BERT tokenizer.
+print('Loading BERT tokenizer...')
+tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
+
+
+ # Tokenize all of the sentences and map the tokens to thier word IDs.
+input_ids = []
+
+# For every sentence...
+for sent in sentences:
+    # `encode` will:
+    #   (1) Tokenize the sentence.
+    #   (2) Prepend the `[CLS]` token to the start.
+    #   (3) Append the `[SEP]` token to the end.
+    #   (4) Map tokens to their IDs.
+    encoded_sent = tokenizer.encode(
+                        sent,                      # Sentence to encode.
+                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
+
+                        # This function also supports truncation and conversion
+                        # to pytorch tensors, but I need to do padding, so I
+                        # can't use these features.
+                        #max_length = 128,          # Truncate all sentences.
+                        #return_tensors = 'pt',     # Return pytorch tensors.
+                   )
+
+    # Add the encoded sentence to the list.
+    input_ids.append(encoded_sent)
+
+
+
+
+padded = []
+for i in input_ids:
+
+  if len(i) > max_len:
+    padded.extend([i[:max_len]])
+  else:
+    padded.extend([i + [0] * (max_len - len(i))])
+
+
+padded = input_ids = np.array(padded)
+
+
+
+ # Create attention masks
+attention_masks = []
+
+# For each sentence...
+for sent in padded:
+
+    # Create the attention mask.
+    #   - If a token ID is 0, then it's padding, set the mask to 0.
+    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+    att_mask = [int(token_id > 0) for token_id in sent]
+
+    # Store the attention mask for this sentence.
+    attention_masks.append(att_mask)
+
+
+# Use 90% for training and 10% for validation.
+train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
+                                                            random_state=2018, test_size=0.1, stratify = labels )
+# Do the same for the masks.
+train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
+                                             random_state=2018, test_size=0.1, stratify = labels)
+
+
+# Convert all inputs and labels into torch tensors, the required datatype
+# for my model.
+train_inputs = torch.tensor(train_inputs)
+validation_inputs = torch.tensor(validation_inputs)
+
+train_labels = torch.tensor(train_labels)
+validation_labels = torch.tensor(validation_labels)
+
+train_masks = torch.tensor(train_masks)
+validation_masks = torch.tensor(validation_masks)
+
+
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+
+# The DataLoader needs to know the batch size for training, so I specify it here.
+# For fine-tuning BERT on a specific task, the authors recommend a batch size of
+# 16 or 32.
+
+batch_size = int(config.get('model','batch_size'))
+
+# Create the DataLoader for training set.
+train_data = TensorDataset(train_inputs, train_masks, train_labels)
+train_sampler = RandomSampler(train_data)
+train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+
+# Create the DataLoader for validation set.
+validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
+validation_sampler = SequentialSampler(validation_data)
+validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
+
+
+
+
+############################################################################################################
+########################## Model: Training ###################################################################
+###########################################################################################################
+
+
+print(' Selecting a model .....')
+
+
+
+# Load BertForSequenceClassification, the pretrained BERT model with a single
+# linear classification layer on top.
+
+model = BertForSequenceClassification.from_pretrained(
+    chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
+    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+                    # You can increase this for multi-class tasks.
+    output_attentions = False, # Whether the model returns attentions weights.
+    output_hidden_states = False, # Whether the model returns all hidden-states.
+)
+
+# Tell pytorch to run this model on the GPU.
+model.cuda()
+
+
+#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
+# I believe the 'W' stands for 'Weight Decay fix"
+optimizer = AdamW(model.parameters(),
+                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
+                )
+
+
+
+# Number of training epochs (authors recommend between 2 and 4)
+epochs = int(config.get('model','epochs'))
+
+# Total number of training steps is number of batches * number of epochs.
+total_steps = len(train_dataloader) * epochs
+
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer,
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
+
+
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+
+
+
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+
+
+
+
+# This training code is based on the `run_glue.py` script here:
+# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+
+# Set the seed value all over the place to make this reproducible.
+seed_val = 42
+
+random.seed(seed_val)
+np.random.seed(seed_val)
+torch.manual_seed(seed_val)
+torch.cuda.manual_seed_all(seed_val)
+
+# Store the average loss after each epoch so I can plot them.
+loss_values = []
+
+# For each epoch...
+for epoch_i in range(0, epochs):
+
+    # ========================================
+    #               Training
+    # ========================================
+
+    # Perform one full pass over the training set.
+
+    print("")
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    print('Training...')
+
+    # Measure how long the training epoch takes.
+    t0 = time.time()
+
+    # Reset the total loss for this epoch.
+    total_loss = 0
+
+    # Put the model into training mode.
+    model.train()
+
+    # For each batch of training data...
+    for step, batch in enumerate(train_dataloader):
+
+        # Progress update every 40 batches.
+        if step % 40 == 0 and not step == 0:
+            # Calculate elapsed time in minutes.
+            elapsed = format_time(time.time() - t0)
+
+            # Report progress.
+            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+        # Unpack this training batch from the dataloader.
+        #
+        # As I unpack the batch, I'll also copy each tensor to the GPU using the
+        # `to` method.
+        #
+        # `batch` contains three pytorch tensors:
+        #   [0]: input ids
+        #   [1]: attention masks
+        #   [2]: labels
+        b_input_ids = batch[0].to(device)
+        b_input_mask = batch[1].to(device)
+        b_labels = batch[2].to(device)
+
+        # Always clear any previously calculated gradients before performing a
+        # backward pass. PyTorch doesn't do this automatically because
+        # accumulating the gradients is "convenient while training RNNs".
+        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+        model.zero_grad()
+
+        # Perform a forward pass (evaluate the model on this training batch).
+        # This will return the loss (rather than the model output) because I
+        # have provided the `labels`.
+        # The documentation for this `model` function is here:
+        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+        outputs = model(b_input_ids,
+                    token_type_ids=None,
+                    attention_mask=b_input_mask,
+                    labels=b_labels)
+
+        # The call to `model` always returns a tuple, so I need to pull the
+        # loss value out of the tuple.
+        loss = outputs[0]
+
+        # Accumulate the training loss over all of the batches so that I can
+        # calculate the average loss at the end. `loss` is a Tensor containing a
+        # single value; the `.item()` function just returns the Python value
+        # from the tensor.
+        total_loss += loss.item()
+
+        # Perform a backward pass to calculate the gradients.
+        loss.backward()
+
+        # Clip the norm of the gradients to 1.0.
+        # This is to help prevent the "exploding gradients" problem.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+        # Update parameters and take a step using the computed gradient.
+        # The optimizer dictates the "update rule"--how the parameters are
+        # modified based on their gradients, the learning rate, etc.
+        optimizer.step()
+
+        # Update the learning rate.
+        scheduler.step()
+
+    # Calculate the average loss over the training data.
+    avg_train_loss = total_loss / len(train_dataloader)
+
+    # Store the loss value for plotting the learning curve.
+    loss_values.append(avg_train_loss)
+
+    print("")
+    print("  Average training loss: {0:.2f}".format(avg_train_loss))
+    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
+
+    # ========================================
+    #               Validation
+    # ========================================
+    # After the completion of each training epoch, measure the performance on
+    # the validation set.
+
+    print("")
+    print("Running Validation...")
+
+    t0 = time.time()
+
+    # Put the model in evaluation mode--the dropout layers behave differently
+    # during evaluation.
+    model.eval()
+
+    # Tracking variables
+    eval_loss, eval_accuracy = 0, 0
+    nb_eval_steps, nb_eval_examples = 0, 0
+
+    # Evaluate data for one epoch
+    for batch in validation_dataloader:
+
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up validation
+        with torch.no_grad():
+
+            # Forward pass, calculate logit predictions.
+            # This will return the logits rather than the loss because we have
+            # not provided labels.
+            # token_type_ids is the same as the "segment ids", which
+            # differentiates sentence 1 and 2 in 2-sentence tasks.
+            # The documentation for this `model` function is here:
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            outputs = model(b_input_ids,
+                            token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        # Get the "logits" output by the model. The "logits" are the output
+        # values prior to applying an activation function like the softmax.
+        logits = outputs[0]
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Calculate the accuracy for this batch of test sentences.
+        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
+
+        # Accumulate the total accuracy.
+        eval_accuracy += tmp_eval_accuracy
+
+        # Track the number of batches
+        nb_eval_steps += 1
+
+    # Report the final accuracy for this validation run.
+    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+    print("  Validation took: {:}".format(format_time(time.time() - t0)))
+
+print("")
+print("Training complete!")
-- 
GitLab


From a5158391ec1bd55d595b03e34a358cba6253bb56 Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Fri, 17 Sep 2021 00:19:40 +0200
Subject: [PATCH 5/6] [ADD] train bert finetuning & predict & evaluate

---
 evaluate_bertFineTuning.py |  54 ++++
 main.py                    | 120 ++++++++
 predict_bertFineTuning.py  | 168 +++++++++++
 training_bertFineTuning.py | 601 +++++++++++++++++--------------------
 4 files changed, 610 insertions(+), 333 deletions(-)
 create mode 100644 evaluate_bertFineTuning.py
 create mode 100644 main.py
 create mode 100644 predict_bertFineTuning.py

diff --git a/evaluate_bertFineTuning.py b/evaluate_bertFineTuning.py
new file mode 100644
index 0000000..3c9b52b
--- /dev/null
+++ b/evaluate_bertFineTuning.py
@@ -0,0 +1,54 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import plot_confusion_matrix
+
+
+
+
+
+
+
+
+def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
+    """Build per-class and overall classification metrics.
+
+    pred_labels_ / true_labels_: flat sequences of encoded class ids.
+    encoder: fitted LabelEncoder; supplies the class ids and their names.
+
+    Returns (dff, accuracy, weighted_avg): a DataFrame with one row per class
+    (precision, recall, f1-score, support, FP, FN, TP, TN), the overall
+    accuracy and the 'weighted avg' entry of the classification report.
+    """
+    classesName = encoder.classes_
+    class_ids = encoder.transform(classesName)
+
+    # sklearn expects (y_true, y_pred): the reversed order swapped precision
+    # with recall and counted support on the predictions.  `labels` keeps one
+    # entry per encoder class even when a class is absent from this sample.
+    report = classification_report(true_labels_, pred_labels_,
+                                   labels=class_ids, output_dict=True)
+    weighted_avg = report['weighted avg']
+    # Rows are true classes, columns are predicted classes.
+    cnf_matrix = confusion_matrix(true_labels_, pred_labels_, labels=class_ids)
+    TP = np.diag(cnf_matrix)
+    FP = cnf_matrix.sum(axis=0) - TP
+    FN = cnf_matrix.sum(axis=1) - TP
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+    # Correct / total; with `labels` given the report may expose 'micro avg'
+    # instead of an 'accuracy' key, so it is derived from the matrix.
+    accuracy = float(TP.sum()) / float(cnf_matrix.sum())
+
+    per_class = [report[str(c)] for c in class_ids]
+    dff = pd.DataFrame({
+        'className': classesName,
+        'precision': [m['precision'] for m in per_class],
+        'recall': [m['recall'] for m in per_class],
+        'f1-score': [m['f1-score'] for m in per_class],
+        'support': [m['support'] for m in per_class],
+        'FP': FP, 'FN': FN, 'TP': TP, 'TN': TN,
+    })
+    return dff, accuracy, weighted_avg
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..8301acc
--- /dev/null
+++ b/main.py
@@ -0,0 +1,120 @@
+import pandas as pd
+import numpy as np
+import configparser
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+
+from training_bertFineTuning import training_bertFineTuning
+from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
+from evaluate_bertFineTuning import evaluate_bertFineTuning
+
+
+
+
+
+
+def create_dict(df, classColumnName):
+    """Map each class label to its number of rows in `df`."""
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+    """Drop every row whose class has fewer than `threshold` instances.
+
+    A plain boolean filter is used: the former concat + drop_duplicates
+    trick also deleted genuine duplicate rows of the classes being kept.
+    """
+    counts = create_dict(df, classColumnName)
+    return df[df[classColumnName].map(counts) >= threshold]
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+    """Keep a random sample of at most numberOfInstances rows per class."""
+    def _sample(group):
+        # Drawn without replacement, so a small class is kept whole.
+        size = min(len(group), numberOfInstances)
+        return group.loc[np.random.choice(group.index, size, False), :]
+    return df.groupby(classColumnName, as_index=False).apply(_sample)
+
+
+
+def main():
+    """Fine-tune on the train split, save the model, then score the test split.
+
+    Every setting (data path, column names, model name, hyper-parameters,
+    output location) is read from 'bert_settings.conf'.
+    """
+    # Needed by torch.save / os.path.join below; main.py imports neither.
+    import os
+    import torch
+
+    config = configparser.ConfigParser()
+    config.read('bert_settings.conf')
+
+    dataPath = config.get('general','dataPath')
+    columnText = config.get('general','columnText')
+    columnClass = config.get('general','columnClass')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    chosen_model = config.get('model','model')
+    max_len = int(config.get('model','max_len_sequences'))
+    batch_size = int(config.get('model','batch_size'))
+    epochs = int(config.get('model','epochs'))
+
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    df = df[df[columnClass] != 'unclassified']
+
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(df[columnClass])
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+    #call train method
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+
+    #save the model
+    model_save_name = config.get('model','modelName')
+    path = config.get('model','path')
+    torch.save(model, os.path.join(path,model_save_name))
+
+    #print the model parameters
+    params = list(model.named_parameters())
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    # Each section header is printed once, ahead of its own parameters.
+    print('==== Embedding Layer ====\n')
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    #call predict method on the held-out split, paired with its own labels
+    sentences_to_predict = test_x[columnText].values
+    test_labels = test_y.tolist()
+    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_labels, max_len, batch_size = 32)
+    # predict_class_bertFineTuning is declared as (model, dataloader).
+    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
+
+    #call Evaluate
+    result_df, accuracy , weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
+    print(result_df)
+    print(accuracy)
+    print(weighted_avg)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/predict_bertFineTuning.py b/predict_bertFineTuning.py
new file mode 100644
index 0000000..4276122
--- /dev/null
+++ b/predict_bertFineTuning.py
@@ -0,0 +1,168 @@
+import torch
+
+import pandas as pd
+
+import numpy as np
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertTokenizer, CamembertTokenizer
+
+def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size = 32):
+    """Tokenize, pad and batch labelled sentences for prediction.
+
+    chosen_model: 'bert-base-multilingual-cased' or 'camembert-base'.
+    sentences_to_predict: iterable of raw sentences.
+    labels: encoded class ids, one per sentence.
+    max_len: length every token sequence is truncated or padded to.
+    batch_size: number of sequences per batch.
+
+    Returns a DataLoader that yields (input_ids, attention_mask, labels)
+    batches in input order.  Raises ValueError for any other chosen_model.
+    """
+
+    # NOTE(review): do_lower_case=True is requested even for the cased BERT
+    # checkpoint; kept as-is so inputs match whatever training used - confirm.
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
+    else:
+        # Fail here instead of on an unbound `tokenizer` a few lines below.
+        raise ValueError('Unsupported model: {}'.format(chosen_model))
+
+    # Tokenize each sentence, add the special start/end tokens and map the
+    # tokens to their ids.
+    input_ids_test = [
+        tokenizer.encode(sent, add_special_tokens = True)
+        for sent in sentences_to_predict
+    ]
+
+    # Truncate or right-pad every sequence to exactly max_len ids
+    # ([0] * n is empty when n <= 0, so long sequences are only cut).
+    # NOTE(review): 0 is assumed to be the padding id for both tokenizers;
+    # check tokenizer.pad_token_id for CamemBERT.
+    padded_test = [
+        ids[:max_len] + [0] * (max_len - len(ids))
+        for ids in input_ids_test
+    ]
+    input_ids_test = np.array(padded_test)
+
+    # Attention mask: 1.0 for every non-zero token id, 0.0 for padding.
+    attention_masks = [[float(i > 0) for i in seq] for seq in input_ids_test]
+
+    # Convert to tensors.
+    prediction_inputs = torch.tensor(input_ids_test)
+    prediction_masks = torch.tensor(attention_masks)
+    prediction_labels = torch.tensor(labels)
+
+    # Create the DataLoader.  The batch_size argument is honoured; it used to
+    # be overwritten by a hard-coded 32 at this point.
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+    return prediction_dataloader
+
+
+
+def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
+    """Run `model` over a prediction DataLoader.
+
+    model: fine-tuned sequence-classification model.
+    sentences_to_predict_dataloader: yields (input_ids, attention_mask,
+        labels) batches, e.g. from generate_prediction_dataloader.
+
+    Returns (pred_labels_, true_labels_): two flat lists holding, for every
+    sentence, the predicted class id and the label found in the batch.
+    """
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+
+    # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
+
+    # The batches are moved to `device`, so the model must live there too.
+    model.to(device)
+
+    # Put model in evaluation mode
+    model.eval()
+
+    # Tracking variables
+    predictions_test , true_labels = [], []
+
+    # Predict
+    for batch in sentences_to_predict_dataloader:
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from the dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        logits = outputs[0]
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Store predictions and true labels
+        predictions_test.append(logits)
+        true_labels.append(label_ids)
+
+    print('    DONE.')
+
+    # Only once every batch has been processed: take the highest-scoring
+    # class of each row and flatten the per-batch arrays into plain lists.
+    pred_labels_ = [item for logits in predictions_test
+                    for item in np.argmax(logits, axis=1).flatten()]
+    true_labels_ = [item for sublist in true_labels for item in sublist]
+    return pred_labels_, true_labels_
+
+
+def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
+    """Predict a class id for each unlabeled sentence, one sentence at a time.
+
+    Sentences are encoded as-is (no truncation, no padding).  Returns the
+    predicted class ids in input order; raises ValueError when
+    `chosen_model` has no known tokenizer.
+    """
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
+    else:
+        raise ValueError('Unsupported model: {}'.format(chosen_model))
+
+    device = next(model.parameters()).device  # inputs must sit with the model
+    model.eval()
+    predicted = []
+    for sent in sentences_to_predict:
+        # `encode` tokenizes, adds the special tokens and maps tokens to ids.
+        encoded_sent = tokenizer.encode(sent, add_special_tokens = True)
+        b_input_ids = torch.tensor([encoded_sent]).to(device)
+        # Same mask convention as the rest of this module: id 0 is padding.
+        b_input_mask = (b_input_ids > 0).float()
+        with torch.no_grad():
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+        predicted.append(int(outputs[0].argmax(dim=1).item()))
+    return predicted
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
index 72a5929..285be2d 100644
--- a/training_bertFineTuning.py
+++ b/training_bertFineTuning.py
@@ -2,464 +2,399 @@ import torch
 import pandas as pd
 import numpy as np
 from sklearn import preprocessing
-from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from transformers import BertTokenizer, CamembertTokenizer
 from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
 from transformers import get_linear_schedule_with_warmup
 import time
 import datetime
 import random
+import os
 
 
 
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
 
 
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-
-def remove_weak_classes(df, classColumnName, threshold):
-
-    dictOfClassInstances = create_dict(df,classColumnName)
-
-
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-
-
-def resample_classes(df, classColumnName, numberOfInstances):
-
-    #random numberOfInstances elements
-    replace = False  # with replacement
-
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-
-##############################################################################################################
-########################## Setup GPU #########################################################################
-##############################################################################################################
-
-# If there's a GPU available...
-if torch.cuda.is_available():
 
-    # Tell PyTorch to use the GPU.
-    device = torch.device("cuda")
 
-    print('There are %d GPU(s) available.' % torch.cuda.device_count())
 
-    print('We will use the GPU:', torch.cuda.get_device_name(0))
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
 
-# If not...
-else:
-    print('No GPU available, using the CPU instead.')
-    device = torch.device("cpu")
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
+def training_bertFineTuning(chosen_model,  sentences, labels, max_len,  batch_size, epochs = 4):
 
+    # If there's a GPU available...
+    if torch.cuda.is_available():
 
-#############################################################################################################
-########################## parameters ###################################################################
-###########################################################################################################
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
 
-config = configparser.ConfigParser()
-config.read('settings.conf')
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
 
-dataPath = config.get('general','dataPath')
-columnText = config.get('general','columnText')
-columnClass = config.get('general','columnClass')
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
 
-minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
-maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+        # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
 
-chosen_tokeniser = config.get('model','tokeniser')
-chosen_model = config.get('model','model')
 
-max_len = int(config.get('model','max_len_sequences'))
 
 
-#############################################################################################################
-########################## Load Data ###################################################################
+############################################################################################################
+########################## Model: Tokenization & Input Formatting ###################################################################
 ###########################################################################################################
 
 
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
 
 
-df = pd.read_csv(dataPath)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-df = resample_classes(df, columnClass, maxOfInstancePerClass)
-df = df[df[columnClass] != 'unclassified']
-
 
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids = []
 
+    # For every sentence...
+    for sent in sentences:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+                            sent,                      # Sentence to encode.
+                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
 
+                            # This function also supports truncation and conversion
+                            # to pytorch tensors, but I need to do padding, so I
+                            # can't use these features.
+                            #max_length = 128,          # Truncate all sentences.
+                            #return_tensors = 'pt',     # Return pytorch tensors.
+                            )
 
-y  = df[columnClass]
-numberOfClasses = y.nunique()
-encoder = preprocessing.LabelEncoder()
-y = encoder.fit_transform(y)
+        # Add the encoded sentence to the list.
+        input_ids.append(encoded_sent)
 
 
 
-sentences = train_x[columnText].values
-labels = train_y.tolist()
 
+    padded = []
+    for i in input_ids:
 
+        if len(i) > max_len:
+            padded.extend([i[:max_len]])
+        else:
+            padded.extend([i + [0] * (max_len - len(i))])
 
-############################################################################################################
-########################## Model: Tokenization & Input Formatting ###################################################################
-###########################################################################################################
-
-
-# Load the BERT tokenizer.
-print('Loading BERT tokenizer...')
-tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
 
+    padded  = np.array(padded)
 
- # Tokenize all of the sentences and map the tokens to thier word IDs.
-input_ids = []
 
-# For every sentence...
-for sent in sentences:
-    # `encode` will:
-    #   (1) Tokenize the sentence.
-    #   (2) Prepend the `[CLS]` token to the start.
-    #   (3) Append the `[SEP]` token to the end.
-    #   (4) Map tokens to their IDs.
-    encoded_sent = tokenizer.encode(
-                        sent,                      # Sentence to encode.
-                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
 
-                        # This function also supports truncation and conversion
-                        # to pytorch tensors, but I need to do padding, so I
-                        # can't use these features.
-                        #max_length = 128,          # Truncate all sentences.
-                        #return_tensors = 'pt',     # Return pytorch tensors.
-                   )
+    # Create attention masks
+    attention_masks = []
 
-    # Add the encoded sentence to the list.
-    input_ids.append(encoded_sent)
+    # For each sentence...
+    for sent in padded:
 
+        # Create the attention mask.
+        #   - If a token ID is 0, then it's padding, set the mask to 0.
+        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+        att_mask = [int(token_id > 0) for token_id in sent]
 
+        # Store the attention mask for this sentence.
+        attention_masks.append(att_mask)
 
 
-padded = []
-for i in input_ids:
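+    # Use 90% for training and 10% for validation. NOTE(review): this patch removes the train_test_split import from the file header -- confirm it is still in scope.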
+    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels )
+    # Do the same for the masks.
+    train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels)
 
-  if len(i) > max_len:
-    padded.extend([i[:max_len]])
-  else:
-    padded.extend([i + [0] * (max_len - len(i))])
 
+    # Convert all inputs and labels into torch tensors, the required datatype
+    # for my model.
+    train_inputs = torch.tensor(train_inputs)
+    validation_inputs = torch.tensor(validation_inputs)
 
-padded = input_ids = np.array(padded)
+    train_labels = torch.tensor(train_labels)
+    validation_labels = torch.tensor(validation_labels)
 
+    train_masks = torch.tensor(train_masks)
+    validation_masks = torch.tensor(validation_masks)
 
 
- # Create attention masks
-attention_masks = []
 
-# For each sentence...
-for sent in padded:
 
-    # Create the attention mask.
-    #   - If a token ID is 0, then it's padding, set the mask to 0.
-    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
-    att_mask = [int(token_id > 0) for token_id in sent]
+    # The DataLoader needs to know the batch size for training; it is taken from the
+    # `batch_size` argument. For fine-tuning BERT on a specific task, the authors
+    # recommend a batch size of 16 or 32.
 
-    # Store the attention mask for this sentence.
-    attention_masks.append(att_mask)
 
+    # Create the DataLoader for training set.
+    train_data = TensorDataset(train_inputs, train_masks, train_labels)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 
-# Use 90% for training and 10% for validation.
-train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
-                                                            random_state=2018, test_size=0.1, stratify = labels )
-# Do the same for the masks.
-train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
-                                             random_state=2018, test_size=0.1, stratify = labels)
+    # Create the DataLoader for validation set.
+    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
+    validation_sampler = SequentialSampler(validation_data)
+    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
 
 
-# Convert all inputs and labels into torch tensors, the required datatype
-# for my model.
-train_inputs = torch.tensor(train_inputs)
-validation_inputs = torch.tensor(validation_inputs)
 
-train_labels = torch.tensor(train_labels)
-validation_labels = torch.tensor(validation_labels)
 
-train_masks = torch.tensor(train_masks)
-validation_masks = torch.tensor(validation_masks)
 
+    print(' Selecting a model .....')
 
+    numberOfClasses = len(set(labels))
 
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-# The DataLoader needs to know the batch size for training, so I specify it here.
-# For fine-tuning BERT on a specific task, the authors recommend a batch size of
-# 16 or 32.
+    # Load BertForSequenceClassification, the pretrained BERT model with a single
+    # linear classification layer on top.
+    if chosen_model == 'bert-base-multilingual-cased':
+        model = BertForSequenceClassification.from_pretrained(
+            chosen_model, # Pretrained checkpoint to load ('bert-base-multilingual-cased' in this branch).
+            num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+            # You can increase this for multi-class tasks.
+            output_attentions = False, # Whether the model returns attentions weights.
+            output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
+    elif chosen_model == 'camembert-base':
 
-batch_size = int(config.get('model','batch_size'))
+        model = CamembertForSequenceClassification.from_pretrained(
+            chosen_model, # Pretrained checkpoint to load ('camembert-base' in this branch).
+            num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+            # You can increase this for multi-class tasks.
+            output_attentions = False, # Whether the model returns attentions weights.
+            output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
 
-# Create the DataLoader for training set.
-train_data = TensorDataset(train_inputs, train_masks, train_labels)
-train_sampler = RandomSampler(train_data)
-train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 
-# Create the DataLoader for validation set.
-validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
-validation_sampler = SequentialSampler(validation_data)
-validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
+    # Tell pytorch to run this model on the GPU. NOTE(review): this ignores the CPU fallback assigned to `device` above -- confirm.
+    model.cuda()
 
 
+    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
+    # I believe the 'W' stands for 'Weight Decay fix'.
+    optimizer = AdamW(model.parameters(),
+                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
+                    )
 
 
-############################################################################################################
-########################## Model: Training ###################################################################
-###########################################################################################################
 
 
-print(' Selecting a model .....')
+    # Total number of training steps is number of batches * number of epochs.
+    total_steps = len(train_dataloader) * epochs
 
+    # Create the learning rate scheduler.
+    scheduler = get_linear_schedule_with_warmup(optimizer,
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
 
 
-# Load BertForSequenceClassification, the pretrained BERT model with a single
-# linear classification layer on top.
 
-model = BertForSequenceClassification.from_pretrained(
-    chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
-    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
-                    # You can increase this for multi-class tasks.
-    output_attentions = False, # Whether the model returns attentions weights.
-    output_hidden_states = False, # Whether the model returns all hidden-states.
-)
 
-# Tell pytorch to run this model on the GPU.
-model.cuda()
+    # This training code is based on the `run_glue.py` script here:
+    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
 
 
-#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
-# I believe the 'W' stands for 'Weight Decay fix"
-optimizer = AdamW(model.parameters(),
-                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
-                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
-                )
+    # Set the seed value all over the place to make this reproducible.
+    seed_val = 42
 
+    random.seed(seed_val)
+    np.random.seed(seed_val)
+    torch.manual_seed(seed_val)
+    torch.cuda.manual_seed_all(seed_val)
 
+    # Store the average loss after each epoch so I can plot them.
+    loss_values = []
 
-# Number of training epochs (authors recommend between 2 and 4)
-epochs = int(config.get('model','epochs'))
+    # For each epoch...
+    for epoch_i in range(0, epochs):
 
-# Total number of training steps is number of batches * number of epochs.
-total_steps = len(train_dataloader) * epochs
+        # ========================================
+        #               Training
+        # ========================================
 
-# Create the learning rate scheduler.
-scheduler = get_linear_schedule_with_warmup(optimizer,
-                                            num_warmup_steps = 0, # Default value in run_glue.py
-                                            num_training_steps = total_steps)
+        # Perform one full pass over the training set.
 
+        print("")
+        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+        print('Training...')
 
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+        # Measure how long the training epoch takes.
+        t0 = time.time()
 
+        # Reset the total loss for this epoch.
+        total_loss = 0
 
+        # Put the model into training mode.
+        model.train()
 
+        # For each batch of training data...
+        for step, batch in enumerate(train_dataloader):
 
+            # Progress update every 40 batches.
+            if step % 40 == 0 and not step == 0:
+                # Format the elapsed time as hh:mm:ss.
+                elapsed = format_time(time.time() - t0)
 
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round((elapsed)))
+                # Report progress.
+                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
 
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
+            # Unpack this training batch from the dataloader.
+            #
+            # As I unpack the batch, I'll also copy each tensor to the GPU using the
+            # `to` method.
+            #
+            # `batch` contains three pytorch tensors:
+            #   [0]: input ids
+            #   [1]: attention masks
+            #   [2]: labels
+            b_input_ids = batch[0].to(device)
+            b_input_mask = batch[1].to(device)
+            b_labels = batch[2].to(device)
 
+            # Always clear any previously calculated gradients before performing a
+            # backward pass. PyTorch doesn't do this automatically because
+            # accumulating the gradients is "convenient while training RNNs".
+            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+            model.zero_grad()
 
+            # Perform a forward pass (evaluate the model on this training batch).
+            # This will return the loss (rather than the model output) because I
+            # have provided the `labels`.
+            # The documentation for this `model` function is here:
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            outputs = model(b_input_ids,
+                        token_type_ids=None,
+                        attention_mask=b_input_mask,
+                        labels=b_labels)
 
+            # The call to `model` always returns a tuple, so I need to pull the
+            # loss value out of the tuple.
+            loss = outputs[0]
 
-# This training code is based on the `run_glue.py` script here:
-# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+            # Accumulate the training loss over all of the batches so that I can
+            # calculate the average loss at the end. `loss` is a Tensor containing a
+            # single value; the `.item()` function just returns the Python value
+            # from the tensor.
+            total_loss += loss.item()
 
+            #  Perform a backward pass to calculate the gradients.
+            loss.backward()
 
-# Set the seed value all over the place to make this reproducible.
-seed_val = 42
+            # Clip the norm of the gradients to 1.0.
+            # This is to help prevent the "exploding gradients" problem.
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
-random.seed(seed_val)
-np.random.seed(seed_val)
-torch.manual_seed(seed_val)
-torch.cuda.manual_seed_all(seed_val)
+            # Update parameters and take a step using the computed gradient.
+            # The optimizer dictates the "update rule"--how the parameters are
+            # modified based on their gradients, the learning rate, etc.
+            optimizer.step()
 
-# Store the average loss after each epoch so I can plot them.
-loss_values = []
+            # Update the learning rate.
+            scheduler.step()
 
-# For each epoch...
-for epoch_i in range(0, epochs):
+        # Calculate the average loss over the training data.
+        avg_train_loss = total_loss / len(train_dataloader)
 
-    # ========================================
-    #               Training
-    # ========================================
+        # Store the loss value for plotting the learning curve.
+        loss_values.append(avg_train_loss)
 
-    # Perform one full pass over the training set.
+        print("")
+        print("  Average training loss: {0:.2f}".format(avg_train_loss))
+        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
 
-    print("")
-    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
-    print('Training...')
-
-    # Measure how long the training epoch takes.
-    t0 = time.time()
-
-    # Reset the total loss for this epoch.
-    total_loss = 0
-
-    # Put the model into training mode.
-    model.train()
-
-    # For each batch of training data...
-    for step, batch in enumerate(train_dataloader):
-
-        # Progress update every 40 batches.
-        if step % 40 == 0 and not step == 0:
-            # Calculate elapsed time in minutes.
-            elapsed = format_time(time.time() - t0)
-
-            # Report progress.
-            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
-
-        # Unpack this training batch from the dataloader.
-        #
-        # As I unpack the batch, I'll also copy each tensor to the GPU using the
-        # `to` method.
-        #
-        # `batch` contains three pytorch tensors:
-        #   [0]: input ids
-        #   [1]: attention masks
-        #   [2]: labels
-        b_input_ids = batch[0].to(device)
-        b_input_mask = batch[1].to(device)
-        b_labels = batch[2].to(device)
-
-        # Always clear any previously calculated gradients before performing a
-        # backward pass. PyTorch doesn't do this automatically because
-        # accumulating the gradients is "convenient while training RNNs".
-        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
-        model.zero_grad()
-
-        # Perform a forward pass (evaluate the model on this training batch).
-        # This will return the loss (rather than the model output) because I
-        # have provided the `labels`.
-        # The documentation for this `model` function is here:
-        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
-        outputs = model(b_input_ids,
-                    token_type_ids=None,
-                    attention_mask=b_input_mask,
-                    labels=b_labels)
-
-        # The call to `model` always returns a tuple, so I need to pull the
-        # loss value out of the tuple.
-        loss = outputs[0]
-
-        # Accumulate the training loss over all of the batches so that I can
-        # calculate the average loss at the end. `loss` is a Tensor containing a
-        # single value; the `.item()` function just returns the Python value
-        # from the tensor.
-        total_loss += loss.item()
-
-        # Perform a backward pass to calculate the gradients.
-        loss.backward()
-
-        # Clip the norm of the gradients to 1.0.
-        # This is to help prevent the "exploding gradients" problem.
-        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-
-        # Update parameters and take a step using the computed gradient.
-        # The optimizer dictates the "update rule"--how the parameters are
-        # modified based on their gradients, the learning rate, etc.
-        optimizer.step()
-
-        # Update the learning rate.
-        scheduler.step()
-
-    # Calculate the average loss over the training data.
-    avg_train_loss = total_loss / len(train_dataloader)
-
-    # Store the loss value for plotting the learning curve.
-    loss_values.append(avg_train_loss)
+        # ========================================
+        #               Validation
+        # ========================================
+        # After the completion of each training epoch, measure the performance on
+        # the validation set.
 
-    print("")
-    print("  Average training loss: {0:.2f}".format(avg_train_loss))
-    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
+        print("")
+        print("Running Validation...")
 
-    # ========================================
-    #               Validation
-    # ========================================
-    # After the completion of each training epoch, measure the performance on
-    # the validation set.
+        t0 = time.time()
 
-    print("")
-    print("Running Validation...")
+        # Put the model in evaluation mode--the dropout layers behave differently
+        # during evaluation.
+        model.eval()
 
-    t0 = time.time()
+        # Tracking variables
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
 
-    # Put the model in evaluation mode--the dropout layers behave differently
-    # during evaluation.
-    model.eval()
+        # Evaluate data for one epoch
+        for batch in validation_dataloader:
 
-    # Tracking variables
-    eval_loss, eval_accuracy = 0, 0
-    nb_eval_steps, nb_eval_examples = 0, 0
+            # Add batch to GPU
+            batch = tuple(t.to(device) for t in batch)
 
-    # Evaluate data for one epoch
-    for batch in validation_dataloader:
+            # Unpack the inputs from dataloader
+            b_input_ids, b_input_mask, b_labels = batch
 
-        # Add batch to GPU
-        batch = tuple(t.to(device) for t in batch)
+            # Telling the model not to compute or store gradients, saving memory and
+            # speeding up validation
+            with torch.no_grad():
 
-        # Unpack the inputs from dataloader
-        b_input_ids, b_input_mask, b_labels = batch
+                # Forward pass, calculate logit predictions.
+                # This will return the logits rather than the loss because we have
+                # not provided labels.
+                # token_type_ids is the same as the "segment ids", which
+                # differentiates sentence 1 and 2 in 2-sentence tasks.
+                # The documentation for this `model` function is here:
+                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                outputs = model(b_input_ids,
+                                token_type_ids=None,
+                                attention_mask=b_input_mask)
 
-        # Telling the model not to compute or store gradients, saving memory and
-        # speeding up validation
-        with torch.no_grad():
+            # Get the "logits" output by the model. The "logits" are the output
+            # values prior to applying an activation function like the softmax.
+            logits = outputs[0]
 
-            # Forward pass, calculate logit predictions.
-            # This will return the logits rather than the loss because we have
-            # not provided labels.
-            # token_type_ids is the same as the "segment ids", which
-            # differentiates sentence 1 and 2 in 2-sentence tasks.
-            # The documentation for this `model` function is here:
-            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
-            outputs = model(b_input_ids,
-                            token_type_ids=None,
-                            attention_mask=b_input_mask)
+            # Move logits and labels to CPU
+            logits = logits.detach().cpu().numpy()
+            label_ids = b_labels.to('cpu').numpy()
 
-        # Get the "logits" output by the model. The "logits" are the output
-        # values prior to applying an activation function like the softmax.
-        logits = outputs[0]
+            # Calculate the accuracy for this batch of test sentences.
+            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
 
-        # Move logits and labels to CPU
-        logits = logits.detach().cpu().numpy()
-        label_ids = b_labels.to('cpu').numpy()
+            # Accumulate the total accuracy.
+            eval_accuracy += tmp_eval_accuracy
 
-        # Calculate the accuracy for this batch of test sentences.
-        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
+            # Track the number of batches
+            nb_eval_steps += 1
 
-        # Accumulate the total accuracy.
-        eval_accuracy += tmp_eval_accuracy
+        # Report the final accuracy for this validation run.
+        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+        print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
-        # Track the number of batches
-        nb_eval_steps += 1
+    print("")
+    print("Training complete!")
+    return model
 
-    # Report the final accuracy for this validation run.
-    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
-    print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
-print("")
-print("Training complete!")
+'''print('Saving Model....')
+model_save_name = config.get('model','modelName')
+path = config.get('model','path')
+#torch.save(model.state_dict(), os.path.join(path,model_save_name))
+torch.save(model, os.path.join(path,model_save_name))'''
-- 
GitLab


From 66234a841ff8cc07647669b2c710ef40800f461f Mon Sep 17 00:00:00 2001
From: Khalleud <ledk14@gmail.com>
Date: Fri, 17 Sep 2021 08:34:24 +0200
Subject: [PATCH 6/6] [ADD] bert config file

---
 bert_settings.conf | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 bert_settings.conf

diff --git a/bert_settings.conf b/bert_settings.conf
new file mode 100644
index 0000000..ccba612
--- /dev/null
+++ b/bert_settings.conf
@@ -0,0 +1,19 @@
+[general]
+dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv
+columnText = contentWithoutClass
+columnClass = ensemble_domaine_enccre
+minOfInstancePerClass = 200
+maxOfInstancePerClass = 1500
+
+
+[model]
+
+tokeniser = bert-base-multilingual-cased
+#tokeniser = camembert-base
+model =  bert-base-multilingual-cased
+#model = camembert-base
+max_len_sequences = 256
+batch_size = 32
+epochs = 4
+pathModel = ' '
+modelName = ' '
-- 
GitLab