[ADD] Notebook Bert Fine Tuning

21df7dd3 · Khalleud · 72178236 · 21df7dd3
Commit 21df7dd3 authored 3 years ago by Khalleud
--- a/BertFineTuning_.ipynb
+++ b/BertFineTuning_.ipynb
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "name": "BertFineTuning_.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8hzEGHl7gmzk"
+      },
+      "source": [
+        "## Setup GPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "dPOU-Efhf4ui",
+        "outputId": "1e3142a8-6351-43f3-9147-68406520b7ee"
+      },
+      "source": [
+        "import torch\n",
+        "\n",
+        "# If there's a GPU available...\n",
+        "if torch.cuda.is_available():    \n",
+        "\n",
+        "    # Tell PyTorch to use the GPU.    \n",
+        "    device = torch.device(\"cuda\")\n",
+        "\n",
+        "    print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
+        "\n",
+        "    print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
+        "\n",
+        "# If not...\n",
+        "else:\n",
+        "    print('No GPU available, using the CPU instead.')\n",
+        "    device = torch.device(\"cpu\")"
+      ],
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "No GPU available, using the CPU instead.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Jr-S9yYIgGkA"
+      },
+      "source": [
+        "## Install packages"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "pwmZ5bBvgGNh",
+        "outputId": "79c5fb08-a9f4-41bc-eb4d-ab448c5fb4a7"
+      },
+      "source": [
+        "pip install transformers"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.10.0)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
+            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n",
+            "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n",
+            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.0)\n",
+            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.4)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (5.4.1)\n",
+            "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n",
+            "Requirement already satisfied: huggingface-hub>=0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.16)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n",
+            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers) (3.7.4.3)\n",
+            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n",
+            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.5.0)\n",
+            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
+            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
+            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
+            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "VFXEpG00gXkL",
+        "outputId": "2336f39a-78b7-4118-e754-508d876c51f9"
+      },
+      "source": [
+        "pip install sentencepiece"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "12SA-qPFgsVo"
+      },
+      "source": [
+        "## Utils functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "WkIVcabUgxIl"
+      },
+      "source": [
+        "def create_dict(df, classColumnName):\n",
+        "    '''\n",
+        "    Map each distinct value of column `classColumnName` to the number of\n",
+        "    rows of `df` that carry it.\n",
+        "    '''\n",
+        "    class_counts = df[classColumnName].value_counts()\n",
+        "    return dict(class_counts)\n",
+        "\n",
+        "def remove_weak_classes(df, classColumnName, threshold):\n",
+        "    '''\n",
+        "    Drop every row whose class has fewer than `threshold` instances.\n",
+        "\n",
+        "    df: DataFrame holding one row per sample.\n",
+        "    classColumnName: name of the column that stores the class label.\n",
+        "    threshold: minimum number of rows a class needs in order to be kept.\n",
+        "\n",
+        "    Returns a new DataFrame; `df` itself is not modified.\n",
+        "    '''\n",
+        "    dictOfClassInstances = create_dict(df, classColumnName)\n",
+        "    keptClasses = [k for k, v in dictOfClassInstances.items() if v >= threshold]\n",
+        "\n",
+        "    # Select the rows to keep with a boolean mask. The former concat +\n",
+        "    # drop_duplicates(keep=False) trick also deleted legitimate rows of the\n",
+        "    # kept classes whenever they were exact duplicates of one another.\n",
+        "    return df[df[classColumnName].isin(keptClasses)]\n",
+        "\n",
+        "\n",
+        "def resample_classes(df, classColumnName, numberOfInstances, random_state=None):\n",
+        "    '''\n",
+        "    Cap every class at `numberOfInstances` randomly chosen rows.\n",
+        "\n",
+        "    Classes holding `numberOfInstances` rows or fewer are kept in full.\n",
+        "    Rows are drawn WITHOUT replacement, so no row is ever repeated.\n",
+        "\n",
+        "    random_state: optional integer seed for a private generator. When None\n",
+        "    the global NumPy random state is used (call np.random.seed() before\n",
+        "    this function to make the draw repeatable).\n",
+        "    '''\n",
+        "    rng = np.random if random_state is None else np.random.RandomState(random_state)\n",
+        "\n",
+        "    def sample_group(group):\n",
+        "        size = min(len(group), numberOfInstances)\n",
+        "        return group.loc[rng.choice(group.index, size, replace=False), :]\n",
+        "\n",
+        "    return df.groupby(classColumnName, as_index=False).apply(sample_group)\n",
+        "    "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "c5QKcXulhNJ-"
+      },
+      "source": [
+        "## Load Data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vonJ-d4Qg1g5"
+      },
+      "source": [
+        "import pandas as pd \n",
+        "import numpy as np\n",
+        "from sklearn import preprocessing\n",
+        "from sklearn.model_selection import train_test_split"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ouU5usvXg4PA"
+      },
+      "source": [
+        "dataPath = 'dataframe_with_ensemble_domaine_enccre.csv'\n",
+        "columnText = 'contentWithoutClass'\n",
+        "columnClass = 'ensemble_domaine_enccre'\n",
+        "minOfInstancePerClass = 200\n",
+        "maxOfInstancePerClass = 1500"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "5u1acjunhoxe"
+      },
+      "source": [
+        "df = pd.read_csv(dataPath)\n",
+        "df = remove_weak_classes(df, columnClass, minOfInstancePerClass)\n",
+        "df = resample_classes(df, columnClass, maxOfInstancePerClass)\n",
+        "df = df[df[columnClass] != 'unclassified']"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zrjZvs2dhzAy"
+      },
+      "source": [
+        "y  = df[columnClass]\n",
+        "numberOfClasses = y.nunique()\n",
+        "encoder = preprocessing.LabelEncoder()\n",
+        "y = encoder.fit_transform(y)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "u9AxxaA_h1CM"
+      },
+      "source": [
+        "train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Xt_PhH_6h1_3"
+      },
+      "source": [
+        "sentences = train_x[columnText].values\n",
+        "labels = train_y.tolist()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Gs4Agx_5h43M"
+      },
+      "source": [
+        "# Model\n",
+        "## Tokenisation & Input Formatting"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YZ5PhEYZiCEA"
+      },
+      "source": [
+        "tokeniser_bert = 'bert-base-multilingual-cased'\n",
+        "tokeniser_camembert = 'camembert-base'\n",
+        "\n",
+        "model_bert =  \"bert-base-multilingual-cased\"\n",
+        "model_camembert = 'camembert-base'"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "C4bigx_3ibuN",
+        "outputId": "9d54db26-9920-4a92-bb1e-4534f287140f"
+      },
+      "source": [
+        "from transformers import BertTokenizer, CamembertTokenizer\n",
+        "\n",
+        "# Load the BERT tokenizer.\n",
+        "# `tokeniser_bert` names a *cased* checkpoint, so the text must not be\n",
+        "# lower-cased: with do_lower_case=True BertTokenizer lower-cases the input\n",
+        "# (and, by default, strips accents), which no longer matches the cased vocab.\n",
+        "print('Loading BERT tokenizer...')\n",
+        "tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=False)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Loading BERT tokenizer...\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "5hNod5X9jDZN",
+        "outputId": "1166b782-d384-4388-de21-21091dc9f925"
+      },
+      "source": [
+        "# Tokenize all of the sentences and map the tokens to their word IDs.\n",
+        "input_ids = []\n",
+        "\n",
+        "# For every sentence...\n",
+        "for sent in sentences:\n",
+        "    # `encode` will:\n",
+        "    #   (1) Tokenize the sentence.\n",
+        "    #   (2) Prepend the `[CLS]` token to the start.\n",
+        "    #   (3) Append the `[SEP]` token to the end.\n",
+        "    #   (4) Map tokens to their IDs.\n",
+        "    encoded_sent = tokenizer.encode(\n",
+        "                        sent,                      # Sentence to encode.\n",
+        "                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
+        "\n",
+        "                        # This function also supports truncation and conversion\n",
+        "                        # to pytorch tensors, but I need to do padding, so I\n",
+        "                        # can't use these features.\n",
+        "                        #max_length = 128,          # Truncate all sentences.\n",
+        "                        #return_tensors = 'pt',     # Return pytorch tensors.\n",
+        "                   )\n",
+        "    \n",
+        "    # Add the encoded sentence to the list.\n",
+        "    input_ids.append(encoded_sent)\n",
+        "\n"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "W9EWv5JvjGH3",
+        "outputId": "9072122d-3586-40fe-9d75-5b6e9035d6d2"
+      },
+      "source": [
+        "print('Max sentence length: ', max([len(sen) for sen in input_ids])) "
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Max sentence length:  3462\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xh1TQJyvjOx5"
+      },
+      "source": [
+        "# Bring every encoded sentence to exactly `max_len` token IDs.\n",
+        "max_len = 180\n",
+        "pad_id = 0  # must stay 0: the attention masks below are built from `token_id > 0`\n",
+        "padded = []\n",
+        "for ids in input_ids:\n",
+        "    if len(ids) > max_len:\n",
+        "        # Truncate, but keep the closing [SEP] token that `encode` appended.\n",
+        "        ids = ids[:max_len - 1] + [tokenizer.sep_token_id]\n",
+        "    padded.append(ids + [pad_id] * (max_len - len(ids)))\n",
+        "\n",
+        "padded = input_ids = np.array(padded)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZiwY6gn0jUkD"
+      },
+      "source": [
+        " # Create attention masks\n",
+        "attention_masks = []\n",
+        "\n",
+        "# For each sentence...\n",
+        "for sent in padded:\n",
+        "    \n",
+        "    # Create the attention mask.\n",
+        "    #   - If a token ID is 0, then it's padding, set the mask to 0.\n",
+        "    #   - If a token ID is > 0, then it's a real token, set the mask to 1.\n",
+        "    att_mask = [int(token_id > 0) for token_id in sent]\n",
+        "    \n",
+        "    # Store the attention mask for this sentence.\n",
+        "    attention_masks.append(att_mask)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "oBTR5AfAjXJe"
+      },
+      "source": [
+        "# Use 90% for training and 10% for validation.\n",
+        "train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, \n",
+        "                                                            random_state=2018, test_size=0.1, stratify = labels )\n",
+        "# Do the same for the masks.\n",
+        "train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,\n",
+        "                                             random_state=2018, test_size=0.1, stratify = labels)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "b9Mw5kq3jhTb"
+      },
+      "source": [
+        "# Convert all inputs and labels into torch tensors, the required datatype \n",
+        "# for my model.\n",
+        "train_inputs = torch.tensor(train_inputs)\n",
+        "validation_inputs = torch.tensor(validation_inputs)\n",
+        "\n",
+        "train_labels = torch.tensor(train_labels)\n",
+        "validation_labels = torch.tensor(validation_labels)\n",
+        "\n",
+        "train_masks = torch.tensor(train_masks)\n",
+        "validation_masks = torch.tensor(validation_masks)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "UfFWzbENjnkw"
+      },
+      "source": [
+        "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
+        "\n",
+        "# The DataLoader needs to know the batch size for training, so I specify it here.\n",
+        "# For fine-tuning BERT on a specific task, the authors recommend a batch size of\n",
+        "# 16 or 32.\n",
+        "\n",
+        "batch_size = 32\n",
+        "\n",
+        "# Create the DataLoader for training set.\n",
+        "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n",
+        "train_sampler = RandomSampler(train_data)\n",
+        "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n",
+        "\n",
+        "# Create the DataLoader for validation set.\n",
+        "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n",
+        "validation_sampler = SequentialSampler(validation_data)\n",
+        "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "x45JNGqhkUn2"
+      },
+      "source": [
+        "## Training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 463
+        },
+        "id": "C7M2Er1ajsTf",
+        "outputId": "fe4c13b7-5157-49b4-e878-6d7676d4d1a3"
+      },
+      "source": [
+        "from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification\n",
+        "\n",
+        "# Load BertForSequenceClassification, the pretrained BERT model with a single \n",
+        "# linear classification layer on top.\n",
+        "\n",
+        "model = BertForSequenceClassification.from_pretrained(\n",
+        "    model_bert, # Multilingual *cased* BERT checkpoint (see `model_bert` above).\n",
+        "    num_labels = numberOfClasses, # One output logit per class present in the data.\n",
+        "    output_attentions = False, # Whether the model returns attentions weights.\n",
+        "    output_hidden_states = False, # Whether the model returns all hidden-states.\n",
+        ")\n",
+        "\n",
+        "# Move the model to the device picked in the setup cell (GPU when available,\n",
+        "# CPU otherwise). A bare `model.cuda()` raises on machines without a usable GPU.\n",
+        "model.to(device)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
+            "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+          ]
+        },
+        {
+          "output_type": "error",
+          "ename": "RuntimeError",
+          "evalue": "ignored",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+            "\u001b[0;32m<ipython-input-120-80c23ac5f353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;31m# Tell pytorch to run this model on the GPU.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mcuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m    635\u001b[0m             \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    636\u001b[0m         \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    639\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    529\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m             \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m    550\u001b[0m                 \u001b[0;31m# `with torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    551\u001b[0m                 \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m                     \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    553\u001b[0m                 \u001b[0mshould_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    554\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mshould_use_set_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m    635\u001b[0m             \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    636\u001b[0m         \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    639\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 91.81 MiB free; 10.63 GiB reserved in total by PyTorch)"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xd_cG-8pj4Iw"
+      },
+      "source": [
+        "#Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n",
+        "# I believe the 'W' stands for the 'Weight Decay' fix.\n",
+        "optimizer = AdamW(model.parameters(),\n",
+        "                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n",
+        "                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.\n",
+        "                )"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "65G-uHuLj4_6"
+      },
+      "source": [
+        "from transformers import get_linear_schedule_with_warmup\n",
+        "\n",
+        "# Number of training epochs (authors recommend between 2 and 4)\n",
+        "epochs = 4\n",
+        "\n",
+        "# Total number of training steps is number of batches * number of epochs.\n",
+        "total_steps = len(train_dataloader) * epochs\n",
+        "\n",
+        "# Create the learning rate scheduler.\n",
+        "scheduler = get_linear_schedule_with_warmup(optimizer, \n",
+        "                                            num_warmup_steps = 0, # Default value in run_glue.py\n",
+        "                                            num_training_steps = total_steps)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lHSOuwcMj9jf"
+      },
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "# Function to calculate the accuracy of our predictions vs labels\n",
+        "def flat_accuracy(preds, labels):\n",
+        "    '''\n",
+        "    Return the fraction of rows of `preds` whose highest-scoring column\n",
+        "    (argmax along axis 1) equals the corresponding entry of `labels`.\n",
+        "    '''\n",
+        "    predicted = np.argmax(preds, axis=1).flatten()\n",
+        "    expected = labels.flatten()\n",
+        "    n_correct = np.sum(predicted == expected)\n",
+        "    return n_correct / len(expected)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Z0S3br-7kASm"
+      },
+      "source": [
+        "import time\n",
+        "import datetime\n",
+        "\n",
+        "def format_time(elapsed):\n",
+        "    '''\n",
+        "    Render a duration given in seconds as the string form of a\n",
+        "    `datetime.timedelta` (h:mm:ss).\n",
+        "    '''\n",
+        "    # Whole seconds only: round first, then build the timedelta.\n",
+        "    whole_seconds = int(round(elapsed))\n",
+        "    duration = datetime.timedelta(seconds=whole_seconds)\n",
+        "    return str(duration)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SbHBbYpwkKaA"
+      },
+      "source": [
+        "import random\n",
+        "\n",
+        "# This training code is based on the `run_glue.py` script here:\n",
+        "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
+        "\n",
+        "\n",
+        "# Set the seed value all over the place to make this reproducible.\n",
+        "seed_val = 42\n",
+        "\n",
+        "random.seed(seed_val)\n",
+        "np.random.seed(seed_val)\n",
+        "torch.manual_seed(seed_val)\n",
+        "torch.cuda.manual_seed_all(seed_val)\n",
+        "\n",
+        "# Store the average loss after each epoch so I can plot them.\n",
+        "loss_values = []\n",
+        "\n",
+        "# For each epoch...\n",
+        "for epoch_i in range(0, epochs):\n",
+        "    \n",
+        "    # ========================================\n",
+        "    #               Training\n",
+        "    # ========================================\n",
+        "    \n",
+        "    # Perform one full pass over the training set.\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
+        "    print('Training...')\n",
+        "\n",
+        "    # Measure how long the training epoch takes.\n",
+        "    t0 = time.time()\n",
+        "\n",
+        "    # Reset the total loss for this epoch.\n",
+        "    total_loss = 0\n",
+        "\n",
+        "    # Put the model into training mode.\n",
+        "    model.train()\n",
+        "\n",
+        "    # For each batch of training data...\n",
+        "    for step, batch in enumerate(train_dataloader):\n",
+        "\n",
+        "        # Progress update every 40 batches.\n",
+        "        if step % 40 == 0 and not step == 0:\n",
+        "            # Calculate elapsed time in minutes.\n",
+        "            elapsed = format_time(time.time() - t0)\n",
+        "            \n",
+        "            # Report progress.\n",
+        "            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n",
+        "\n",
+        "        # Unpack this training batch from the dataloader. \n",
+        "        #\n",
+        "        # As I unpack the batch, I'll also copy each tensor to the GPU using the \n",
+        "        # `to` method.\n",
+        "        #\n",
+        "        # `batch` contains three pytorch tensors:\n",
+        "        #   [0]: input ids \n",
+        "        #   [1]: attention masks\n",
+        "        #   [2]: labels \n",
+        "        b_input_ids = batch[0].to(device)\n",
+        "        b_input_mask = batch[1].to(device)\n",
+        "        b_labels = batch[2].to(device)\n",
+        "\n",
+        "        # Always clear any previously calculated gradients before performing a\n",
+        "        # backward pass. PyTorch doesn't do this automatically because \n",
+        "        # accumulating the gradients is \"convenient while training RNNs\". \n",
+        "        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n",
+        "        model.zero_grad()        \n",
+        "\n",
+        "        # Perform a forward pass (evaluate the model on this training batch).\n",
+        "        # This will return the loss (rather than the model output) because I\n",
+        "        # have provided the `labels`.\n",
+        "        # The documentation for this `model` function is here: \n",
+        "        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n",
+        "        outputs = model(b_input_ids, \n",
+        "                    token_type_ids=None, \n",
+        "                    attention_mask=b_input_mask, \n",
+        "                    labels=b_labels)\n",
+        "        \n",
+        "        # The call to `model` always returns a tuple, so I need to pull the \n",
+        "        # loss value out of the tuple.\n",
+        "        loss = outputs[0]\n",
+        "\n",
+        "        # Accumulate the training loss over all of the batches so that I can\n",
+        "        # calculate the average loss at the end. `loss` is a Tensor containing a\n",
+        "        # single value; the `.item()` function just returns the Python value \n",
+        "        # from the tensor.\n",
+        "        total_loss += loss.item()\n",
+        "\n",
+        "        # Perform a backward pass to calculate the gradients.\n",
+        "        loss.backward()\n",
+        "\n",
+        "        # Clip the norm of the gradients to 1.0.\n",
+        "        # This is to help prevent the \"exploding gradients\" problem.\n",
+        "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
+        "\n",
+        "        # Update parameters and take a step using the computed gradient.\n",
+        "        # The optimizer dictates the \"update rule\"--how the parameters are\n",
+        "        # modified based on their gradients, the learning rate, etc.\n",
+        "        optimizer.step()\n",
+        "\n",
+        "        # Update the learning rate.\n",
+        "        scheduler.step()\n",
+        "\n",
+        "    # Calculate the average loss over the training data.\n",
+        "    avg_train_loss = total_loss / len(train_dataloader)            \n",
+        "    \n",
+        "    # Store the loss value for plotting the learning curve.\n",
+        "    loss_values.append(avg_train_loss)\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print(\"  Average training loss: {0:.2f}\".format(avg_train_loss))\n",
+        "    print(\"  Training epoch took: {:}\".format(format_time(time.time() - t0)))\n",
+        "        \n",
+        "    # ========================================\n",
+        "    #               Validation\n",
+        "    # ========================================\n",
+        "    # After the completion of each training epoch, measure the performance on\n",
+        "    # the validation set.\n",
+        "\n",
+        "    print(\"\")\n",
+        "    print(\"Running Validation...\")\n",
+        "\n",
+        "    t0 = time.time()\n",
+        "\n",
+        "    # Put the model in evaluation mode--the dropout layers behave differently\n",
+        "    # during evaluation.\n",
+        "    model.eval()\n",
+        "\n",
+        "    # Tracking variables \n",
+        "    eval_loss, eval_accuracy = 0, 0\n",
+        "    nb_eval_steps, nb_eval_examples = 0, 0\n",
+        "\n",
+        "    # Evaluate data for one epoch\n",
+        "    for batch in validation_dataloader:\n",
+        "        \n",
+        "        # Add batch to GPU\n",
+        "        batch = tuple(t.to(device) for t in batch)\n",
+        "        \n",
+        "        # Unpack the inputs from dataloader\n",
+        "        b_input_ids, b_input_mask, b_labels = batch\n",
+        "        \n",
+        "        # Telling the model not to compute or store gradients, saving memory and\n",
+        "        # speeding up validation\n",
+        "        with torch.no_grad():        \n",
+        "\n",
+        "            # Forward pass, calculate logit predictions.\n",
+        "            # This will return the logits rather than the loss because we have\n",
+        "            # not provided labels.\n",
+        "            # token_type_ids is the same as the \"segment ids\", which \n",
+        "            # differentiates sentence 1 and 2 in 2-sentence tasks.\n",
+        "            # The documentation for this `model` function is here: \n",
+        "            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n",
+        "            outputs = model(b_input_ids, \n",
+        "                            token_type_ids=None, \n",
+        "                            attention_mask=b_input_mask)\n",
+        "        \n",
+        "        # Get the \"logits\" output by the model. The \"logits\" are the output\n",
+        "        # values prior to applying an activation function like the softmax.\n",
+        "        logits = outputs[0]\n",
+        "\n",
+        "        # Move logits and labels to CPU\n",
+        "        logits = logits.detach().cpu().numpy()\n",
+        "        label_ids = b_labels.to('cpu').numpy()\n",
+        "        \n",
+        "        # Calculate the accuracy for this batch of test sentences.\n",
+        "        tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n",
+        "        \n",
+        "        # Accumulate the total accuracy.\n",
+        "        eval_accuracy += tmp_eval_accuracy\n",
+        "\n",
+        "        # Track the number of batches\n",
+        "        nb_eval_steps += 1\n",
+        "\n",
+        "    # Report the final accuracy for this validation run.\n",
+        "    print(\"  Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n",
+        "    print(\"  Validation took: {:}\".format(format_time(time.time() - t0)))\n",
+        "\n",
+        "print(\"\")\n",
+        "print(\"Training complete!\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "VJwyfmakkQyj"
+      },
+      "source": [
+        "## Test"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "VAyzmfhZCGZo"
+      },
+      "source": [
+        "sentences_test = test_x[columnText].values\n",
+        "labels_test = test_y.tolist()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lZFXr_sdCJcb"
+      },
+      "source": [
+        "# Tokenize all of the test sentences and map the tokens to their word IDs.\n",
+        "# `encode` will:\n",
+        "#   (1) Tokenize the sentence.\n",
+        "#   (2) Prepend the `[CLS]` token to the start.\n",
+        "#   (3) Append the `[SEP]` token to the end.\n",
+        "#   (4) Map tokens to their IDs.\n",
+        "encoded_test = [\n",
+        "    tokenizer.encode(sent, add_special_tokens=True)  # Add '[CLS]' and '[SEP]'\n",
+        "    for sent in sentences_test\n",
+        "]\n",
+        "\n",
+        "# Truncate or right-pad (with token ID 0) every sequence to exactly `max_len`\n",
+        "# tokens, the same way the training inputs were prepared.\n",
+        "padded_test = [\n",
+        "    ids[:max_len] + [0] * max(0, max_len - len(ids))\n",
+        "    for ids in encoded_test\n",
+        "]\n",
+        "input_ids_test = np.array(padded_test)\n",
+        "\n",
+        "# Attention masks: 1 for a real token (ID > 0), 0 for padding. They are stored\n",
+        "# under a test-specific name, with integer values like the training masks, so\n",
+        "# the training `attention_masks` list is not overwritten when this cell runs.\n",
+        "attention_masks_test = [[int(token_id > 0) for token_id in seq] for seq in input_ids_test]\n",
+        "\n",
+        "# Convert to tensors.\n",
+        "prediction_inputs = torch.tensor(input_ids_test)\n",
+        "prediction_masks = torch.tensor(attention_masks_test)\n",
+        "prediction_labels = torch.tensor(labels_test)\n",
+        "\n",
+        "# Batch size used for prediction only; a separate name leaves the training\n",
+        "# `batch_size` untouched.\n",
+        "prediction_batch_size = 32\n",
+        "\n",
+        "# Create the DataLoader.\n",
+        "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n",
+        "prediction_sampler = SequentialSampler(prediction_data)\n",
+        "prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=prediction_batch_size)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SUDcxi03Cmf-"
+      },
+      "source": [
+        "print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))\n",
+        "\n",
+        "# Put model in evaluation mode\n",
+        "model.eval()\n",
+        "\n",
+        "# Tracking variables \n",
+        "predictions_test , true_labels = [], []\n",
+        "\n",
+        "# Predict \n",
+        "for batch in prediction_dataloader:\n",
+        "# Add batch to GPU\n",
+        "    batch = tuple(t.to(device) for t in batch)\n",
+        "    \n",
+        "    # Unpack the inputs from the dataloader\n",
+        "    b_input_ids, b_input_mask, b_labels = batch\n",
+        "    \n",
+        "    # Telling the model not to compute or store gradients, saving memory and \n",
+        "    # speeding up prediction\n",
+        "    with torch.no_grad():\n",
+        "        # Forward pass, calculate logit predictions\n",
+        "        outputs = model(b_input_ids, token_type_ids=None, \n",
+        "                        attention_mask=b_input_mask)\n",
+        "\n",
+        "    logits = outputs[0]\n",
+        "    #print(logits)\n",
+        "\n",
+        "    # Move logits and labels to CPU\n",
+        "    logits = logits.detach().cpu().numpy()\n",
+        "    label_ids = b_labels.to('cpu').numpy()\n",
+        "    #print(logits)\n",
+        "    \n",
+        "    # Store predictions and true labels\n",
+        "    predictions_test.append(logits)\n",
+        "    true_labels.append(label_ids)\n",
+        "\n",
+        "print('    DONE.')"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "c3i7szp3Cn5u"
+      },
+      "source": [
+        "from sklearn.metrics import *\n",
+        "\n",
+        "# Turn each batch of logits into predicted class indices.\n",
+        "print('Calculating the predicted labels for each batch...')\n",
+        "\n",
+        "# Each entry of `predictions_test` holds the logits of one batch, with one\n",
+        "# column per class. The predicted label of a row is the index of its largest\n",
+        "# logit.\n",
+        "pred_labels = [\n",
+        "    np.argmax(batch_logits, axis=1).flatten()\n",
+        "    for batch_logits in predictions_test\n",
+        "]\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "0bU9-DsBCxSO"
+      },
+      "source": [
+        "pred_labels_ = [item for sublist in pred_labels for item in sublist]\n",
+        "true_labels_ = [item for sublist in true_labels for item in sublist]\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZUM_U2QlC4K5"
+      },
+      "source": [
+        "### Report & Evaluation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "d5n84N0xCfcU"
+      },
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "from sklearn.metrics import plot_confusion_matrix\n",
+        "from sklearn.metrics import confusion_matrix\n",
+        "from sklearn.metrics import classification_report\n",
+        "import seaborn as sns"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "v4hXk-KjC-nq"
+      },
+      "source": [
+        "# scikit-learn expects the ground truth first and the predictions second;\n",
+        "# passing them the other way round exchanges precision and recall.\n",
+        "report = classification_report(true_labels_, pred_labels_, output_dict = True)\n",
+        "\n",
+        "accuracy = report['accuracy']\n",
+        "weighted_avg = report['weighted avg']"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xETMy1L6DAa5"
+      },
+      "source": [
+        "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "classesName = encoder.classes_"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dPjV_5g8DDQy"
+      },
+      "source": [
+        "# Per-class metrics from the classification report, in the order of `classes`.\n",
+        "precision = [report[c]['precision'] for c in classes]\n",
+        "recall = [report[c]['recall'] for c in classes]\n",
+        "f1 = [report[c]['f1-score'] for c in classes]\n",
+        "support = [report[c]['support'] for c in classes]\n",
+        "\n",
+        "accuracy = report['accuracy']\n",
+        "weighted_avg = report['weighted avg']\n",
+        "\n",
+        "# One-vs-rest counts derived from the confusion matrix: the diagonal is\n",
+        "# subtracted from the column sums (FP) and from the row sums (FN); TN is\n",
+        "# whatever remains of the grand total.\n",
+        "cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
+        "TP = np.diag(cnf_matrix)\n",
+        "FP = cnf_matrix.sum(axis=0) - TP\n",
+        "FN = cnf_matrix.sum(axis=1) - TP\n",
+        "TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "\n",
+        "# One row per class, combining the report metrics with the raw counts.\n",
+        "dff = pd.DataFrame({\n",
+        "    'className': classesName,\n",
+        "    'precision': precision,\n",
+        "    'recall': recall,\n",
+        "    'f1-score': f1,\n",
+        "    'support': support,\n",
+        "    'FP': FP,\n",
+        "    'FN': FN,\n",
+        "    'TP': TP,\n",
+        "    'TN': TN,\n",
+        "})"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vslzi9bHDKcv"
+      },
+      "source": [
+        "print(weighted_avg)\n",
+        "print(accuracy)\n",
+        "print(dff)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
+%% Cell type:markdown id: tags:
+## Setup GPU
+%% Cell type:code id: tags:
+``` 
+import torch
+# If there's a GPU available...
+if torch.cuda.is_available():
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+```
+%% Output
+    No GPU available, using the CPU instead.
+%% Cell type:markdown id: tags:
+## Install packages
+%% Cell type:code id: tags:
+``` 
+pip install transformers
+```
+%% Output
+    Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.10.0)
+    Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)
+    Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)
+    Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)
+    Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.0)
+    Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.4)
+    Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (5.4.1)
+    Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)
+    Requirement already satisfied: huggingface-hub>=0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.16)
+    Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)
+    Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)
+    Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)
+    Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers) (3.7.4.3)
+    Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)
+    Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.5.0)
+    Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)
+    Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)
+    Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)
+    Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)
+    Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)
+    Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)
+    Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)
+%% Cell type:code id: tags:
+``` 
+pip install sentencepiece
+```
+%% Output
+    Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)
+%% Cell type:markdown id: tags:
+## Utils functions
+%% Cell type:code id: tags:
+``` 
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+def remove_weak_classes(df, classColumnName, threshold):
+    dictOfClassInstances = create_dict(df,classColumnName)
+    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
+    keys = [*dictionary]
+    df_tmp = df[~ df[classColumnName].isin(keys)]
+    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)
+    return df
+def resample_classes(df, classColumnName, numberOfInstances):
+    #random numberOfInstances elements
+    replace = False  # with replacement
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+```
+%% Cell type:markdown id: tags:
+## Load Data
+%% Cell type:code id: tags:
+``` 
+import pandas as pd
+import numpy as np
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+```
+%% Cell type:code id: tags:
+``` 
+dataPath = 'dataframe_with_ensemble_domaine_enccre.csv'
+columnText = 'contentWithoutClass'
+columnClass = 'ensemble_domaine_enccre'
+minOfInstancePerClass = 200
+maxOfInstancePerClass = 1500
+```
+%% Cell type:code id: tags:
+``` 
+df = pd.read_csv(dataPath)
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+df = resample_classes(df, columnClass, maxOfInstancePerClass)
+df = df[df[columnClass] != 'unclassified']
+```
+%% Cell type:code id: tags:
+``` 
+y  = df[columnClass]
+numberOfClasses = y.nunique()
+encoder = preprocessing.LabelEncoder()
+y = encoder.fit_transform(y)
+```
+%% Cell type:code id: tags:
+``` 
+train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+```
+%% Cell type:code id: tags:
+``` 
+sentences = train_x[columnText].values
+labels = train_y.tolist()
+```
+%% Cell type:markdown id: tags:
+# Model
+## Tokenisation & Input Formatting
+%% Cell type:code id: tags:
+``` 
+tokeniser_bert = 'bert-base-multilingual-cased'
+tokeniser_camembert = 'camembert-base'
+model_bert =  "bert-base-multilingual-cased"
+model_camembert = 'camembert-base'
+```
+%% Cell type:code id: tags:
+``` 
+from transformers import BertTokenizer, CamembertTokenizer
+# Load the BERT tokenizer.
+print('Loading BERT tokenizer...')
+tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
+```
+%% Output
+    Loading BERT tokenizer...
+%% Cell type:code id: tags:
+``` 
+ # Tokenize all of the sentences and map the tokens to their word IDs.
+input_ids = []
+# For every sentence...
+for sent in sentences:
+    # `encode` will:
+    #   (1) Tokenize the sentence.
+    #   (2) Prepend the `[CLS]` token to the start.
+    #   (3) Append the `[SEP]` token to the end.
+    #   (4) Map tokens to their IDs.
+    encoded_sent = tokenizer.encode(
+                        sent,                      # Sentence to encode.
+                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
+                        # This function also supports truncation and conversion
+                        # to pytorch tensors, but I need to do padding, so I
+                        # can't use these features.
+                        #max_length = 128,          # Truncate all sentences.
+                        #return_tensors = 'pt',     # Return pytorch tensors.
+                   )
+    # Add the encoded sentence to the list.
+    input_ids.append(encoded_sent)
+```
+%% Output
+    Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors
+%% Cell type:code id: tags:
+``` 
+print('Max sentence length: ', max([len(sen) for sen in input_ids]))
+```
+%% Output
+    Max sentence length:  3462
+%% Cell type:code id: tags:
+``` 
+max_len = 180
+padded = []
+for i in input_ids:
+  if len(i) > max_len:
+    padded.extend([i[:max_len]])
+  else:
+    padded.extend([i + [0] * (max_len - len(i))])
+padded = input_ids = np.array(padded)
+```
+%% Cell type:code id: tags:
+``` 
+ # Create attention masks
+attention_masks = []
+# For each sentence...
+for sent in padded:
+    # Create the attention mask.
+    #   - If a token ID is 0, then it's padding, set the mask to 0.
+    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+    att_mask = [int(token_id > 0) for token_id in sent]
+    # Store the attention mask for this sentence.
+    attention_masks.append(att_mask)
+```
+%% Cell type:code id: tags:
+``` 
+# Use 90% for training and 10% for validation.
+train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
+                                                            random_state=2018, test_size=0.1, stratify = labels )
+# Do the same for the masks.
+train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
+                                             random_state=2018, test_size=0.1, stratify = labels)
+```
+%% Cell type:code id: tags:
+``` 
+# Convert all inputs and labels into torch tensors, the required datatype
+# for my model.
+train_inputs = torch.tensor(train_inputs)
+validation_inputs = torch.tensor(validation_inputs)
+train_labels = torch.tensor(train_labels)
+validation_labels = torch.tensor(validation_labels)
+train_masks = torch.tensor(train_masks)
+validation_masks = torch.tensor(validation_masks)
+```
+%% Cell type:code id: tags:
+``` 
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+# The DataLoader needs to know the batch size for training, so I specify it here.
+# For fine-tuning BERT on a specific task, the authors recommend a batch size of
+# 16 or 32.
+batch_size = 32
+# Create the DataLoader for training set.
+train_data = TensorDataset(train_inputs, train_masks, train_labels)
+train_sampler = RandomSampler(train_data)
+train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+# Create the DataLoader for validation set.
+validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
+validation_sampler = SequentialSampler(validation_data)
+validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
+```
+%% Cell type:markdown id: tags:
+## Training
+%% Cell type:code id: tags:
+``` 
+from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
+# Load BertForSequenceClassification, the pretrained BERT model with a single
+# linear classification layer on top.
+model = BertForSequenceClassification.from_pretrained(
+    model_bert, # Use the 12-layer BERT model, with an uncased vocab.
+    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
+                    # You can increase this for multi-class tasks.
+    output_attentions = False, # Whether the model returns attentions weights.
+    output_hidden_states = False, # Whether the model returns all hidden-states.
+)
+# Tell pytorch to run this model on the GPU.
+model.cuda()
+```
+%% Output
+    Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
+    - This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+    - This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+    Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
+    You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+    ---------------------------------------------------------------------------
+    RuntimeError                              Traceback (most recent call last)
+    <ipython-input-120-80c23ac5f353> in <module>()
+         13
+         14 # Tell pytorch to run this model on the GPU.
+    ---> 15 model.cuda()
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in cuda(self, device)
+        635             Module: self
+        636         """
+    --> 637         return self._apply(lambda t: t.cuda(device))
+        638
+        639     def xpu(self: T, device: Optional[Union[int, device]] = None) -> T:
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
+        528     def _apply(self, fn):
+        529         for module in self.children():
+    --> 530             module._apply(fn)
+        531
+        532         def compute_should_use_set_data(tensor, tensor_applied):
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
+        528     def _apply(self, fn):
+        529         for module in self.children():
+    --> 530             module._apply(fn)
+        531
+        532         def compute_should_use_set_data(tensor, tensor_applied):
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
+        528     def _apply(self, fn):
+        529         for module in self.children():
+    --> 530             module._apply(fn)
+        531
+        532         def compute_should_use_set_data(tensor, tensor_applied):
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
+        550                 # `with torch.no_grad():`
+        551                 with torch.no_grad():
+    --> 552                     param_applied = fn(param)
+        553                 should_use_set_data = compute_should_use_set_data(param, param_applied)
+        554                 if should_use_set_data:
+    /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in <lambda>(t)
+        635             Module: self
+        636         """
+    --> 637         return self._apply(lambda t: t.cuda(device))
+        638
+        639     def xpu(self: T, device: Optional[Union[int, device]] = None) -> T:
+    RuntimeError: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 91.81 MiB free; 10.63 GiB reserved in total by PyTorch)
+%% Cell type:code id: tags:
+``` 
+#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
+# I believe the 'W' stands for 'Weight Decay fix"
+optimizer = AdamW(model.parameters(),
+                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
+                )
+```
+%% Cell type:code id: tags:
+``` 
+from transformers import get_linear_schedule_with_warmup
+# Number of training epochs (authors recommend between 2 and 4)
+epochs = 4
+# Total number of training steps is number of batches * number of epochs.
+total_steps = len(train_dataloader) * epochs
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer,
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
+```
+%% Cell type:code id: tags:
+``` 
+import numpy as np
+# Function to calculate the accuracy of our predictions vs labels
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+```
+%% Cell type:code id: tags:
+``` 
+import time
+import datetime
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+```
+%% Cell type:code id: tags:
+``` 
+import random
+# This training code is based on the `run_glue.py` script here:
+# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+# Set the seed value all over the place to make this reproducible.
+seed_val = 42
+random.seed(seed_val)
+np.random.seed(seed_val)
+torch.manual_seed(seed_val)
+torch.cuda.manual_seed_all(seed_val)
+# Store the average loss after each epoch so I can plot them.
+loss_values = []
+# For each epoch...
+for epoch_i in range(0, epochs):
+    # ========================================
+    #               Training
+    # ========================================
+    # Perform one full pass over the training set.
+    print("")
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    print('Training...')
+    # Measure how long the training epoch takes.
+    t0 = time.time()
+    # Reset the total loss for this epoch.
+    total_loss = 0
+    # Put the model into training mode.
+    model.train()
+    # For each batch of training data...
+    for step, batch in enumerate(train_dataloader):
+        # Progress update every 40 batches.
+        if step % 40 == 0 and not step == 0:
+            # Calculate elapsed time in minutes.
+            elapsed = format_time(time.time() - t0)
+            # Report progress.
+            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+        # Unpack this training batch from the dataloader.
+        #
+        # As I unpack the batch, I'll also copy each tensor to the GPU using the
+        # `to` method.
+        #
+        # `batch` contains three pytorch tensors:
+        #   [0]: input ids
+        #   [1]: attention masks
+        #   [2]: labels
+        b_input_ids = batch[0].to(device)
+        b_input_mask = batch[1].to(device)
+        b_labels = batch[2].to(device)
+        # Always clear any previously calculated gradients before performing a
+        # backward pass. PyTorch doesn't do this automatically because
+        # accumulating the gradients is "convenient while training RNNs".
+        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+        model.zero_grad()
+        # Perform a forward pass (evaluate the model on this training batch).
+        # This will return the loss (rather than the model output) because I
+        # have provided the `labels`.
+        # The documentation for this `model` function is here:
+        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+        outputs = model(b_input_ids,
+                    token_type_ids=None,
+                    attention_mask=b_input_mask,
+                    labels=b_labels)
+        # The call to `model` always returns a tuple, so I need to pull the
+        # loss value out of the tuple.
+        loss = outputs[0]
+        # Accumulate the training loss over all of the batches so that I can
+        # calculate the average loss at the end. `loss` is a Tensor containing a
+        # single value; the `.item()` function just returns the Python value
+        # from the tensor.
+        total_loss += loss.item()
+        # Perform a backward pass to calculate the gradients.
+        loss.backward()
+        # Clip the norm of the gradients to 1.0.
+        # This is to help prevent the "exploding gradients" problem.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        # Update parameters and take a step using the computed gradient.
+        # The optimizer dictates the "update rule"--how the parameters are
+        # modified based on their gradients, the learning rate, etc.
+        optimizer.step()
+        # Update the learning rate.
+        scheduler.step()
+    # Calculate the average loss over the training data.
+    avg_train_loss = total_loss / len(train_dataloader)
+    # Store the loss value for plotting the learning curve.
+    loss_values.append(avg_train_loss)
+    print("")
+    print("  Average training loss: {0:.2f}".format(avg_train_loss))
+    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
+    # ========================================
+    #               Validation
+    # ========================================
+    # After the completion of each training epoch, measure the performance on
+    # the validation set.
+    print("")
+    print("Running Validation...")
+    t0 = time.time()
+    # Put the model in evaluation mode--the dropout layers behave differently
+    # during evaluation.
+    model.eval()
+    # Tracking variables
+    eval_loss, eval_accuracy = 0, 0
+    nb_eval_steps, nb_eval_examples = 0, 0
+    # Evaluate data for one epoch
+    for batch in validation_dataloader:
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+        # Unpack the inputs from dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up validation
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions.
+            # This will return the logits rather than the loss because we have
+            # not provided labels.
+            # token_type_ids is the same as the "segment ids", which
+            # differentiates sentence 1 and 2 in 2-sentence tasks.
+            # The documentation for this `model` function is here:
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            outputs = model(b_input_ids,
+                            token_type_ids=None,
+                            attention_mask=b_input_mask)
+        # Get the "logits" output by the model. The "logits" are the output
+        # values prior to applying an activation function like the softmax.
+        logits = outputs[0]
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+        # Calculate the accuracy for this batch of test sentences.
+        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
+        # Accumulate the total accuracy.
+        eval_accuracy += tmp_eval_accuracy
+        # Track the number of batches
+        nb_eval_steps += 1
+    # Report the final accuracy for this validation run.
+    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+    print("  Validation took: {:}".format(format_time(time.time() - t0)))
+print("")
+print("Training complete!")
+```
+%% Cell type:markdown id: tags:
+## Test
+%% Cell type:code id: tags:
+``` 
+# Values of the `columnText` column of `test_x`; these are the sentences that
+# get tokenized for prediction below.
+sentences_test = test_x[columnText].values
+# Matching labels from `test_y`, converted to a plain Python list.
+labels_test = test_y.tolist()
+```
+%% Cell type:code id: tags:
+``` 
+# Tokenize all of the sentences and map the tokens to thier word IDs.
+input_ids_test = []
+# For every sentence...
+for sent in sentences_test:
+    # `encode` will:
+    #   (1) Tokenize the sentence.
+    #   (2) Prepend the `[CLS]` token to the start.
+    #   (3) Append the `[SEP]` token to the end.
+    #   (4) Map tokens to their IDs.
+    encoded_sent = tokenizer.encode(
+                        sent,                      # Sentence to encode.
+                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
+                )
+    input_ids_test.append(encoded_sent)
+# Pad our input tokens
+padded_test = []
+for i in input_ids_test:
+  if len(i) > max_len:
+    padded_test.extend([i[:max_len]])
+  else:
+    padded_test.extend([i + [0] * (max_len - len(i))])
+input_ids_test = np.array(padded_test)
+# Create attention masks
+attention_masks = []
+# Create a mask of 1s for each token followed by 0s for padding
+for seq in input_ids_test:
+    seq_mask = [float(i>0) for i in seq]
+    attention_masks.append(seq_mask)
+# Convert to tensors.
+prediction_inputs = torch.tensor(input_ids_test)
+prediction_masks = torch.tensor(attention_masks)
+prediction_labels = torch.tensor(labels_test)
+# Set the batch size.
+batch_size = 32
+# Create the DataLoader.
+prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+prediction_sampler = SequentialSampler(prediction_data)
+prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+```
+%% Cell type:code id: tags:
+``` 
+print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
+# Put model in evaluation mode
+model.eval()
+# Tracking variables
+predictions_test , true_labels = [], []
+# Predict
+for batch in prediction_dataloader:
+# Add batch to GPU
+    batch = tuple(t.to(device) for t in batch)
+    # Unpack the inputs from the dataloader
+    b_input_ids, b_input_mask, b_labels = batch
+    # Telling the model not to compute or store gradients, saving memory and
+    # speeding up prediction
+    with torch.no_grad():
+        # Forward pass, calculate logit predictions
+        outputs = model(b_input_ids, token_type_ids=None,
+                        attention_mask=b_input_mask)
+    logits = outputs[0]
+    #print(logits)
+    # Move logits and labels to CPU
+    logits = logits.detach().cpu().numpy()
+    label_ids = b_labels.to('cpu').numpy()
+    #print(logits)
+    # Store predictions and true labels
+    predictions_test.append(logits)
+    true_labels.append(label_ids)
+print('    DONE.')
+```
+%% Cell type:code id: tags:
+``` 
+from sklearn.metrics import *
+pred_labels = []
+# Evaluate each test batch using many matrics
+print('Calculating the matrics for each batch...')
+for i in range(len(true_labels)):
+  # The predictions for this batch are a 2-column ndarray (one column for "0"
+  # and one column for "1"). Pick the label with the highest value and turn this
+  # in to a list of 0s and 1s.
+  pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
+  pred_labels.append(pred_labels_i)
+```
+%% Cell type:code id: tags:
+``` 
+pred_labels_ = [item for sublist in pred_labels for item in sublist]
+true_labels_ = [item for sublist in true_labels for item in sublist]
+```
+%% Cell type:markdown id: tags:
+### Report & Evaluation
+%% Cell type:code id: tags:
+``` 
+import matplotlib.pyplot as plt
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+import seaborn as sns
+```
+%% Cell type:code id: tags:
+``` 
+report = classification_report( pred_labels_, true_labels_, output_dict = True)
+accuracy = report['accuracy']
+weighted_avg = report['weighted avg']
+```
+%% Cell type:code id: tags:
+``` 
+classes = [str(e) for e in encoder.transform(encoder.classes_)]
+classesName = encoder.classes_
+```
+%% Cell type:code id: tags:
+``` 
+precision = []
+recall = []
+f1 = []
+support = []
+dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+for c in classes:
+  precision.append(report[c]['precision'])
+  recall.append(report[c]['recall'])
+  f1.append(report[c]['f1-score'])
+  support.append(report[c]['support'])
+accuracy = report['accuracy']
+weighted_avg = report['weighted avg']
+cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
+FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+TP = np.diag(cnf_matrix)
+TN = cnf_matrix.sum() - (FP + FN + TP)
+dff['className'] = classesName
+dff['precision'] = precision
+dff['recall'] = recall
+dff['f1-score'] = f1
+dff['support'] = support
+dff['FP'] = FP
+dff['FN'] = FN
+dff['TP'] = TP
+dff['TN'] = TN
+```
+%% Cell type:code id: tags:
+``` 
+print(weighted_avg)
+print(accuracy)
+print(dff)
+```