{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "accelerator": "GPU", "colab": { "name": "BertFineTuning_.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "8hzEGHl7gmzk" }, "source": [ "## Setup GPU" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dPOU-Efhf4ui", "outputId": "1e3142a8-6351-43f3-9147-68406520b7ee" }, "source": [ "import torch\n", "\n", "# If there's a GPU available...\n", "if torch.cuda.is_available(): \n", "\n", " # Tell PyTorch to use the GPU. \n", " device = torch.device(\"cuda\")\n", "\n", " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", "\n", " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", "\n", "# If not...\n", "else:\n", " print('No GPU available, using the CPU instead.')\n", " device = torch.device(\"cpu\")" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "No GPU available, using the CPU instead.\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "Jr-S9yYIgGkA" }, "source": [ "## Install packages" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pwmZ5bBvgGNh", "outputId": "79c5fb08-a9f4-41bc-eb4d-ab448c5fb4a7" }, "source": [ "pip install transformers" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.10.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n", "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.0)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.4)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (5.4.1)\n", "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n", "Requirement already satisfied: huggingface-hub>=0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.16)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers) (3.7.4.3)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.5.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VFXEpG00gXkL", "outputId": "2336f39a-78b7-4118-e754-508d876c51f9" }, "source": [ "pip install sentencepiece" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "12SA-qPFgsVo" }, "source": [ "## Utils functions" ] }, { "cell_type": "code", "metadata": { "id": "WkIVcabUgxIl" }, "source": [ "def create_dict(df, classColumnName):\n", " return dict(df[classColumnName].value_counts())\n", "\n", "def remove_weak_classes(df, classColumnName, threshold):\n", "\n", " dictOfClassInstances = create_dict(df,classColumnName)\n", "\n", "\n", " dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }\n", " keys = [*dictionary]\n", " df_tmp = df[~ df[classColumnName].isin(keys)]\n", " df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)\n", " return df\n", "\n", "\n", "def resample_classes(df, classColumnName, numberOfInstances):\n", " \n", " #random numberOfInstances elements\n", " replace = False # with replacement\n", "\n", " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", " return df.groupby(classColumnName, as_index=False).apply(fn)\n", " " ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "c5QKcXulhNJ-" }, "source": [ "## Load Data" ] }, { "cell_type": "code", "metadata": { "id": "vonJ-d4Qg1g5" }, "source": [ "import pandas as pd \n", "import numpy as np\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ouU5usvXg4PA" }, "source": [ "dataPath = 'dataframe_with_ensemble_domaine_enccre.csv'\n", "columnText = 'contentWithoutClass'\n", "columnClass = 'ensemble_domaine_enccre'\n", "minOfInstancePerClass = 200\n", "maxOfInstancePerClass = 1500" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5u1acjunhoxe" }, "source": [ "df = pd.read_csv(dataPath)\n", "df = remove_weak_classes(df, columnClass, minOfInstancePerClass)\n", "df = resample_classes(df, columnClass, maxOfInstancePerClass)\n", "df = df[df[columnClass] != 'unclassified']" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zrjZvs2dhzAy" }, "source": [ "y = df[columnClass]\n", "numberOfClasses = y.nunique()\n", "encoder = preprocessing.LabelEncoder()\n", "y = encoder.fit_transform(y)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "u9AxxaA_h1CM" }, "source": [ "train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Xt_PhH_6h1_3" }, "source": [ "sentences = train_x[columnText].values\n", "labels = train_y.tolist()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Gs4Agx_5h43M" }, "source": [ "# Model\n", "## Tokenisation & Input Formatting" ] }, { "cell_type": "code", "metadata": { "id": "YZ5PhEYZiCEA" }, "source": [ "tokeniser_bert = 'bert-base-multilingual-cased'\n", "tokeniser_camembert = 'camembert-base'\n", "\n", "model_bert = \"bert-base-multilingual-cased\"\n", "model_camembert = 'camembert-base'" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C4bigx_3ibuN", "outputId": "9d54db26-9920-4a92-bb1e-4534f287140f" }, "source": [ "from transformers import BertTokenizer, CamembertTokenizer\n", "\n", "# Load the BERT tokenizer.\n", "print('Loading BERT tokenizer...')\n", "tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Loading BERT tokenizer...\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5hNod5X9jDZN", "outputId": "1166b782-d384-4388-de21-21091dc9f925" }, "source": [ " # Tokenize all of the sentences and map the tokens to thier word IDs.\n", "input_ids = []\n", "\n", "# For every sentence...\n", "for sent in sentences:\n", " # `encode` will:\n", " # (1) Tokenize the sentence.\n", " # (2) Prepend the `[CLS]` token to the start.\n", " # (3) Append the `[SEP]` token to the end.\n", " # (4) Map tokens to their IDs.\n", " encoded_sent = tokenizer.encode(\n", " sent, # Sentence to encode.\n", " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", "\n", " # This function also supports truncation and conversion\n", " # to pytorch tensors, but I need to do padding, so I\n", " # can't use these features.\n", " #max_length = 128, # Truncate all sentences.\n", " #return_tensors = 'pt', # Return pytorch tensors.\n", " )\n", " \n", " # Add the encoded sentence to the list.\n", " input_ids.append(encoded_sent)\n", "\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W9EWv5JvjGH3", "outputId": "9072122d-3586-40fe-9d75-5b6e9035d6d2" }, "source": [ "print('Max sentence length: ', max([len(sen) for sen in input_ids])) " ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Max sentence length: 3462\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "xh1TQJyvjOx5" }, "source": [ "max_len = 180\n", "padded = []\n", "for i in input_ids:\n", "\n", " if len(i) > max_len:\n", " padded.extend([i[:max_len]])\n", " else:\n", " padded.extend([i + [0] * (max_len - len(i))])\n", "\n", "\n", "padded = input_ids = np.array(padded)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZiwY6gn0jUkD" }, "source": [ " # Create attention masks\n", "attention_masks = []\n", "\n", "# For each sentence...\n", "for sent in padded:\n", " \n", " # Create the attention mask.\n", " # - If a token ID is 0, then it's padding, set the mask to 0.\n", " # - If a token ID is > 0, then it's a real token, set the mask to 1.\n", " att_mask = [int(token_id > 0) for token_id in sent]\n", " \n", " # Store the attention mask for this sentence.\n", " attention_masks.append(att_mask)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oBTR5AfAjXJe" }, "source": [ "# Use 90% for training and 10% for validation.\n", "train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, \n", " random_state=2018, test_size=0.1, stratify = labels )\n", "# Do the same for the masks.\n", "train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,\n", " random_state=2018, test_size=0.1, stratify = labels)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "b9Mw5kq3jhTb" }, "source": [ "# Convert all inputs and labels into torch tensors, the required datatype \n", "# for my model.\n", "train_inputs = torch.tensor(train_inputs)\n", "validation_inputs = torch.tensor(validation_inputs)\n", "\n", "train_labels = torch.tensor(train_labels)\n", "validation_labels = torch.tensor(validation_labels)\n", "\n", "train_masks = torch.tensor(train_masks)\n", "validation_masks = torch.tensor(validation_masks)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "UfFWzbENjnkw" }, "source": [ "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", "\n", "# The DataLoader needs to know the batch size for training, so I specify it here.\n", "# For fine-tuning BERT on a specific task, the authors recommend a batch size of\n", "# 16 or 32.\n", "\n", "batch_size = 32\n", "\n", "# Create the DataLoader for training set.\n", "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", "train_sampler = RandomSampler(train_data)\n", "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n", "\n", "# Create the DataLoader for validation set.\n", "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n", "validation_sampler = SequentialSampler(validation_data)\n", "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "x45JNGqhkUn2" }, "source": [ "## Training" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 463 }, "id": "C7M2Er1ajsTf", "outputId": "fe4c13b7-5157-49b4-e878-6d7676d4d1a3" }, "source": [ "from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification\n", "\n", "# Load BertForSequenceClassification, the pretrained BERT model with a single \n", "# linear classification layer on top.\n", "\n", "model = BertForSequenceClassification.from_pretrained(\n", " model_bert, # Use the 12-layer BERT model, with an uncased vocab.\n", " num_labels = numberOfClasses, # The number of output labels--2 for binary classification.\n", " # You can increase this for multi-class tasks. \n", " output_attentions = False, # Whether the model returns attentions weights.\n", " output_hidden_states = False, # Whether the model returns all hidden-states.\n", ")\n", "\n", "# Tell pytorch to run this model on the GPU.\n", "model.cuda()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "output_type": "error", "ename": "RuntimeError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-120-80c23ac5f353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Tell pytorch to run this model on the GPU.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mcuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 636\u001b[0m \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0;31m# `with torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 553\u001b[0m \u001b[0mshould_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshould_use_set_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 636\u001b[0m \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 91.81 MiB free; 10.63 GiB reserved in total by PyTorch)" ] } ] }, { "cell_type": "code", "metadata": { "id": "xd_cG-8pj4Iw" }, "source": [ "#Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n", "# I believe the 'W' stands for 'Weight Decay fix\"\n", "optimizer = AdamW(model.parameters(),\n", " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n", " )" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "65G-uHuLj4_6" }, "source": [ "from transformers import get_linear_schedule_with_warmup\n", "\n", "# Number of training epochs (authors recommend between 2 and 4)\n", "epochs = 4\n", "\n", "# Total number of training steps is number of batches * number of epochs.\n", "total_steps = len(train_dataloader) * epochs\n", "\n", "# Create the learning rate scheduler.\n", "scheduler = get_linear_schedule_with_warmup(optimizer, \n", " num_warmup_steps = 0, # Default value in run_glue.py\n", " num_training_steps = total_steps)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "lHSOuwcMj9jf" }, "source": [ "import numpy as np\n", "\n", "# Function to calculate the accuracy of our predictions vs labels\n", "def flat_accuracy(preds, labels):\n", " pred_flat = np.argmax(preds, axis=1).flatten()\n", " labels_flat = labels.flatten()\n", " return np.sum(pred_flat == labels_flat) / len(labels_flat) " ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Z0S3br-7kASm" }, "source": [ "import time\n", "import datetime\n", "\n", "def format_time(elapsed):\n", " '''\n", " Takes a time in seconds and returns a string hh:mm:ss\n", " '''\n", " # Round to the nearest second.\n", " elapsed_rounded = int(round((elapsed)))\n", " \n", " # Format as hh:mm:ss\n", " return str(datetime.timedelta(seconds=elapsed_rounded))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SbHBbYpwkKaA" }, "source": [ "import random\n", "\n", "# This training code is based on the `run_glue.py` script here:\n", "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", "\n", "\n", "# Set the seed value all over the place to make this reproducible.\n", "seed_val = 42\n", "\n", "random.seed(seed_val)\n", "np.random.seed(seed_val)\n", "torch.manual_seed(seed_val)\n", "torch.cuda.manual_seed_all(seed_val)\n", "\n", "# Store the average loss after each epoch so I can plot them.\n", "loss_values = []\n", "\n", "# For each epoch...\n", "for epoch_i in range(0, epochs):\n", " \n", " # ========================================\n", " # Training\n", " # ========================================\n", " \n", " # Perform one full pass over the training set.\n", "\n", " print(\"\")\n", " print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", " print('Training...')\n", "\n", " # Measure how long the training epoch takes.\n", " t0 = time.time()\n", "\n", " # Reset the total loss for this epoch.\n", " total_loss = 0\n", "\n", " # Put the model into training mode.\n", " model.train()\n", "\n", " # For each batch of training data...\n", " for step, batch in enumerate(train_dataloader):\n", "\n", " # Progress update every 40 batches.\n", " if step % 40 == 0 and not step == 0:\n", " # Calculate elapsed time in minutes.\n", " elapsed = format_time(time.time() - t0)\n", " \n", " # Report progress.\n", " print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", "\n", " # Unpack this training batch from the dataloader. \n", " #\n", " # As I unpack the batch, I'll also copy each tensor to the GPU using the \n", " # `to` method.\n", " #\n", " # `batch` contains three pytorch tensors:\n", " # [0]: input ids \n", " # [1]: attention masks\n", " # [2]: labels \n", " b_input_ids = batch[0].to(device)\n", " b_input_mask = batch[1].to(device)\n", " b_labels = batch[2].to(device)\n", "\n", " # Always clear any previously calculated gradients before performing a\n", " # backward pass. PyTorch doesn't do this automatically because \n", " # accumulating the gradients is \"convenient while training RNNs\". \n", " # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n", " model.zero_grad() \n", "\n", " # Perform a forward pass (evaluate the model on this training batch).\n", " # This will return the loss (rather than the model output) because I\n", " # have provided the `labels`.\n", " # The documentation for this `model` function is here: \n", " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", " outputs = model(b_input_ids, \n", " token_type_ids=None, \n", " attention_mask=b_input_mask, \n", " labels=b_labels)\n", " \n", " # The call to `model` always returns a tuple, so I need to pull the \n", " # loss value out of the tuple.\n", " loss = outputs[0]\n", "\n", " # Accumulate the training loss over all of the batches so that I can\n", " # calculate the average loss at the end. `loss` is a Tensor containing a\n", " # single value; the `.item()` function just returns the Python value \n", " # from the tensor.\n", " total_loss += loss.item()\n", "\n", " # Perform a backward pass to calculate the gradients.\n", " loss.backward()\n", "\n", " # Clip the norm of the gradients to 1.0.\n", " # This is to help prevent the \"exploding gradients\" problem.\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", "\n", " # Update parameters and take a step using the computed gradient.\n", " # The optimizer dictates the \"update rule\"--how the parameters are\n", " # modified based on their gradients, the learning rate, etc.\n", " optimizer.step()\n", "\n", " # Update the learning rate.\n", " scheduler.step()\n", "\n", " # Calculate the average loss over the training data.\n", " avg_train_loss = total_loss / len(train_dataloader) \n", " \n", " # Store the loss value for plotting the learning curve.\n", " loss_values.append(avg_train_loss)\n", "\n", " print(\"\")\n", " print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n", " print(\" Training epoch took: {:}\".format(format_time(time.time() - t0)))\n", " \n", " # ========================================\n", " # Validation\n", " # ========================================\n", " # After the completion of each training epoch, measure the performance on\n", " # the validation set.\n", "\n", " print(\"\")\n", " print(\"Running Validation...\")\n", "\n", " t0 = time.time()\n", "\n", " # Put the model in evaluation mode--the dropout layers behave differently\n", " # during evaluation.\n", " model.eval()\n", "\n", " # Tracking variables \n", " eval_loss, eval_accuracy = 0, 0\n", " nb_eval_steps, nb_eval_examples = 0, 0\n", "\n", " # Evaluate data for one epoch\n", " for batch in validation_dataloader:\n", " \n", " # Add batch to GPU\n", " batch = tuple(t.to(device) for t in batch)\n", " \n", " # Unpack the inputs from dataloader\n", " b_input_ids, b_input_mask, b_labels = batch\n", " \n", " # Telling the model not to compute or store gradients, saving memory and\n", " # speeding up validation\n", " with torch.no_grad(): \n", "\n", " # Forward pass, calculate logit predictions.\n", " # This will return the logits rather than the loss because we have\n", " # not provided labels.\n", " # token_type_ids is the same as the \"segment ids\", which \n", " # differentiates sentence 1 and 2 in 2-sentence tasks.\n", " # The documentation for this `model` function is here: \n", " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", " outputs = model(b_input_ids, \n", " token_type_ids=None, \n", " attention_mask=b_input_mask)\n", " \n", " # Get the \"logits\" output by the model. The \"logits\" are the output\n", " # values prior to applying an activation function like the softmax.\n", " logits = outputs[0]\n", "\n", " # Move logits and labels to CPU\n", " logits = logits.detach().cpu().numpy()\n", " label_ids = b_labels.to('cpu').numpy()\n", " \n", " # Calculate the accuracy for this batch of test sentences.\n", " tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n", " \n", " # Accumulate the total accuracy.\n", " eval_accuracy += tmp_eval_accuracy\n", "\n", " # Track the number of batches\n", " nb_eval_steps += 1\n", "\n", " # Report the final accuracy for this validation run.\n", " print(\" Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n", " print(\" Validation took: {:}\".format(format_time(time.time() - t0)))\n", "\n", "print(\"\")\n", "print(\"Training complete!\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "VJwyfmakkQyj" }, "source": [ "## Test" ] }, { "cell_type": "code", "metadata": { "id": "VAyzmfhZCGZo" }, "source": [ "sentences_test = test_x[columnText].values\n", "labels_test = test_y.tolist()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "lZFXr_sdCJcb" }, "source": [ "# Tokenize all of the sentences and map the tokens to thier word IDs.\n", "input_ids_test = []\n", "# For every sentence...\n", "for sent in sentences_test:\n", " # `encode` will:\n", " # (1) Tokenize the sentence.\n", " # (2) Prepend the `[CLS]` token to the start.\n", " # (3) Append the `[SEP]` token to the end.\n", " # (4) Map tokens to their IDs.\n", " encoded_sent = tokenizer.encode(\n", " sent, # Sentence to encode.\n", " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", " )\n", " \n", " input_ids_test.append(encoded_sent)\n", "\n", "# Pad our input tokens\n", "padded_test = []\n", "for i in input_ids_test:\n", "\n", " if len(i) > max_len:\n", " padded_test.extend([i[:max_len]])\n", " else:\n", " padded_test.extend([i + [0] * (max_len - len(i))])\n", "input_ids_test = np.array(padded_test)\n", "\n", "# Create attention masks\n", "attention_masks = []\n", "\n", "# Create a mask of 1s for each token followed by 0s for padding\n", "for seq in input_ids_test:\n", " seq_mask = [float(i>0) for i in seq]\n", " attention_masks.append(seq_mask) \n", "\n", "# Convert to tensors.\n", "prediction_inputs = torch.tensor(input_ids_test)\n", "prediction_masks = torch.tensor(attention_masks)\n", "prediction_labels = torch.tensor(labels_test)\n", "\n", "# Set the batch size. \n", "batch_size = 32 \n", "\n", "# Create the DataLoader.\n", "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n", "prediction_sampler = SequentialSampler(prediction_data)\n", "prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SUDcxi03Cmf-" }, "source": [ "print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))\n", "\n", "# Put model in evaluation mode\n", "model.eval()\n", "\n", "# Tracking variables \n", "predictions_test , true_labels = [], []\n", "\n", "# Predict \n", "for batch in prediction_dataloader:\n", "# Add batch to GPU\n", " batch = tuple(t.to(device) for t in batch)\n", " \n", " # Unpack the inputs from the dataloader\n", " b_input_ids, b_input_mask, b_labels = batch\n", " \n", " # Telling the model not to compute or store gradients, saving memory and \n", " # speeding up prediction\n", " with torch.no_grad():\n", " # Forward pass, calculate logit predictions\n", " outputs = model(b_input_ids, token_type_ids=None, \n", " attention_mask=b_input_mask)\n", "\n", " logits = outputs[0]\n", " #print(logits)\n", "\n", " # Move logits and labels to CPU\n", " logits = logits.detach().cpu().numpy()\n", " label_ids = b_labels.to('cpu').numpy()\n", " #print(logits)\n", " \n", " # Store predictions and true labels\n", " predictions_test.append(logits)\n", " true_labels.append(label_ids)\n", "\n", "print(' DONE.')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "c3i7szp3Cn5u" }, "source": [ "from sklearn.metrics import *\n", "\n", "pred_labels = []\n", "\n", "# Evaluate each test batch using many matrics\n", "print('Calculating the matrics for each batch...')\n", "\n", "for i in range(len(true_labels)):\n", " \n", " # The predictions for this batch are a 2-column ndarray (one column for \"0\" \n", " # and one column for \"1\"). Pick the label with the highest value and turn this\n", " # in to a list of 0s and 1s.\n", " pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n", " pred_labels.append(pred_labels_i)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "0bU9-DsBCxSO" }, "source": [ "pred_labels_ = [item for sublist in pred_labels for item in sublist]\n", "true_labels_ = [item for sublist in true_labels for item in sublist]\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ZUM_U2QlC4K5" }, "source": [ "### Report & Evaluation" ] }, { "cell_type": "code", "metadata": { "id": "d5n84N0xCfcU" }, "source": [ "import matplotlib.pyplot as plt\n", "from sklearn.metrics import plot_confusion_matrix\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "import seaborn as sns" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "v4hXk-KjC-nq" }, "source": [ "report = classification_report( pred_labels_, true_labels_, output_dict = True)\n", " \n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "xETMy1L6DAa5" }, "source": [ "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", "classesName = encoder.classes_" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dPjV_5g8DDQy" }, "source": [ "precision = []\n", "recall = []\n", "f1 = []\n", "support = []\n", "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", "for c in classes:\n", " precision.append(report[c]['precision'])\n", " recall.append(report[c]['recall'])\n", " f1.append(report[c]['f1-score'])\n", " support.append(report[c]['support'])\n", "\n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n", "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", "TP = np.diag(cnf_matrix)\n", "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", "dff['className'] = classesName\n", "dff['precision'] = precision\n", "dff['recall'] = recall\n", "dff['f1-score'] = f1\n", "dff['support'] = support\n", "dff['FP'] = FP\n", "dff['FN'] = FN\n", "dff['TP'] = TP\n", "dff['TN'] = TN\n", " \n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vslzi9bHDKcv" }, "source": [ "print(weighted_avg)\n", "print(accuracy)\n", "print(dff)" ], "execution_count": null, "outputs": [] } ] }