diff --git a/BertFineTuning_.ipynb b/BertFineTuning_.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6216aebc6cc8899321e8a36d1b1f2e5eadf44530 --- /dev/null +++ b/BertFineTuning_.ipynb @@ -0,0 +1,1098 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "BertFineTuning_.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8hzEGHl7gmzk" + }, + "source": [ + "## Setup GPU" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dPOU-Efhf4ui", + "outputId": "1e3142a8-6351-43f3-9147-68406520b7ee" + }, + "source": [ + "import torch\n", + "\n", + "# If there's a GPU available...\n", + "if torch.cuda.is_available(): \n", + "\n", + " # Tell PyTorch to use the GPU. \n", + " device = torch.device(\"cuda\")\n", + "\n", + " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", + "\n", + " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", + "\n", + "# If not...\n", + "else:\n", + " print('No GPU available, using the CPU instead.')\n", + " device = torch.device(\"cpu\")" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "No GPU available, using the CPU instead.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Jr-S9yYIgGkA" + }, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pwmZ5bBvgGNh", + "outputId": "79c5fb08-a9f4-41bc-eb4d-ab448c5fb4a7" + }, + "source": [ + "pip install transformers" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.10.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n", + "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.0)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.4)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (5.4.1)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n", + "Requirement already satisfied: huggingface-hub>=0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.16)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers) 
(3.7.4.3)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.5.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VFXEpG00gXkL", + "outputId": "2336f39a-78b7-4118-e754-508d876c51f9" + }, + "source": [ + "pip install sentencepiece" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "12SA-qPFgsVo" + }, + "source": [ + "## Utils functions" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WkIVcabUgxIl" + }, + "source": [ + "def create_dict(df, classColumnName):\n", + " return dict(df[classColumnName].value_counts())\n", + "\n", + "def remove_weak_classes(df, classColumnName, threshold):\n", + "\n", + " dictOfClassInstances = create_dict(df,classColumnName)\n", + "\n", + "\n", + " dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }\n", + " keys = [*dictionary]\n", + " df_tmp = df[~ df[classColumnName].isin(keys)]\n", + " df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)\n", + " return df\n", + "\n", + "\n", + "def resample_classes(df, classColumnName, numberOfInstances):\n", + " \n", + " #random numberOfInstances elements\n", + " replace = False # with replacement\n", + "\n", + " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", + " return df.groupby(classColumnName, as_index=False).apply(fn)\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c5QKcXulhNJ-" + }, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vonJ-d4Qg1g5" + }, + "source": [ + "import pandas as pd \n", + "import numpy as np\n", + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ouU5usvXg4PA" + }, + "source": [ + "dataPath = 'dataframe_with_ensemble_domaine_enccre.csv'\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "minOfInstancePerClass = 200\n", 
+ "maxOfInstancePerClass = 1500" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5u1acjunhoxe" + }, + "source": [ + "df = pd.read_csv(dataPath)\n", + "df = remove_weak_classes(df, columnClass, minOfInstancePerClass)\n", + "df = resample_classes(df, columnClass, maxOfInstancePerClass)\n", + "df = df[df[columnClass] != 'unclassified']" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zrjZvs2dhzAy" + }, + "source": [ + "y = df[columnClass]\n", + "numberOfClasses = y.nunique()\n", + "encoder = preprocessing.LabelEncoder()\n", + "y = encoder.fit_transform(y)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "u9AxxaA_h1CM" + }, + "source": [ + "train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Xt_PhH_6h1_3" + }, + "source": [ + "sentences = train_x[columnText].values\n", + "labels = train_y.tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gs4Agx_5h43M" + }, + "source": [ + "# Model\n", + "## Tokenisation & Input Formatting" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YZ5PhEYZiCEA" + }, + "source": [ + "tokeniser_bert = 'bert-base-multilingual-cased'\n", + "tokeniser_camembert = 'camembert-base'\n", + "\n", + "model_bert = \"bert-base-multilingual-cased\"\n", + "model_camembert = 'camembert-base'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "C4bigx_3ibuN", + "outputId": "9d54db26-9920-4a92-bb1e-4534f287140f" + }, + "source": [ + "from transformers import BertTokenizer, CamembertTokenizer\n", + "\n", + "# Load the BERT tokenizer.\n", + "print('Loading BERT tokenizer...')\n", + "tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading BERT tokenizer...\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5hNod5X9jDZN", + "outputId": "1166b782-d384-4388-de21-21091dc9f925" + }, + "source": [ + " # Tokenize all of the sentences and map the tokens to thier word IDs.\n", + "input_ids = []\n", + "\n", + "# For every sentence...\n", + "for sent in sentences:\n", + " # `encode` will:\n", + " # (1) Tokenize the sentence.\n", + " # (2) Prepend the `[CLS]` token to the start.\n", + " # (3) Append the `[SEP]` token to the end.\n", + " # (4) Map tokens to their IDs.\n", + " encoded_sent = tokenizer.encode(\n", + " sent, # Sentence to encode.\n", + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", + "\n", + " # This function also supports truncation and conversion\n", + " # to pytorch tensors, but I need to do padding, so I\n", + " # can't use these features.\n", + " #max_length = 128, # Truncate all sentences.\n", + " #return_tensors = 'pt', # Return pytorch tensors.\n", + " )\n", + " \n", + " # Add the encoded sentence to the list.\n", + " input_ids.append(encoded_sent)\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum 
sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W9EWv5JvjGH3", + "outputId": "9072122d-3586-40fe-9d75-5b6e9035d6d2" + }, + "source": [ + "print('Max sentence length: ', max([len(sen) for sen in input_ids])) " + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Max sentence length: 3462\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xh1TQJyvjOx5" + }, + "source": [ + "max_len = 180\n", + "padded = []\n", + "for i in input_ids:\n", + "\n", + " if len(i) > max_len:\n", + " padded.extend([i[:max_len]])\n", + " else:\n", + " padded.extend([i + [0] * (max_len - len(i))])\n", + "\n", + "\n", + "padded = input_ids = np.array(padded)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZiwY6gn0jUkD" + }, + "source": [ + " # Create attention masks\n", + "attention_masks = []\n", + "\n", + "# For each sentence...\n", + "for sent in padded:\n", + " \n", + " # Create the attention mask.\n", + " # - If a token ID is 0, then it's padding, set the mask to 0.\n", + " # - If a token ID is > 0, then it's a real token, set the mask to 1.\n", + " att_mask = [int(token_id > 0) for token_id in sent]\n", + " \n", + " # Store the attention mask for this sentence.\n", + " attention_masks.append(att_mask)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oBTR5AfAjXJe" + }, + "source": [ + "# Use 90% for training and 10% for validation.\n", + "train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, \n", + " random_state=2018, test_size=0.1, stratify = labels )\n", + "# Do the same for the masks.\n", + "train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,\n", + " random_state=2018, test_size=0.1, stratify = labels)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "b9Mw5kq3jhTb" + }, + "source": [ + "# Convert all inputs and labels into torch tensors, the required datatype \n", + "# for my model.\n", + "train_inputs = torch.tensor(train_inputs)\n", + "validation_inputs = torch.tensor(validation_inputs)\n", + "\n", + "train_labels = torch.tensor(train_labels)\n", + "validation_labels = torch.tensor(validation_labels)\n", + "\n", + "train_masks = torch.tensor(train_masks)\n", + "validation_masks = torch.tensor(validation_masks)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "UfFWzbENjnkw" + }, + "source": [ + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", + "\n", + "# The DataLoader needs to know the batch size for training, so I specify it here.\n", + "# For fine-tuning BERT on a specific task, the authors recommend a batch size of\n", + "# 16 or 32.\n", + "\n", + "batch_size = 32\n", + "\n", + "# Create the DataLoader for training set.\n", + "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", + "train_sampler = RandomSampler(train_data)\n", + "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n", + "\n", + "# Create the DataLoader for validation set.\n", + "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n", + 
"validation_sampler = SequentialSampler(validation_data)\n", + "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x45JNGqhkUn2" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 463 + }, + "id": "C7M2Er1ajsTf", + "outputId": "fe4c13b7-5157-49b4-e878-6d7676d4d1a3" + }, + "source": [ + "from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification\n", + "\n", + "# Load BertForSequenceClassification, the pretrained BERT model with a single \n", + "# linear classification layer on top.\n", + "\n", + "model = BertForSequenceClassification.from_pretrained(\n", + " model_bert, # Use the 12-layer BERT model, with an uncased vocab.\n", + " num_labels = numberOfClasses, # The number of output labels--2 for binary classification.\n", + " # You can increase this for multi-class tasks. \n", + " output_attentions = False, # Whether the model returns attentions weights.\n", + " output_hidden_states = False, # Whether the model returns all hidden-states.\n", + ")\n", + "\n", + "# Tell pytorch to run this model on the GPU.\n", + "model.cuda()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']\n", + "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "output_type": "error", + "ename": "RuntimeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-120-80c23ac5f353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Tell pytorch to run this model on the GPU.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mcuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 636\u001b[0m \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0;31m# `with 
torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 553\u001b[0m \u001b[0mshould_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshould_use_set_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 636\u001b[0m \"\"\"\n\u001b[0;32m--> 637\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mxpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. 
Tried to allocate 352.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 91.81 MiB free; 10.63 GiB reserved in total by PyTorch)" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xd_cG-8pj4Iw" + }, + "source": [ + "#Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n", + "# I believe the 'W' stands for 'Weight Decay fix\"\n", + "optimizer = AdamW(model.parameters(),\n", + " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", + " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n", + " )" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "65G-uHuLj4_6" + }, + "source": [ + "from transformers import get_linear_schedule_with_warmup\n", + "\n", + "# Number of training epochs (authors recommend between 2 and 4)\n", + "epochs = 4\n", + "\n", + "# Total number of training steps is number of batches * number of epochs.\n", + "total_steps = len(train_dataloader) * epochs\n", + "\n", + "# Create the learning rate scheduler.\n", + "scheduler = get_linear_schedule_with_warmup(optimizer, \n", + " num_warmup_steps = 0, # Default value in run_glue.py\n", + " num_training_steps = total_steps)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lHSOuwcMj9jf" + }, + "source": [ + "import numpy as np\n", + "\n", + "# Function to calculate the accuracy of our predictions vs labels\n", + "def flat_accuracy(preds, labels):\n", + " pred_flat = np.argmax(preds, axis=1).flatten()\n", + " labels_flat = labels.flatten()\n", + " return np.sum(pred_flat == labels_flat) / len(labels_flat) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z0S3br-7kASm" + }, + "source": [ + "import time\n", + "import datetime\n", + "\n", + "def format_time(elapsed):\n", + " '''\n", + " Takes a time in seconds and returns a string hh:mm:ss\n", + " '''\n", + " # Round to the nearest second.\n", + " elapsed_rounded = int(round((elapsed)))\n", + " \n", + " # Format as hh:mm:ss\n", + " return str(datetime.timedelta(seconds=elapsed_rounded))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SbHBbYpwkKaA" + }, + "source": [ + "import random\n", + "\n", + "# This training code is based on the `run_glue.py` script here:\n", + "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", + "\n", + "\n", + "# Set the seed value all over the place to make this reproducible.\n", + "seed_val = 42\n", + "\n", + "random.seed(seed_val)\n", + "np.random.seed(seed_val)\n", + "torch.manual_seed(seed_val)\n", + "torch.cuda.manual_seed_all(seed_val)\n", + "\n", + "# Store the average loss after each epoch so I can plot them.\n", + "loss_values = []\n", + "\n", + "# For each epoch...\n", + "for epoch_i in range(0, epochs):\n", + " \n", + " # ========================================\n", + " # Training\n", + " # ========================================\n", + " \n", + " # Perform one full pass over the training set.\n", + "\n", + " print(\"\")\n", + " print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", + " print('Training...')\n", + "\n", + " # Measure how long the training epoch takes.\n", + " t0 = time.time()\n", + "\n", + " # Reset the total loss for this epoch.\n", + " total_loss = 0\n", + "\n", + " # Put the model into training mode.\n", + " model.train()\n", + "\n", + 
" # For each batch of training data...\n", + " for step, batch in enumerate(train_dataloader):\n", + "\n", + " # Progress update every 40 batches.\n", + " if step % 40 == 0 and not step == 0:\n", + " # Calculate elapsed time in minutes.\n", + " elapsed = format_time(time.time() - t0)\n", + " \n", + " # Report progress.\n", + " print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", + "\n", + " # Unpack this training batch from the dataloader. \n", + " #\n", + " # As I unpack the batch, I'll also copy each tensor to the GPU using the \n", + " # `to` method.\n", + " #\n", + " # `batch` contains three pytorch tensors:\n", + " # [0]: input ids \n", + " # [1]: attention masks\n", + " # [2]: labels \n", + " b_input_ids = batch[0].to(device)\n", + " b_input_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + "\n", + " # Always clear any previously calculated gradients before performing a\n", + " # backward pass. PyTorch doesn't do this automatically because \n", + " # accumulating the gradients is \"convenient while training RNNs\". \n", + " # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n", + " model.zero_grad() \n", + "\n", + " # Perform a forward pass (evaluate the model on this training batch).\n", + " # This will return the loss (rather than the model output) because I\n", + " # have provided the `labels`.\n", + " # The documentation for this `model` function is here: \n", + " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", + " outputs = model(b_input_ids, \n", + " token_type_ids=None, \n", + " attention_mask=b_input_mask, \n", + " labels=b_labels)\n", + " \n", + " # The call to `model` always returns a tuple, so I need to pull the \n", + " # loss value out of the tuple.\n", + " loss = outputs[0]\n", + "\n", + " # Accumulate the training loss over all of the batches so that I can\n", + " # calculate the average loss at the end. 
`loss` is a Tensor containing a\n", + " # single value; the `.item()` function just returns the Python value \n", + " # from the tensor.\n", + " total_loss += loss.item()\n", + "\n", + " # Perform a backward pass to calculate the gradients.\n", + " loss.backward()\n", + "\n", + " # Clip the norm of the gradients to 1.0.\n", + " # This is to help prevent the \"exploding gradients\" problem.\n", + " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", + "\n", + " # Update parameters and take a step using the computed gradient.\n", + " # The optimizer dictates the \"update rule\"--how the parameters are\n", + " # modified based on their gradients, the learning rate, etc.\n", + " optimizer.step()\n", + "\n", + " # Update the learning rate.\n", + " scheduler.step()\n", + "\n", + " # Calculate the average loss over the training data.\n", + " avg_train_loss = total_loss / len(train_dataloader) \n", + " \n", + " # Store the loss value for plotting the learning curve.\n", + " loss_values.append(avg_train_loss)\n", + "\n", + " print(\"\")\n", + " print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n", + " print(\" Training epoch took: {:}\".format(format_time(time.time() - t0)))\n", + " \n", + " # ========================================\n", + " # Validation\n", + " # ========================================\n", + " # After the completion of each training epoch, measure the performance on\n", + " # the validation set.\n", + "\n", + " print(\"\")\n", + " print(\"Running Validation...\")\n", + "\n", + " t0 = time.time()\n", + "\n", + " # Put the model in evaluation mode--the dropout layers behave differently\n", + " # during evaluation.\n", + " model.eval()\n", + "\n", + " # Tracking variables \n", + " eval_loss, eval_accuracy = 0, 0\n", + " nb_eval_steps, nb_eval_examples = 0, 0\n", + "\n", + " # Evaluate data for one epoch\n", + " for batch in validation_dataloader:\n", + " \n", + " # Add batch to GPU\n", + " batch = tuple(t.to(device) for t in batch)\n", + " \n", + " # Unpack the inputs from dataloader\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " \n", + " # Telling the model not to compute or store gradients, saving memory and\n", + " # speeding up validation\n", + " with torch.no_grad(): \n", + "\n", + " # Forward pass, calculate logit predictions.\n", + " # This will return the logits rather than the loss because we have\n", + " # not provided labels.\n", + " # token_type_ids is the same as the \"segment ids\", which \n", + " # differentiates sentence 1 and 2 in 2-sentence tasks.\n", + " # The documentation for this `model` function is here: \n", + " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", + " outputs = model(b_input_ids, \n", + " token_type_ids=None, \n", + " attention_mask=b_input_mask)\n", + " \n", + " # Get the \"logits\" output by the model. 
The \"logits\" are the output\n", + " # values prior to applying an activation function like the softmax.\n", + " logits = outputs[0]\n", + "\n", + " # Move logits and labels to CPU\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + " \n", + " # Calculate the accuracy for this batch of test sentences.\n", + " tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n", + " \n", + " # Accumulate the total accuracy.\n", + " eval_accuracy += tmp_eval_accuracy\n", + "\n", + " # Track the number of batches\n", + " nb_eval_steps += 1\n", + "\n", + " # Report the final accuracy for this validation run.\n", + " print(\" Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n", + " print(\" Validation took: {:}\".format(format_time(time.time() - t0)))\n", + "\n", + "print(\"\")\n", + "print(\"Training complete!\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VJwyfmakkQyj" + }, + "source": [ + "## Test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VAyzmfhZCGZo" + }, + "source": [ + "sentences_test = test_x[columnText].values\n", + "labels_test = test_y.tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lZFXr_sdCJcb" + }, + "source": [ + "# Tokenize all of the sentences and map the tokens to thier word IDs.\n", + "input_ids_test = []\n", + "# For every sentence...\n", + "for sent in sentences_test:\n", + " # `encode` will:\n", + " # (1) Tokenize the sentence.\n", + " # (2) Prepend the `[CLS]` token to the start.\n", + " # (3) Append the `[SEP]` token to the end.\n", + " # (4) Map tokens to their IDs.\n", + " encoded_sent = tokenizer.encode(\n", + " sent, # Sentence to encode.\n", + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", + " )\n", + " \n", + " input_ids_test.append(encoded_sent)\n", + "\n", + "# Pad our input tokens\n", + "padded_test = []\n", + "for i in input_ids_test:\n", + "\n", + " if len(i) > max_len:\n", + " padded_test.extend([i[:max_len]])\n", + " else:\n", + " padded_test.extend([i + [0] * (max_len - len(i))])\n", + "input_ids_test = np.array(padded_test)\n", + "\n", + "# Create attention masks\n", + "attention_masks = []\n", + "\n", + "# Create a mask of 1s for each token followed by 0s for padding\n", + "for seq in input_ids_test:\n", + " seq_mask = [float(i>0) for i in seq]\n", + " attention_masks.append(seq_mask) \n", + "\n", + "# Convert to tensors.\n", + "prediction_inputs = torch.tensor(input_ids_test)\n", + "prediction_masks = torch.tensor(attention_masks)\n", + "prediction_labels = torch.tensor(labels_test)\n", + "\n", + "# Set the batch size. 
\n", + "batch_size = 32 \n", + "\n", + "# Create the DataLoader.\n", + "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n", + "prediction_sampler = SequentialSampler(prediction_data)\n", + "prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SUDcxi03Cmf-" + }, + "source": [ + "print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))\n", + "\n", + "# Put model in evaluation mode\n", + "model.eval()\n", + "\n", + "# Tracking variables \n", + "predictions_test , true_labels = [], []\n", + "\n", + "# Predict \n", + "for batch in prediction_dataloader:\n", + "# Add batch to GPU\n", + " batch = tuple(t.to(device) for t in batch)\n", + " \n", + " # Unpack the inputs from the dataloader\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " \n", + " # Telling the model not to compute or store gradients, saving memory and \n", + " # speeding up prediction\n", + " with torch.no_grad():\n", + " # Forward pass, calculate logit predictions\n", + " outputs = model(b_input_ids, token_type_ids=None, \n", + " attention_mask=b_input_mask)\n", + "\n", + " logits = outputs[0]\n", + " #print(logits)\n", + "\n", + " # Move logits and labels to CPU\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + " #print(logits)\n", + " \n", + " # Store predictions and true labels\n", + " predictions_test.append(logits)\n", + " true_labels.append(label_ids)\n", + "\n", + "print(' DONE.')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "c3i7szp3Cn5u" + }, + "source": [ + "from sklearn.metrics import *\n", + "\n", + "pred_labels = []\n", + "\n", + "# Evaluate each test batch using many matrics\n", + "print('Calculating the matrics for each batch...')\n", + "\n", + "for i in range(len(true_labels)):\n", + " \n", + " # The predictions for this batch are a 2-column ndarray (one column for \"0\" \n", + " # and one column for \"1\"). 
Pick the label with the highest value and turn this\n", + " # in to a list of 0s and 1s.\n", + " pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n", + " pred_labels.append(pred_labels_i)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0bU9-DsBCxSO" + }, + "source": [ + "pred_labels_ = [item for sublist in pred_labels for item in sublist]\n", + "true_labels_ = [item for sublist in true_labels for item in sublist]\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZUM_U2QlC4K5" + }, + "source": [ + "### Report & Evaluation" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "d5n84N0xCfcU" + }, + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import plot_confusion_matrix\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import classification_report\n", + "import seaborn as sns" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "v4hXk-KjC-nq" + }, + "source": [ + "report = classification_report( pred_labels_, true_labels_, output_dict = True)\n", + " \n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xETMy1L6DAa5" + }, + "source": [ + "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + "classesName = encoder.classes_" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dPjV_5g8DDQy" + }, + "source": [ + "precision = []\n", + "recall = []\n", + "f1 = []\n", + "support = []\n", + "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + "for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n", + "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + "TP = np.diag(cnf_matrix)\n", + "TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + "dff['className'] = classesName\n", + "dff['precision'] = precision\n", + "dff['recall'] = recall\n", + "dff['f1-score'] = f1\n", + "dff['support'] = support\n", + "dff['FP'] = FP\n", + "dff['FN'] = FN\n", + "dff['TP'] = TP\n", + "dff['TN'] = TN\n", + " \n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vslzi9bHDKcv" + }, + "source": [ + "print(weighted_avg)\n", + "print(accuracy)\n", + "print(dff)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 4151912781645bcb220d2367277686a3501640e0..b216c0954d76a3a78dd664f1ac1b15bdd27d203e 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,13 @@ In order to run the classifiers, use the following command : python experimentsClassicClassifiers.py <dataset_tsv_file> <content_column_name> <labels_column_name> <min_sample_per_class> <max_sample_per_class> +In order to run Classification with pre-trained models, use the following command : + + + cd experiments/ + + python bert_experiments.py <model_Name> <classifier> # 
Acknowledgment -The authors are grateful to the ASLAN project (ANR-10-LABX-0081) of the Université de Lyon, for its financial support within the French program "Investments for the Future" operated by the National Research Agency (ANR). \ No newline at end of file +The authors are grateful to the ASLAN project (ANR-10-LABX-0081) of the Université de Lyon, for its financial support within the French program "Investments for the Future" operated by the National Research Agency (ANR). diff --git a/bert_settings.conf b/bert_settings.conf new file mode 100644 index 0000000000000000000000000000000000000000..ccba612bd378d3084b8ec973861a9f6e63c41c15 --- /dev/null +++ b/bert_settings.conf @@ -0,0 +1,19 @@ +[general] +dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv +columnText = contentWithoutClass +columnClass = ensemble_domaine_enccre +minOfInstancePerClass = 200 +maxOfInstancePerClass = 1500 + + +[model] + +tokeniser = bert-base-multilingual-cased +#tokeniser = camembert-base +model = bert-base-multilingual-cased +#model = camembert-base +max_len_sequences = 256 +batch_size = 32 +epochs = 4 +pathModel = ' ' +modelName = ' ' diff --git a/evaluate_bertFineTuning.py b/evaluate_bertFineTuning.py new file mode 100644 index 0000000000000000000000000000000000000000..3c9b52bb3dac78506110b0d49716fe97e18e10bf --- /dev/null +++ b/evaluate_bertFineTuning.py @@ -0,0 +1,54 @@ +import matplotlib.pyplot as plt +from sklearn.metrics import plot_confusion_matrix +from sklearn.metrics import confusion_matrix +from sklearn.metrics import classification_report +import seaborn as sns + + + + + + + + + + +def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder): + report = classification_report( pred_labels_, true_labels_, output_dict = True) + + classes = [str(e) for e in encoder.transform(encoder.classes_)] + classesName = encoder.classes_ + + accuracy = report['accuracy'] + weighted_avg = report['weighted avg'] + + precision = [] + recall = [] + f1 = [] + support = [] + dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN']) + for c in classes: + precision.append(report[c]['precision']) + recall.append(report[c]['recall']) + f1.append(report[c]['f1-score']) + support.append(report[c]['support']) + + accuracy = report['accuracy'] + weighted_avg = report['weighted avg'] + cnf_matrix = confusion_matrix(true_labels_, pred_labels_) + FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) + FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) + TP = np.diag(cnf_matrix) + TN = cnf_matrix.sum() - (FP + FN + TP) + + dff['className'] = classesName + dff['precision'] = precision + dff['recall'] = recall + dff['f1-score'] = f1 + dff['support'] = support + dff['FP'] = FP + dff['FN'] = FN + dff['TP'] = TP + dff['TN'] = TN + + return dff, accuracy, weighted_avg diff --git a/experiments/bert_experiments.py b/experiments/bert_experiments.py new file mode 100644 index 0000000000000000000000000000000000000000..b4e4bcd52219cb92d830f7d03654e631e113906f --- /dev/null +++ b/experiments/bert_experiments.py @@ -0,0 +1,349 @@ +import pandas as pd +import numpy as np +import torch +import transformers as ppb +from sklearn.model_selection import train_test_split +from sklearn import preprocessing +import statistics +import os +import sys +import argparse +import configparser +from transformers import CamembertModel, CamembertTokenizer +from transformers import FlaubertModel, FlaubertTokenizer + + +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier 
+from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import SGDClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import GridSearchCV + + +import matplotlib.pyplot as plt +from sklearn.metrics import plot_confusion_matrix +from sklearn.metrics import confusion_matrix +from sklearn.metrics import classification_report +import seaborn as sns + + + + + + +def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave): + + #classifier, label_list, test_x, valid_y, title = "Confusion matrix"): + precision = [] + recall = [] + f1 = [] + support = [] + weighted_avg = None + accuracy = None + + df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN']) + report = classification_report( y_pred, valid_y, output_dict = True) + for c in classes: + precision.append(report[c]['precision']) + recall.append(report[c]['recall']) + f1.append(report[c]['f1-score']) + support.append(report[c]['support']) + + accuracy = report['accuracy'] + weighted_avg = report['weighted avg'] + cnf_matrix = confusion_matrix(valid_y, y_pred) + FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) + FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) + TP = np.diag(cnf_matrix) + TN = cnf_matrix.sum() - (FP + FN + TP) + + df['className'] = classesName + df['precision'] = precision + df['recall'] = recall + df['f1-score'] = f1 + df['support'] = support + df['FP'] = FP + df['FN'] = FN + df['TP'] = TP + df['TN'] = TN + #disp = plot_confusion_matrix(classifier, test_x, valid_y, + # display_labels= label_list, + # cmap=plt.cm.Blues, + # normalize=None) + #disp.ax_.set_title(title) + + #print(title) + #print(disp.confusion_matrix) + + #plt.show() + plt.rcParams["font.size"] = 3 + plot_confusion_matrix(clf, X_test, y_test) + plt.savefig(pathSave) + return df, accuracy, weighted_avg + + + +def create_dict(df, classColumnName): + return dict(df[classColumnName].value_counts()) + +def remove_weak_classes(df, classColumnName, threshold): + + dictOfClassInstances = create_dict(df,classColumnName) + + + dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold } + keys = [*dictionary] + df_tmp = df[~ df[classColumnName].isin(keys)] + #df = df[df[columnTarget] not in keys] + #df = df.merge(df_tmp, how = 'outer' ,indicator=True) + df = pd.concat([df,df_tmp]).drop_duplicates(keep=False) + return df + + +def split_class(df, columnProcessed): + i = 0 + new_df = pd.DataFrame(columns= df.columns) + for index, row in df.iterrows(): + #cls = re.split(';', row[columnProcessed]) + cls = filter(None, row[columnProcessed].split(';')) + cls = list(cls) + #cls = re.findall(r"[\w']+", row [columnProcessed]) + r = row + for categ in cls: + r[columnProcessed] = categ + #new_df.append(r, ignore_index = True) + new_df.loc[i] = r + i = i + 1 + + return new_df + + +def resample_classes(df, classColumnName, numberOfInstances): + # numberOfInstances first elements + #return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns]) + #random numberOfInstances elements + replace = False # with replacement + + fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:] + return df.groupby(classColumnName, as_index=False).apply(fn) + + +def select_classifier(argument): + + classifiers = { + + 'lr' :LogisticRegression(), + 'sgd' :SGDClassifier(), + 'svm' :SVC() , + 
'decisionTree' :DecisionTreeClassifier(), + 'rfc' :RandomForestClassifier(), + 'knn' : KNeighborsClassifier() + } + + param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']} + param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } + param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] } + param_grid_lr = { "penalty":['none',"l2"]} + param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]} + param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } + + grid_params = { + + 'lr': param_grid_lr, + 'sgd': param_grid_sgd , + 'svm': param_grid_svm, + 'decisionTree': param_grid_decisionTree, + 'rfc': param_grid_rfc , + 'knn': param_grid_knn, + + } + + return classifiers.get(argument), grid_params.get(argument) + + +if __name__ == "__main__": + + + + + + print('ok') + parser = argparse.ArgumentParser() + parser.add_argument("modelName", help="bert or distilBert or camembert or flaubert") + parser.add_argument("classifier", help="lr or knn or rfc or decisionTree or sgd or svm") + + + args = parser.parse_args() + arg = args.modelName + classifier = args.classifier + + config = configparser.ConfigParser() + config.read('parameters.conf') + + minOfInstancePerClass = int(config.get('general','minOfInstancePerClass')) + maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass')) + + dataPath = config.get('data','dataPath') + columnText = config.get('data','columnText') + columnClass = config.get('data','columnClass') + + + + if not os.path.exists('reports'): + os.makedirs('reports') + + if not os.path.exists(os.path.join('reports', columnClass)): + os.makedirs(os.path.join('reports', columnClass)) + + + dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) + if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)): + os.makedirs(os.path.join('reports', columnClass, dir_name_report)) + + + + # read data + print(dataPath) + df = pd.read_csv(dataPath) + df = remove_weak_classes(df, columnClass, minOfInstancePerClass) + df = resample_classes(df, columnClass, maxOfInstancePerClass) + + print(df.head()) + print(df.shape) + #encode labels + df = df[df[columnClass] != 'unclassified'] + y = df[columnClass] + encoder = preprocessing.LabelEncoder() + y = encoder.fit_transform(y) + + + sentences = df['firstParagraph'] + labels = y.tolist() + + + + # Features Extraction + #Bert + model_class_bert, tokenizer_class_bert, pretrained_weights_bert = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased') + tokenizer_bert = tokenizer_class_bert.from_pretrained(pretrained_weights_bert) + model_bert = model_class_bert.from_pretrained(pretrained_weights_bert) + #DistilBert + model_class_distilBert, tokenizer_class_distilBert, pretrained_weights_distilBert = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased') + tokenizer_distilBert = tokenizer_class_distilBert.from_pretrained(pretrained_weights_distilBert) + model_distilBert = model_class_distilBert.from_pretrained(pretrained_weights_distilBert) + #Camembert + camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base") + camembert = 
CamembertModel.from_pretrained("camembert/camembert-base") + #Flaubert + + flaubert, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased', output_loading_info=True) + flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False) + + + + models = { + 'bert': model_bert, + 'distilbert': model_distilBert , + 'camembert': camembert, + 'flaubert': flaubert + } + + tokenizers = { + 'bert': tokenizer_bert, + 'distilbert': tokenizer_distilBert , + 'camembert': camembert_tokenizer, + 'flaubert': flaubert_tokenizer + + } + + + + + + + if arg == 'flaubert': + model = flaubert + tokenizer = flaubert_tokenizer + elif arg == 'camembert': + model = camembert + tokenizer = camembert_tokenizer + + elif arg == 'distilbert': + model = model_distilBert + tokenizer = tokenizer_distilBert + + elif arg == 'bert': + model = model_bert + tokenizer = tokenizer_bert + + + + + + + tokenized = sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length = 512, truncation = True))) + + # padding the sequences + max_len = 0 + for i in tokenized.values: + if len(i) > max_len: + max_len = len(i) + + padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values]) + + + + # attention mask + + attention_mask = np.where(padded != 0, 1, 0) + + + + # get features + input_ids = torch.tensor(padded) + attention_mask = torch.tensor(attention_mask) + + with torch.no_grad(): + last_hidden_states = model(input_ids, attention_mask=attention_mask) + + features = last_hidden_states[0][:,0,:].numpy() + print(features.shape) + + train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y ) + + + # classification + + + clf, grid_param = select_classifier(classifier) + + print(features) + + + + clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3) + + clf.fit(train_x, train_y) + + #evaluation + + + y_pred = clf.predict(test_x) + + + report, accuracy, weighted_avg = evaluate_model(clf, test_x, test_y, y_pred, test_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, arg+ '_' + classifier+'.pdf')) + + report.to_csv(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier +'.csv')) + with open(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier+'.txt'), 'w') as f: + + sys.stdout = f # Change the standard output to the file we created. 
+        print('accuracy : {}'.format(accuracy))
+        print('weighted_Precision : {}'.format(weighted_avg['precision']))
+        print('weighted_Recall : {}'.format(weighted_avg['recall']))
+        print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
+        print('weighted_Support : {}'.format(weighted_avg['support']))
+        print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
+        #sys.stdout = sys.stdout # Reset the standard output to its original value
+        sys.stdout = sys.__stdout__
diff --git a/experiments/parameters.conf b/experiments/parameters.conf
new file mode 100644
index 0000000000000000000000000000000000000000..df584e4ab43603f86828c17c4c7cacaaaf6437ee
--- /dev/null
+++ b/experiments/parameters.conf
@@ -0,0 +1,10 @@
+[general]
+
+minOfInstancePerClass = 1200
+maxOfInstancePerClass = 7
+
+[data]
+
+dataPath = ../Data/dataframe_with_ensemble_domaine_enccre.csv
+columnText = contentWithoutClass
+columnClass = ensemble_domaine_enccre
diff --git a/experiments/requierements.txt b/experiments/requierements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..076fef87790fd62e4a4101a85d64ceff09083b9a
--- /dev/null
+++ b/experiments/requierements.txt
@@ -0,0 +1,7 @@
+transformers==4.3.2
+sentencepiece
+sklearn
+pandas
+numpy
+torch==1.8.1
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..8301acc2f929d750e0cea915a905a721ab8150fb
--- /dev/null
+++ b/main.py
@@ -0,0 +1,120 @@
+import os
+
+import pandas as pd
+import numpy as np
+import torch
+import configparser
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+
+from training_bertFineTuning import training_bertFineTuning
+from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
+from evaluate_bertFineTuning import evaluate_bertFineTuning
+
+
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+
+
+def remove_weak_classes(df, classColumnName, threshold):
+
+    dictOfClassInstances = create_dict(df,classColumnName)
+
+    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
+    keys = [*dictionary]
+    df_tmp = df[~ df[classColumnName].isin(keys)]
+    df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
+    return df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
+
+    #random numberOfInstances elements
+    replace = False # with replacement
+
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+def main():
+
+    config = configparser.ConfigParser()
+    config.read('bert_settings.conf')
+
+    dataPath = config.get('general','dataPath')
+    columnText = config.get('general','columnText')
+    columnClass = config.get('general','columnClass')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    chosen_tokeniser = config.get('model','tokeniser')
+    chosen_model = config.get('model','model')
+
+    max_len = int(config.get('model','max_len_sequences'))
+    batch_size = int(config.get('model','batch_size'))
+    epochs = int(config.get('model','epochs'))
+
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    df = df[df[columnClass] != 'unclassified']
+
+    y = df[columnClass]
+    numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+
+    # call the training method
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+
+    # save the model
+    # (imported here for brevity; these would normally sit at the top of the file)
+    import os
+    import torch
+    model_save_name = config.get('model','modelName')
+    path = config.get('model','path')
+    torch.save(model, os.path.join(path, model_save_name))
+
+    # print the model parameters
+    params = list(model.named_parameters())
+
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    print('==== Embedding Layer ====\n')
+
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    # call the prediction method on the held-out test split,
+    # so the evaluation below is done on unseen data
+    sentences_to_predict = test_x[columnText].values
+    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, test_y.tolist(), max_len, batch_size=32)
+    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
+
+    # call the evaluation method
+    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
+
+    print(result_df)
+    print(accuracy)
+    print(weighted_avg)
+
+
+
+if __name__ == "__main__":
+    main()
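+
+# For reference only: main() reads its settings from 'bert_settings.conf', which does
+# not appear in this patch. A minimal file that satisfies the config.get() calls above
+# could look like the sketch below (section and key names come from the code; the
+# values are only illustrative):
+#
+#   [general]
+#   dataPath = ../Data/dataframe_with_ensemble_domaine_enccre.csv
+#   columnText = contentWithoutClass
+#   columnClass = ensemble_domaine_enccre
+#   minOfInstancePerClass = 200
+#   maxOfInstancePerClass = 1500
+#
+#   [model]
+#   model = camembert-base
+#   tokeniser = camembert-base
+#   max_len_sequences = 256
+#   batch_size = 32
+#   epochs = 4
+#   modelName = camembert_enccre.pt
+#   path = ./models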
diff --git a/predict_bertFineTuning.py b/predict_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..4276122d0b88159c6631fe2dd2db9d14603558c3
--- /dev/null
+++ b/predict_bertFineTuning.py
@@ -0,0 +1,168 @@
+import torch
+import pandas as pd
+import numpy as np
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertTokenizer, CamembertTokenizer
+
+
+def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size=32):
+
+    if chosen_model == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids_test = []
+    # For every sentence...
+    for sent in sentences_to_predict:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+            sent,                       # Sentence to encode.
+            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
+        )
+        input_ids_test.append(encoded_sent)
+
+    # Pad (or truncate) our input tokens to max_len.
+    padded_test = []
+    for i in input_ids_test:
+        if len(i) > max_len:
+            padded_test.extend([i[:max_len]])
+        else:
+            padded_test.extend([i + [0] * (max_len - len(i))])
+    input_ids_test = np.array(padded_test)
+
+    # Create attention masks: 1 for each real token, 0 for padding.
+    attention_masks = []
+    for seq in input_ids_test:
+        seq_mask = [float(i > 0) for i in seq]
+        attention_masks.append(seq_mask)
+
+    # Convert to tensors.
+    prediction_inputs = torch.tensor(input_ids_test)
+    prediction_masks = torch.tensor(attention_masks)
+    prediction_labels = torch.tensor(labels)
+
+    # Create the DataLoader (sequential, so predictions stay aligned with the labels).
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+    return prediction_dataloader
+
+
+def predict_class_bertFineTuning(model, prediction_dataloader):
+
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+    # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
+
+    # Put model in evaluation mode
+    model.eval()
+
+    # Tracking variables
+    predictions_test, true_labels = [], []
+
+    # Predict
+    for batch in prediction_dataloader:
+        # Add batch to the selected device
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from the dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        logits = outputs[0]
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Store predictions and true labels
+        predictions_test.append(logits)
+        true_labels.append(label_ids)
+
+    print(' DONE.')
+
+    pred_labels = []
+    for i in range(len(true_labels)):
+        # The predictions for this batch are an ndarray with one column per class.
+        # Pick the label with the highest score for each sample.
+        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
+        pred_labels.append(pred_labels_i)
+
+    # Flatten the per-batch lists into flat lists of label ids.
+    pred_labels_ = [item for sublist in pred_labels for item in sublist]
+    true_labels_ = [item for sublist in true_labels for item in sublist]
+    return pred_labels_, true_labels_
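+
+# Typical usage (mirrors main.py; the variable names are placeholders):
+#
+#   prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict,
+#                                                          test_labels, max_len, batch_size=32)
+#   predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
+#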
+
+def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
+
+    if chosen_model == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+
+    # Run on whatever device the model already lives on.
+    device = next(model.parameters()).device
+
+    predictions = []
+    # Tokenize each sentence, map the tokens to their word IDs and predict one
+    # sentence at a time (a batch of one needs no padding).
+    for sent in sentences_to_predict:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+            sent,                       # Sentence to encode.
+            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
+        )
+
+        input_ids = torch.tensor([encoded_sent]).to(device)
+        attention_mask = torch.ones_like(input_ids)
+
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(input_ids, token_type_ids=None,
+                            attention_mask=attention_mask)
+
+        logits = outputs[0]
+        # Keep the highest-scoring class id for this sentence.
+        predictions.append(int(np.argmax(logits.detach().cpu().numpy(), axis=1)[0]))
+
+    return predictions
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..285be2d9a72d13d6cc693ad2a1c373b571e5fe86
--- /dev/null
+++ b/training_bertFineTuning.py
@@ -0,0 +1,400 @@
+import torch
+import pandas as pd
+import numpy as np
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertTokenizer, CamembertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
+from transformers import get_linear_schedule_with_warmup
+import time
+import datetime
+import random
+import os
+
+
+
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+
+
+def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs=4):
+
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+    # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
+
+
+############################################################################################################
+########################## Model: Tokenization & Input Formatting #########################################
+############################################################################################################
+
+    if chosen_model == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids = []
+
+    # For every sentence...
+    for sent in sentences:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+            sent,                       # Sentence to encode.
+            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
+
+            # This function also supports truncation (`max_length=max_len, truncation=True`)
+            # and conversion to pytorch tensors, but padding is done manually below,
+            # so those options are not used here.
+            #max_length = 128,          # Truncate all sentences.
+            #return_tensors = 'pt',     # Return pytorch tensors.
+        )
+
+        # Add the encoded sentence to the list.
+ input_ids.append(encoded_sent) + + + + + padded = [] + for i in input_ids: + + if len(i) > max_len: + padded.extend([i[:max_len]]) + else: + padded.extend([i + [0] * (max_len - len(i))]) + + + padded = np.array(padded) + + + + # Create attention masks + attention_masks = [] + + # For each sentence... + for sent in padded: + + # Create the attention mask. + # - If a token ID is 0, then it's padding, set the mask to 0. + # - If a token ID is > 0, then it's a real token, set the mask to 1. + att_mask = [int(token_id > 0) for token_id in sent] + + # Store the attention mask for this sentence. + attention_masks.append(att_mask) + + + # Use 90% for training and 10% for validation. + train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels ) + # Do the same for the masks. + train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels) + + + # Convert all inputs and labels into torch tensors, the required datatype + # for my model. + train_inputs = torch.tensor(train_inputs) + validation_inputs = torch.tensor(validation_inputs) + + train_labels = torch.tensor(train_labels) + validation_labels = torch.tensor(validation_labels) + + train_masks = torch.tensor(train_masks) + validation_masks = torch.tensor(validation_masks) + + + + + # The DataLoader needs to know the batch size for training, so I specify it here. + # For fine-tuning BERT on a specific task, the authors recommend a batch size of + # 16 or 32. + + + # Create the DataLoader for training set. + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) + + # Create the DataLoader for validation set. + validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) + validation_sampler = SequentialSampler(validation_data) + validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) + + + + + + print(' Selecting a model .....') + + numberOfClasses = len(set(labels)) + + + # Load BertForSequenceClassification, the pretrained BERT model with a single + # linear classification layer on top. + if chosen_model == 'bert-base-multilingual-cased': + model = BertForSequenceClassification.from_pretrained( + chosen_model, # Use the 12-layer BERT model, with an uncased vocab. + num_labels = numberOfClasses, # The number of output labels--2 for binary classification. + # You can increase this for multi-class tasks. + output_attentions = False, # Whether the model returns attentions weights. + output_hidden_states = False, # Whether the model returns all hidden-states. + ) + elif chosen_model == 'camembert-base': + + model = CamembertForSequenceClassification.from_pretrained( + chosen_model, # Use the 12-layer BERT model, with an uncased vocab. + num_labels = numberOfClasses, # The number of output labels--2 for binary classification. + # You can increase this for multi-class tasks. + output_attentions = False, # Whether the model returns attentions weights. + output_hidden_states = False, # Whether the model returns all hidden-states. + ) + + + # Tell pytorch to run this model on the GPU. 
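+ # Note: `model.cuda()` below assumes a GPU is available; a device-agnostic
+ # alternative, matching the `device` selected at the top of this function,
+ # would be `model.to(device)`.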
+ model.cuda() + + + #Note: AdamW is a class from the huggingface library (as opposed to pytorch) + # I believe the 'W' stands for 'Weight Decay fix" + optimizer = AdamW(model.parameters(), + lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 + eps = 1e-8 # args.adam_epsilon - default is 1e-8. + ) + + + + + # Total number of training steps is number of batches * number of epochs. + total_steps = len(train_dataloader) * epochs + + # Create the learning rate scheduler. + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, # Default value in run_glue.py + num_training_steps = total_steps) + + + + + # This training code is based on the `run_glue.py` script here: + # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 + + + # Set the seed value all over the place to make this reproducible. + seed_val = 42 + + random.seed(seed_val) + np.random.seed(seed_val) + torch.manual_seed(seed_val) + torch.cuda.manual_seed_all(seed_val) + + # Store the average loss after each epoch so I can plot them. + loss_values = [] + + # For each epoch... + for epoch_i in range(0, epochs): + + # ======================================== + # Training + # ======================================== + + # Perform one full pass over the training set. + + print("") + print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) + print('Training...') + + # Measure how long the training epoch takes. + t0 = time.time() + + # Reset the total loss for this epoch. + total_loss = 0 + + # Put the model into training mode. + model.train() + + # For each batch of training data... + for step, batch in enumerate(train_dataloader): + + # Progress update every 40 batches. + if step % 40 == 0 and not step == 0: + # Calculate elapsed time in minutes. + elapsed = format_time(time.time() - t0) + + # Report progress. + print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) + + # Unpack this training batch from the dataloader. + # + # As I unpack the batch, I'll also copy each tensor to the GPU using the + # `to` method. + # + # `batch` contains three pytorch tensors: + # [0]: input ids + # [1]: attention masks + # [2]: labels + b_input_ids = batch[0].to(device) + b_input_mask = batch[1].to(device) + b_labels = batch[2].to(device) + + # Always clear any previously calculated gradients before performing a + # backward pass. PyTorch doesn't do this automatically because + # accumulating the gradients is "convenient while training RNNs". + # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) + model.zero_grad() + + # Perform a forward pass (evaluate the model on this training batch). + # This will return the loss (rather than the model output) because I + # have provided the `labels`. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels) + + # The call to `model` always returns a tuple, so I need to pull the + # loss value out of the tuple. + loss = outputs[0] + + # Accumulate the training loss over all of the batches so that I can + # calculate the average loss at the end. `loss` is a Tensor containing a + # single value; the `.item()` function just returns the Python value + # from the tensor. 
+ total_loss += loss.item() + + # Perform a backward pass to calculate the gradients. + loss.backward() + + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + + # Update parameters and take a step using the computed gradient. + # The optimizer dictates the "update rule"--how the parameters are + # modified based on their gradients, the learning rate, etc. + optimizer.step() + + # Update the learning rate. + scheduler.step() + + # Calculate the average loss over the training data. + avg_train_loss = total_loss / len(train_dataloader) + + # Store the loss value for plotting the learning curve. + loss_values.append(avg_train_loss) + + print("") + print(" Average training loss: {0:.2f}".format(avg_train_loss)) + print(" Training epoch took: {:}".format(format_time(time.time() - t0))) + + # ======================================== + # Validation + # ======================================== + # After the completion of each training epoch, measure the performance on + # the validation set. + + print("") + print("Running Validation...") + + t0 = time.time() + + # Put the model in evaluation mode--the dropout layers behave differently + # during evaluation. + model.eval() + + # Tracking variables + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + + # Evaluate data for one epoch + for batch in validation_dataloader: + + # Add batch to GPU + batch = tuple(t.to(device) for t in batch) + + # Unpack the inputs from dataloader + b_input_ids, b_input_mask, b_labels = batch + + # Telling the model not to compute or store gradients, saving memory and + # speeding up validation + with torch.no_grad(): + + # Forward pass, calculate logit predictions. + # This will return the logits rather than the loss because we have + # not provided labels. + # token_type_ids is the same as the "segment ids", which + # differentiates sentence 1 and 2 in 2-sentence tasks. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask) + + # Get the "logits" output by the model. The "logits" are the output + # values prior to applying an activation function like the softmax. + logits = outputs[0] + + # Move logits and labels to CPU + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + # Calculate the accuracy for this batch of test sentences. + tmp_eval_accuracy = flat_accuracy(logits, label_ids) + + # Accumulate the total accuracy. + eval_accuracy += tmp_eval_accuracy + + # Track the number of batches + nb_eval_steps += 1 + + # Report the final accuracy for this validation run. + print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) + print(" Validation took: {:}".format(format_time(time.time() - t0))) + + print("") + print("Training complete!") + return model + + +'''print('Saving Model....') +model_save_name = config.get('model','modelName') +path = config.get('model','path') +#torch.save(model.state_dict(), os.path.join(path,model_save_name)) +torch.save(model, os.path.join(path,model_save_name))'''
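+
+
+# A small helper sketching what the commented-out block above intends, but with the
+# destination passed in explicitly instead of read from a `config` object that does
+# not exist in this module (the function and argument names below are illustrative):
+def save_finetuned_model(model, path, model_save_name, state_dict_only=False):
+    # Create the target directory if needed, then save either the full model
+    # object or just its weights.
+    os.makedirs(path, exist_ok=True)
+    target = os.path.join(path, model_save_name)
+    if state_dict_only:
+        torch.save(model.state_dict(), target)
+    else:
+        torch.save(model, target)
+    print('Model saved to {}'.format(target))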