{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "EDdA-Classification_CNN_Conv2D.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "0yFsoHXX8Iyy" }, "source": [ "# Deep learning for EDdA classification" ] }, { "cell_type": "markdown", "metadata": { "id": "tFlUCDL2778i" }, "source": [ "## Setup colab environment" ] }, { "cell_type": "code", "metadata": { "id": "Sp8d_Uus7SHJ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2791f79e-a849-4b95-92c5-72150ea4d6e2" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "jQBu-p6hBU-j" }, "source": [ "### Install packages" ] }, { "cell_type": "code", "metadata": { "id": "bTIXsF6kBUdh" }, "source": [ "#!pip install zeugma\n", "#!pip install plot_model" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "56-04SNF8BMx" }, "source": [ "### Import librairies" ] }, { "cell_type": "code", "metadata": { "id": "HwWkSznz7SEv" }, "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pickle\n", "import os\n", "\n", "from tqdm import tqdm\n", "import requests, zipfile, io\n", "import codecs\n", "\n", "from sklearn import preprocessing # LabelEncoder\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "\n", "from keras.preprocessing import sequence\n", "from keras.preprocessing.text import Tokenizer\n", "\n", "from keras.layers import BatchNormalization, Input, Reshape, Conv2D, MaxPool2D, Concatenate\n", "from keras.layers import Embedding, Dropout, Flatten, Dense\n", "from keras.models import Model, load_model\n", "from keras.callbacks import ModelCheckpoint\n" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "xrekV6W978l4" }, "source": [ "### Utils functions" ] }, { "cell_type": "code", "metadata": { "id": "4LJ5blQR7PUe" }, "source": [ "\n", "def resample_classes(df, classColumnName, numberOfInstances):\n", " #random numberOfInstances elements\n", " replace = False # with replacement\n", " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", " return df.groupby(classColumnName, as_index=False).apply(fn)\n", " \n" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-Rh3JMDh7zYd" }, "source": [ "" ], "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "MtLr35eM753e" }, "source": [ "## Load Data" ] }, { "cell_type": "code", "metadata": { "id": "FnbNT4NF7zal", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ca2c2751-6ac2-4ec8-ec1d-03927e9bd358" }, "source": [ "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2022-02-18 07:27:50-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 
134.214.142.28\n", "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 175634219 (167M) [text/tab-separated-values]\n", "Saving to: ‘training_set.tsv’\n", "\n", "training_set.tsv 100%[===================>] 167.50M 23.4MB/s in 7.7s \n", "\n", "2022-02-18 07:27:58 (21.8 MB/s) - ‘training_set.tsv’ saved [175634219/175634219]\n", "\n", "--2022-02-18 07:27:58-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv\n", "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 42730598 (41M) [text/tab-separated-values]\n", "Saving to: ‘test_set.tsv’\n", "\n", "test_set.tsv 100%[===================>] 40.75M 17.1MB/s in 2.4s \n", "\n", "2022-02-18 07:28:01 (17.1 MB/s) - ‘test_set.tsv’ saved [42730598/42730598]\n", "\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Loading dataset" ], "metadata": { "id": "UHushJ1XfUj9" } }, { "cell_type": "code", "source": [ "train_path = 'training_set.tsv'\n", "test_path = 'test_set.tsv'" ], "metadata": { "id": "Q4te2c0bfvaJ" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "nRLaQUO97zcq" }, "source": [ "df_train = pd.read_csv(train_path, sep=\"\\t\")\n" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "df_train.sample(5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "2MvHEc7zVK1N", "outputId": "e5cfc8df-4ed9-4dd4-ce64-3350a8f9f6bc" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", " <div id=\"df-3f5f81f9-5804-4f8c-8424-4b90ee905792\">\n", " <div class=\"colab-df-container\">\n", " <div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>volume</th>\n", " <th>numero</th>\n", " <th>head</th>\n", " <th>normClass</th>\n", " <th>classEDdA</th>\n", " <th>author</th>\n", " <th>id_enccre</th>\n", " <th>domaine_enccre</th>\n", " <th>ensemble_domaine_enccre</th>\n", " <th>content</th>\n", " <th>contentWithoutClass</th>\n", " <th>firstParagraph</th>\n", " <th>nb_words</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>5965</th>\n", " <td>9</td>\n", " <td>712</td>\n", " <td>KNAWEL</td>\n", " <td>Botanique</td>\n", " <td>Botan.</td>\n", " <td>Jaucourt</td>\n", " <td>v9-452-0</td>\n", " <td>botanique</td>\n", " <td>Histoire naturelle</td>\n", " <td>KNAWEL, (Botan.) genre de plante ainsi nommée ...</td>\n", " <td>knawel genre plante nommée \\n gérard ray par...</td>\n", " <td>knawel genre plante nommée \\n gérard ray par...</td>\n", " <td>169</td>\n", " </tr>\n", " <tr>\n", " <th>21406</th>\n", " <td>4</td>\n", " <td>3605</td>\n", " <td>DECRETE</td>\n", " <td>Jurisprudence</td>\n", " <td>Jurispr.</td>\n", " <td>Boucher d'Argis</td>\n", " <td>v4-1826-0</td>\n", " <td>jurisprudence</td>\n", " <td>Droit - Jurisprudence</td>\n", " <td>DECRETE, adj. (Jurispr.) 
se dit communément\nd...</td>\n", "      <td>decrete adj communément \\n contre a ordonné ...</td>\n", "      <td>decrete adj communément \\n contre a ordonné ...</td>\n", "      <td>80</td>\n", "    </tr>\n", "    <tr>\n", "      <th>46481</th>\n", "      <td>12</td>\n", "      <td>2389</td>\n", "      <td>Piece nette</td>\n", "      <td>Artillerie</td>\n", "      <td>Artillerie.</td>\n", "      <td>Jaucourt</td>\n", "      <td>v12-1440-16</td>\n", "      <td>artillerie</td>\n", "      <td>Militaire (Art) - Guerre - Arme</td>\n", "      <td>Piece nette, (Artillerie.) on appelle pieces n...</td>\n", "      <td>piece nette appelle pieces nestes pieces art...</td>\n", "      <td>piece nette appelle pieces nestes pieces art...</td>\n", "      <td>68</td>\n", "    </tr>\n", "    <tr>\n", "      <th>32540</th>\n", "      <td>7</td>\n", "      <td>1375</td>\n", "      <td>Gale</td>\n", "      <td>Manège | Maréchallerie</td>\n", "      <td>Manége & Maréchallerie.</td>\n", "      <td>Bourgelat</td>\n", "      <td>v7-622-1</td>\n", "      <td>manège</td>\n", "      <td>Maréchage - Manège</td>\n", "      <td>Gale, (Manége & Maréchallerie.) maladie prurig...</td>\n", "      <td>gale maladie prurigineuse \\n cutanée manifes...</td>\n", "      <td>gale maladie prurigineuse \\n cutanée manifes...</td>\n", "      <td>3052</td>\n", "    </tr>\n", "    <tr>\n", "      <th>27748</th>\n", "      <td>13</td>\n", "      <td>4039</td>\n", "      <td>Récit historique</td>\n", "      <td>Histoire</td>\n", "      <td>Histoire.</td>\n", "      <td>unsigned</td>\n", "      <td>v13-2396-2</td>\n", "      <td>histoire</td>\n", "      <td>Histoire</td>\n", "      <td>Récit historique, (Histoire.) le récit histori...</td>\n", "      <td>récit historique récit historique \\n exposé ...</td>\n", "      <td>récit historique récit historique \\n exposé ...</td>\n", "      <td>122</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>\n", "    </div>\n", "  </div>\n", "      " ], "text/plain": [ " volume ... nb_words\n", "5965 9 ... 169\n", "21406 4 ... 80\n", "46481 12 ... 68\n", "32540 7 ... 3052\n", "27748 13 ... 122\n", "\n", "[5 rows x 13 columns]" ] }, "metadata": {}, "execution_count": 7 } ] },
{ "cell_type": "markdown", "source": [ "## Configuration\n" ], "metadata": { "id": "-63bh_cKfN4p" } }, { "cell_type": "code", "source": [ "columnText = 'contentWithoutClass'\n", "columnClass = 'ensemble_domaine_enccre'\n", "\n", "maxOfInstancePerClass = 1500\n", "\n", "batch_size = 64\n", "validation_split = 0.20\n", "max_nb_words = 20000 # vocabulary size\n", "max_sequence_length = 512 # maximum document length, in tokens\n", "epochs = 10\n", "\n", "#embedding_name = \"fasttext\" \n", "#embedding_dim = 300 \n", "\n", "embedding_name = \"glove.6B.100d\"\n", "embedding_dim = 100 \n", "\n", "path = \"drive/MyDrive/Classification-EDdA/\"\n", "\n", "encoder_filename = \"label_encoder.pkl\"\n", "tokenizer_filename = \"tokenizer_keras.pkl\"" ], "metadata": { "id": "nsRuyzYUfOBg" }, "execution_count": 8, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Preprocessing\n" ], "metadata": { "id": "ZDz-Y1LCfQt0" } }, { "cell_type": "code", "source": [ "if maxOfInstancePerClass != 10000:\n", " df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)" ], "metadata": { "id": "4r41Z6T_yNND" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vGWAgBH87ze8" }, "source": [ "labels = df_train[columnClass]\n", "numberOfClasses = labels.nunique()\n", "\n", "if os.path.isfile(path+encoder_filename): \n", " # load existing encoder \n", " with open(path+encoder_filename, 'rb') as file:\n", " encoder = pickle.load(file)\n", "\n", "else:\n", " encoder = preprocessing.LabelEncoder()\n", " encoder.fit(labels)\n", "\n", " with open(path+encoder_filename, 'wb') as file:\n", " pickle.dump(encoder, file)\n", "\n", "\n", "labels = encoder.transform(labels)" ], "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "encoder.classes_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SME4vvVhW9Sn", "outputId": "5774cc43-ef3f-4874-bbef-c68f28ca9880" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Agriculture - Economie rustique', 'Anatomie', 'Antiquité',\n", " 'Architecture', 'Arts et métiers', 'Beaux-arts',\n", " 'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',\n", " 'Chimie', 'Commerce', 'Droit - 
Jurisprudence',\n", " 'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',\n", " 'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',\n", " 'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',\n", " 'Minéralogie', 'Monnaie', 'Musique', 'Médailles',\n", " 'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',\n", " 'Physique - [Sciences physico-mathématiques]', 'Politique',\n", " 'Pêche', 'Religion', 'Spectacle', 'Superstition'], dtype=object)" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "labels_index = dict(zip(list(encoder.classes_), encoder.transform(list(encoder.classes_))))" ], "metadata": { "id": "nIzWQ2VbW_UO" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "labels_index" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4e7ggEGiXC_W", "outputId": "77c1fdd0-1c20-479f-eb47-9473484b424e" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'Agriculture - Economie rustique': 0,\n", " 'Anatomie': 1,\n", " 'Antiquité': 2,\n", " 'Architecture': 3,\n", " 'Arts et métiers': 4,\n", " 'Beaux-arts': 5,\n", " 'Belles-lettres - Poésie': 6,\n", " 'Blason': 7,\n", " 'Caractères': 8,\n", " 'Chasse': 9,\n", " 'Chimie': 10,\n", " 'Commerce': 11,\n", " 'Droit - Jurisprudence': 12,\n", " 'Economie domestique': 13,\n", " 'Grammaire': 14,\n", " 'Géographie': 15,\n", " 'Histoire': 16,\n", " 'Histoire naturelle': 17,\n", " 'Jeu': 18,\n", " 'Marine': 19,\n", " 'Maréchage - Manège': 20,\n", " 'Mathématiques': 21,\n", " 'Mesure': 22,\n", " 'Militaire (Art) - Guerre - Arme': 23,\n", " 'Minéralogie': 24,\n", " 'Monnaie': 25,\n", " 'Musique': 26,\n", " 'Médailles': 27,\n", " 'Médecine - Chirurgie': 28,\n", " 'Métiers': 29,\n", " 'Pharmacie': 30,\n", " 'Philosophie': 31,\n", " 'Physique - [Sciences physico-mathématiques]': 32,\n", " 'Politique': 33,\n", " 'Pêche': 34,\n", " 'Religion': 35,\n", " 'Spectacle': 36,\n", " 'Superstition': 37}" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "### Loading pre-trained embeddings\n", "\n", "#### FastText" ], "metadata": { "id": "Xo47i2WdIP7M" } }, { "cell_type": "code", "source": [ "# download FastText (too large to keep on the Drive)\n", "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n", "r = requests.get(zip_file_url)\n", "z = zipfile.ZipFile(io.BytesIO(r.content))\n", "z.extractall()" ], "metadata": { "id": "1yLSez4GIPu9" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print('loading word embeddings FastText...')\n", "\n", "embeddings_index = {}\n", "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", "\n", "for line in tqdm(f):\n", " values = line.rstrip().rsplit(' ')\n", " word = values[0]\n", " coefs = np.asarray(values[1:], dtype='float32')\n", " embeddings_index[word] = coefs\n", "f.close()\n", "\n", "print('found %s word vectors' % len(embeddings_index))" ], "metadata": { "id": "3HqbPxx6IPsV" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "#### GloVe" ], "metadata": { "id": "y359qxKWIPam" } }, { "cell_type": "code", "source": [ "# download GloVe\n", "#zip_file_url = \"https://nlp.stanford.edu/data/glove.6B.zip\"\n", "#r = requests.get(zip_file_url)\n", "#z = zipfile.ZipFile(io.BytesIO(r.content))\n", "#z.extractall()" ], "metadata": { "id": "5e_yuRDFIPKL" }, "execution_count": null, "outputs": [] }, { 
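"cell_type": "markdown", "metadata": {}, "source": [ "Each line of a GloVe file holds a token followed by the components of its vector, separated by single spaces. The cell below is a minimal sketch of how one such line is parsed into a (word, vector) pair, mirroring the loading loop that follows; the sample line is illustrative, not taken from the actual file." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Minimal sketch: parse a single GloVe-style line into (word, vector).\n", "# The sample line below is made up for illustration.\n", "sample_line = 'the 0.418 0.24968 -0.41242'\n", "values = sample_line.split()\n", "word, coefs = values[0], np.asarray(values[1:], dtype='float32')\n", "print(word, coefs.shape) # -> the (3,)\n" ], "execution_count": null, "outputs": [] }, {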
"cell_type": "code", "source": [ "print('loading word embeddings GLOVE...')\n", "\n", "embeddings_index = {}\n", "f = open(path+\"embeddings/\"+embedding_name+\".txt\", encoding='utf-8')\n", "for line in tqdm(f):\n", " values = line.split()\n", " word = values[0]\n", " coefs = np.asarray(values[1:], dtype='float32')\n", " embeddings_index[word] = coefs\n", "f.close()\n", "\n", "print('Found %s word vectors.' % len(embeddings_index))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9rEI90qGIPHm", "outputId": "8e8abb83-97a8-4465-d10f-ae0e5715f9df" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "loading word embeddings GLOVE...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "400000it [00:13, 30217.64it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Found 400000 word vectors.\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "HuUVfklf-dSR" }, "source": [ "## Training models" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NTNh6kMTp_eU", "outputId": "aab501a9-16f7-47a9-bf4c-3134a431c114" }, "source": [ "\n", "raw_docs_train = df_train[columnText].tolist()\n", "\n", "\n", "print(\"pre-processing train data...\")\n", "\n", "if os.path.isfile(path+tokenizer_filename):\n", " with open(path+tokenizer_filename, 'rb') as file:\n", " tokenizer = pickle.load(file)\n", "else:\n", " tokenizer = Tokenizer(num_words = max_nb_words)\n", " tokenizer.fit_on_texts(raw_docs_train) \n", "\n", " with open(path+tokenizer_filename, 'wb') as file:\n", " pickle.dump(tokenizer, file)\n", "\n", "sequences = tokenizer.texts_to_sequences(raw_docs_train)\n", "\n", "word_index = tokenizer.word_index\n", "print(\"dictionary size: \", len(word_index))\n", "\n", "#pad sequences\n", "data = sequence.pad_sequences(sequences, maxlen=max_sequence_length)\n", "\n", "print('Shape of data tensor:', data.shape)\n", "print('Shape of label tensor:', labels.shape)\n", "print(labels)" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "pre-processing train data...\n", "dictionary size: 190508\n", "Shape of data tensor: (27381, 512)\n", "Shape of label tensor: (27381,)\n", "[ 0 0 0 ... 
37 37 37]\n" ] } ] }, { "cell_type": "code", "source": [ "# split the data into a training set and a validation set\n", "\n", "indices = np.arange(data.shape[0])\n", "np.random.shuffle(indices)\n", "data = data[indices]\n", "labels = labels[indices]\n", "\n", "nb_validation_samples = int(validation_split * data.shape[0])\n", "\n", "x_train = data[:-nb_validation_samples]\n", "y_train = labels[:-nb_validation_samples]\n", "x_val = data[-nb_validation_samples:]\n", "y_val = labels[-nb_validation_samples:]\n" ], "metadata": { "id": "sHYJ4P-YDfFb" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wGjQI0YgpQAS", "outputId": "0eb50fb4-92b2-4fca-e277-c531b7b474e9" }, "source": [ "# embedding matrix: row i holds the pre-trained vector of the token with index i\n", "\n", "print('preparing embedding matrix...')\n", "\n", "embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))\n", "\n", "for word, i in word_index.items():\n", " embedding_vector = embeddings_index.get(word)\n", " if embedding_vector is not None : \n", " embedding_matrix[i] = embedding_vector\n" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "preparing embedding matrix...\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "filter_sizes = [2, 3, 5]\n", "drop = 0.5\n", "\n", "embedding_layer = Embedding(len(word_index)+1, embedding_dim, input_length = max_sequence_length,\n", " weights=[embedding_matrix], trainable=False)\n", "inputs = Input(shape=(max_sequence_length,), dtype='int32')\n", "embedding = embedding_layer(inputs)\n", "\n", "print(embedding.shape)\n", "reshape = Reshape((max_sequence_length, embedding_dim, 1))(embedding)\n", "print(reshape.shape)\n", "\n", "# https://github.com/elvinaqa/Text-Classification-GloVe-CNN\n", "\n", "# one Conv2D branch per n-gram size; each kernel spans the full embedding width\n", "conv_0 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "conv_1 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "conv_2 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "\n", "# max-over-time pooling: each feature map collapses to a single value\n", "maxpool_0 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)\n", "maxpool_1 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)\n", "maxpool_2 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)\n", "\n", "concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])\n", "flatten = Flatten()(concatenated_tensor)\n", "dropout = Dropout(drop)(flatten)\n", "output = Dense(len(labels_index), activation='softmax')(dropout)\n", "\n", "# this creates a model mapping the token-index inputs to the softmax output\n", "model = Model(inputs=inputs, outputs=output)\n", "\n", "checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')\n", "#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", "\n", "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])\n", "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OUphqlYJCC9n", "outputId": "d51c889c-5582-411e-a039-ef5911fbeb9a" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": 
"stdout", "text": [ "(None, 512, 100)\n", "(None, 512, 100, 1)\n", "Model: \"model\"\n", "__________________________________________________________________________________________________\n", " Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", " input_1 (InputLayer) [(None, 512)] 0 [] \n", " \n", " embedding (Embedding) (None, 512, 100) 19050900 ['input_1[0][0]'] \n", " \n", " reshape (Reshape) (None, 512, 100, 1) 0 ['embedding[0][0]'] \n", " \n", " conv2d (Conv2D) (None, 511, 1, 512) 102912 ['reshape[0][0]'] \n", " \n", " conv2d_1 (Conv2D) (None, 510, 1, 512) 154112 ['reshape[0][0]'] \n", " \n", " conv2d_2 (Conv2D) (None, 508, 1, 512) 256512 ['reshape[0][0]'] \n", " \n", " max_pooling2d (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d[0][0]'] \n", " \n", " max_pooling2d_1 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_1[0][0]'] \n", " \n", " max_pooling2d_2 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_2[0][0]'] \n", " \n", " concatenate (Concatenate) (None, 3, 1, 512) 0 ['max_pooling2d[0][0]', \n", " 'max_pooling2d_1[0][0]', \n", " 'max_pooling2d_2[0][0]'] \n", " \n", " flatten (Flatten) (None, 1536) 0 ['concatenate[0][0]'] \n", " \n", " dropout (Dropout) (None, 1536) 0 ['flatten[0][0]'] \n", " \n", " dense (Dense) (None, 38) 58406 ['dropout[0][0]'] \n", " \n", "==================================================================================================\n", "Total params: 19,622,842\n", "Trainable params: 571,942\n", "Non-trainable params: 19,050,900\n", "__________________________________________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "history = model.fit(x_train, y_train, \n", " batch_size=batch_size, \n", " epochs=epochs, \n", " verbose=1,\n", " callbacks=[checkpoint],\n", " validation_data=(x_val, y_val))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3aUBdLdNCEGK", "outputId": "421839de-e503-4c04-e96f-ed3496c8e0ee" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10\n", "343/343 [==============================] - ETA: 0s - loss: 2.6879 - acc: 0.3116\n", "Epoch 1: val_acc improved from -inf to 0.54164, saving model to weights_cnn_sentece.hdf5\n", "343/343 [==============================] - 573s 2s/step - loss: 2.6879 - acc: 0.3116 - val_loss: 1.7624 - val_acc: 0.5416\n", "Epoch 2/10\n", "343/343 [==============================] - ETA: 0s - loss: 1.6911 - acc: 0.5385\n", "Epoch 2: val_acc improved from 0.54164 to 0.61158, saving model to weights_cnn_sentece.hdf5\n", "343/343 [==============================] - 559s 2s/step - loss: 1.6911 - acc: 0.5385 - val_loss: 1.4473 - val_acc: 0.6116\n", "Epoch 3/10\n", "177/343 [==============>...............] 
- ETA: 4:07 - loss: 1.3971 - acc: 0.6153" ] } ] }, { "cell_type": "code", "source": [ "# summarize history for accuracy\n", "plt.plot(history.history['acc'])\n", "plt.plot(history.history['val_acc'])\n", "plt.title('model accuracy')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'validation'], loc='lower right')\n", "plt.show()\n", "\n", "# summarize history for loss\n", "plt.plot(history.history['loss'])\n", "plt.plot(history.history['val_loss'])\n", "plt.title('model loss')\n", "plt.ylabel('loss')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'validation'], loc='upper right')\n", "plt.show()" ], "metadata": { "id": "Job-3uMvJKN_" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Uw6YR76p_AF0" }, "source": [ "## Saving models" ] }, { "cell_type": "code", "source": [ "name = \"cnn_conv2D_\"+embedding_name+\"_s\"+str(maxOfInstancePerClass)" ], "metadata": { "id": "TEzya-KCIyE7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ykTp9lyRaAma" }, "source": [ "model.save(path+name+\".h5\")\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5J4xDoqRUSfS" }, "source": [ "# save embeddings\n", "\n", "# saving embeddings index \n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "HHlEtipG_Cp0" }, "source": [ "## Loading models" ] }, { "cell_type": "code", "metadata": { "id": "fKt8ft1t_Cxx" }, "source": [ "model = load_model(path+name+\".h5\")\n", "\n", "with open(path+tokenizer_filename, 'rb') as file:\n", " tokenizer = pickle.load(file)\n", "\n", "with open(path+encoder_filename, 'rb') as file:\n", " encoder = pickle.load(file)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "zbS4poso-3k7" }, "source": [ "## Evaluation" ] }, { "cell_type": "code", "source": [ "df_test = pd.read_csv(test_path, sep=\"\\t\")\n" ], "metadata": { "id": "KWORvsadvBbr" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "test_texts = df_test[columnText].tolist()\n", "test_labels = df_test[columnClass].tolist()\n", "\n", "test_sequences = tokenizer.texts_to_sequences(test_texts)\n", "test_input = sequence.pad_sequences(test_sequences, maxlen=max_sequence_length)\n", "\n", "# Get predictions\n", "test_predictions_probas = model.predict(test_input)\n", "test_predictions = test_predictions_probas.argmax(axis=-1)" ], "metadata": { "id": "Xr0o-0i5t38G" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "test_intent_predictions = encoder.inverse_transform(test_predictions)\n", "#test_intent_original = encoder.inverse_transform(test_labels)\n", "\n", "print('accuracy: ', sum(test_intent_predictions == test_labels) / len(test_labels))\n", "print(\"Precision, Recall and F1-Score:\\n\\n\", classification_report(test_labels, test_intent_predictions))\n", "\n" ], "metadata": { "id": "lSn8yZ0gt3-d" }, "execution_count": null, "outputs": [] }, 
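{ "cell_type": "markdown", "metadata": {}, "source": [ "The next cell derives per-class FP, FN, TP and TN counts from the confusion matrix. As a sanity check, here is the same arithmetic on a toy 3-class matrix with hypothetical counts (rows = true class, columns = predicted class): the diagonal gives TP, column sums minus the diagonal give FP, and row sums minus the diagonal give FN." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Toy illustration of the per-class FP/FN/TP/TN arithmetic used in the next cell.\n", "# The 3x3 counts are hypothetical; rows = true class, columns = predicted class.\n", "toy = np.array([[5, 1, 0],\n", "                [2, 3, 1],\n", "                [0, 0, 4]])\n", "TP = np.diag(toy)               # [5 3 4]\n", "FP = toy.sum(axis=0) - TP       # predicted as c but actually another class: [2 1 1]\n", "FN = toy.sum(axis=1) - TP       # actually c but predicted as another class: [1 3 0]\n", "TN = toy.sum() - (FP + FN + TP) # everything else: [8 9 11]\n", "print(TP, FP, FN, TN)" ], "execution_count": null, "outputs": [] }, 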
{ "cell_type": "code", "source": [ "\n", "report = classification_report(test_labels, test_intent_predictions, output_dict = True)\n", "\n", "precision = []\n", "recall = []\n", "f1 = []\n", "support = []\n", "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", "for c in encoder.classes_:\n", " precision.append(report[c]['precision'])\n", " recall.append(report[c]['recall'])\n", " f1.append(report[c]['f1-score'])\n", " support.append(report[c]['support'])\n", "\n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "\n", "cnf_matrix = confusion_matrix(test_labels, test_intent_predictions)\n", "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", "TP = np.diag(cnf_matrix)\n", "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", "dff['className'] = encoder.classes_\n", "dff['precision'] = precision\n", "dff['recall'] = recall\n", "dff['f1-score'] = f1\n", "dff['support'] = support\n", "dff['FP'] = FP\n", "dff['FN'] = FN\n", "dff['TP'] = TP\n", "dff['TN'] = TN\n", "\n", "\n", "content = name + \"\\n\"\n", "print(name)\n", "content += str(weighted_avg) + \"\\n\"\n", "print(weighted_avg)\n", "print(accuracy)\n", "print(dff)\n", "\n", "dff.to_csv(path+\"reports/report_\"+name+\".csv\", index=False)\n", "\n", "# save the predictions\n", "pd.DataFrame({'labels': pd.Series(df_test[columnClass]), 'predictions': pd.Series(test_intent_predictions)}).to_csv(path+\"predictions/predictions_\"+name+\".csv\")\n", "\n", "with open(path+\"reports/report_\"+name+\".txt\", 'w') as f:\n", " f.write(content)\n" ], "metadata": { "id": "RQ0LYGuOt4A4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "G9pjdMdNW_KS" }, "source": [ "# NOTE: word_seq_validation and y_validation come from an earlier preprocessing\n", "# variant of this notebook; with the pipeline above, use x_val and y_val instead.\n", "predictions = model.predict(word_seq_validation)\n", "predictions = np.argmax(predictions,axis=1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IHpVJ79IW_M0", "outputId": "2e1657b3-04d1-42f1-ea8b-9bbcd4744108" }, "source": [ "# NOTE: classification_report expects (y_true, y_pred); the arguments below are\n", "# swapped, which exchanges precision and recall in the recorded output.\n", "report = classification_report(predictions, y_validation, output_dict = True)\n", "\n", "accuracy = 
report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.5726683109527725 {'precision': 0.6118028288513718, 'recall': 0.5726683109527725, 'f1-score': 0.5870482221489528, 'support': 10947}\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "LpgkGq-fW_RN" }, "source": [ "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", "\n", "encoder = preprocessing.LabelEncoder()\n", "y_test = encoder.fit_transform(df_test[columnClass])\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Q9eYqi5SW_Ta", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "31e45f20-583a-4ca6-eac8-21863f6fef5b" }, "source": [ "raw_docs_test = df_test[columnText].tolist()\n", "\n", "print(\"pre-processing test data...\")\n", "\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "# requires the NLTK data: nltk.download('stopwords'); nltk.download('punkt')\n", "stop_words = set(stopwords.words('french'))\n", "\n", "max_len = max_sequence_length # alias kept from an earlier variant of this notebook\n", "\n", "processed_docs_test = []\n", "for doc in tqdm(raw_docs_test):\n", " tokens = word_tokenize(doc, language='french')\n", " filtered = [word for word in tokens if word not in stop_words]\n", " processed_docs_test.append(\" \".join(filtered))\n", "\n", "print(\"tokenizing input data...\")\n", "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", "\n", "#pad sequences\n", "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "pre-processing test data...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 13137/13137 [00:09<00:00, 1331.48it/s]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "tokenizing input data...\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "_WjpJN-Bqjeb" }, "source": [ "predictions = model.predict(word_seq_test)\n", "predictions = np.argmax(predictions,axis=1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zUwjL_dQqjgx", "outputId": "912642ad-95eb-413a-d074-8d4881a57359" }, "source": [ "report = classification_report(predictions, y_test, output_dict = True)\n", "\n", "accuracy = 
report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.5698409073608891 {'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ka6DcPe7qqvg", "outputId": "0c8cfbe6-178d-4208-98ba-4ba688e32939" }, "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "classesName = encoder.classes_\n", "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", "\n", "precision = []\n", "recall = []\n", "f1 = []\n", "support = []\n", "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", "for c in classes:\n", " precision.append(report[c]['precision'])\n", " recall.append(report[c]['recall'])\n", " f1.append(report[c]['f1-score'])\n", " support.append(report[c]['support'])\n", "\n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "\n", "cnf_matrix = confusion_matrix(y_test, predictions)\n", "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", "TP = np.diag(cnf_matrix)\n", "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", "dff['className'] = classesName\n", "dff['precision'] = precision\n", "dff['recall'] = recall\n", "dff['f1-score'] = f1\n", "dff['support'] = support\n", "dff['FP'] = FP\n", "dff['FN'] = FN\n", "dff['TP'] = TP\n", "dff['TN'] = TN\n", "\n", "print(\"test_cnn_s\"+str(maxOfInstancePerClass))\n", "\n", "print(weighted_avg)\n", "print(accuracy)\n", "print(dff)\n", "\n", "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_cnn_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "test_cnn_s10000\n", "{'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n", "0.5698409073608891\n", " className precision ... TP TN\n", "0 Agriculture - Economie rustique 0.216535 ... 55 12636\n", "1 Anatomie 0.459821 ... 103 12768\n", "2 Antiquité 0.287975 ... 91 12710\n", "3 Architecture 0.339623 ... 108 12722\n", "4 Arts et métiers 0.015504 ... 2 12995\n", "5 Beaux-arts 0.060000 ... 6 13018\n", "6 Belles-lettres - Poésie 0.127660 ... 30 12761\n", "7 Blason 0.228571 ... 
24 12993\n", "8 Caractères 0.037037 ... 1 13110\n", "9 Chasse 0.221311 ... 27 12962\n", "10 Chimie 0.160714 ... 18 12991\n", "11 Commerce 0.443418 ... 192 12490\n", "12 Droit - Jurisprudence 0.762879 ... 1081 11263\n", "13 Economie domestique 0.000000 ... 0 13102\n", "14 Grammaire 0.408929 ... 229 12254\n", "15 Géographie 0.917312 ... 2607 9910\n", "16 Histoire 0.405063 ... 288 11777\n", "17 Histoire naturelle 0.743292 ... 831 11661\n", "18 Jeu 0.061538 ... 4 13067\n", "19 Marine 0.590805 ... 257 12549\n", "20 Maréchage - Manège 0.620690 ... 72 13001\n", "21 Mathématiques 0.549669 ... 83 12903\n", "22 Mesure 0.095238 ... 4 13087\n", "23 Militaire (Art) - Guerre - Arme 0.476351 ... 141 12704\n", "24 Minéralogie 0.000000 ... 0 13111\n", "25 Monnaie 0.054795 ... 4 13051\n", "26 Musique 0.287500 ... 46 12904\n", "27 Médailles 0.000000 ... 0 13107\n", "28 Médecine - Chirurgie 0.376218 ... 193 12149\n", "29 Métiers 0.605634 ... 731 11047\n", "30 Pharmacie 0.070423 ... 5 13045\n", "31 Philosophie 0.071429 ... 8 12996\n", "32 Physique - [Sciences physico-mathématiques] 0.378378 ... 112 12674\n", "33 Politique 0.000000 ... 0 13110\n", "34 Pêche 0.170213 ... 8 13069\n", "35 Religion 0.326371 ... 125 12488\n", "36 Spectacle 0.000000 ... 0 13121\n", "37 Superstition 0.000000 ... 0 13112\n", "\n", "[38 rows x 9 columns]\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "R-3lBXjDD9wE" }, "source": [ "# NOTE: prepare_sequence comes from an earlier zeugma-based variant of this\n", "# notebook and is not defined above.\n", "def predict(data, max_len):\n", " \n", " pad_sequ_test, _ = prepare_sequence(data, max_len)\n", " pred_labels_ = model.predict(pad_sequ_test)\n", "\n", " return np.argmax(pred_labels_,axis=1)\n", "\n", "\n", "def eval(data, labels, max_len):\n", " \n", " pred_labels_ = predict(data, max_len)\n", " report = classification_report(pred_labels_, labels, output_dict = True)\n", "\n", " accuracy = report['accuracy']\n", " weighted_avg = report['weighted avg']\n", " \n", " print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6T3kAvKvExgc", "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386" }, "source": [ "# evaluation on the validation set\n", "eval(df_validation[columnText], y_validation, max_len)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return np.array(self.texts_to_sequences(texts))\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pTDJA03_-8yu", "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122" }, "source": [ "# evaluation on the test set\n", "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", "\n", "y_test = df_test[columnClass]\n", "encoder = preprocessing.LabelEncoder()\n", "y_test = encoder.fit_transform(y_test)\n", "\n", "eval(df_test[columnText], y_test, max_len)\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return np.array(self.texts_to_sequences(texts))\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n" ] } ] }
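, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference on a single entry\n", "\n", "A minimal end-to-end inference sketch using the model, tokenizer, padding length and label encoder defined above. The sample text is an illustrative placeholder, not an actual EDdA entry." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Minimal sketch: classify one raw text with the pipeline above.\n", "# `sample_text` is an illustrative placeholder, not a real EDdA entry.\n", "sample_text = 'exemple de texte a classer'\n", "seq = tokenizer.texts_to_sequences([sample_text])\n", "padded = sequence.pad_sequences(seq, maxlen=max_sequence_length)\n", "probas = model.predict(padded)\n", "print(encoder.inverse_transform(probas.argmax(axis=-1)))" ], "execution_count": null, "outputs": [] } ] }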