{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "EDdA-Classification_CNN_Conv2D.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "0yFsoHXX8Iyy" }, "source": [ "# Deep learning for EDdA classification" ] }, { "cell_type": "markdown", "metadata": { "id": "tFlUCDL2778i" }, "source": [ "## Setup colab environment" ] }, { "cell_type": "code", "metadata": { "id": "Sp8d_Uus7SHJ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2791f79e-a849-4b95-92c5-72150ea4d6e2" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "jQBu-p6hBU-j" }, "source": [ "### Install packages" ] }, { "cell_type": "code", "metadata": { "id": "bTIXsF6kBUdh" }, "source": [ "#!pip install zeugma\n", "#!pip install plot_model" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "56-04SNF8BMx" }, "source": [ "### Import librairies" ] }, { "cell_type": "code", "metadata": { "id": "HwWkSznz7SEv" }, "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pickle\n", "import os\n", "\n", "from tqdm import tqdm\n", "import requests, zipfile, io\n", "import codecs\n", "\n", "from sklearn import preprocessing # LabelEncoder\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "\n", "from keras.preprocessing import sequence\n", "from keras.preprocessing.text import Tokenizer\n", "\n", "from keras.layers import BatchNormalization, Input, Reshape, Conv2D, MaxPool2D, Concatenate\n", "from keras.layers import Embedding, Dropout, Flatten, Dense\n", "from keras.models import Model, load_model\n", "from keras.callbacks import ModelCheckpoint\n" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "xrekV6W978l4" }, "source": [ "### Utils functions" ] }, { "cell_type": "code", "metadata": { "id": "4LJ5blQR7PUe" }, "source": [ "\n", "def resample_classes(df, classColumnName, numberOfInstances):\n", " #random numberOfInstances elements\n", " replace = False # with replacement\n", " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", " return df.groupby(classColumnName, as_index=False).apply(fn)\n", " \n" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-Rh3JMDh7zYd" }, "source": [ "" ], "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "MtLr35eM753e" }, "source": [ "## Load Data" ] }, { "cell_type": "code", "metadata": { "id": "FnbNT4NF7zal", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ca2c2751-6ac2-4ec8-ec1d-03927e9bd358" }, "source": [ "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2022-02-18 07:27:50-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 
134.214.142.28\n", "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 175634219 (167M) [text/tab-separated-values]\n", "Saving to: ‘training_set.tsv’\n", "\n", "training_set.tsv 100%[===================>] 167.50M 23.4MB/s in 7.7s \n", "\n", "2022-02-18 07:27:58 (21.8 MB/s) - ‘training_set.tsv’ saved [175634219/175634219]\n", "\n", "--2022-02-18 07:27:58-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv\n", "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 42730598 (41M) [text/tab-separated-values]\n", "Saving to: ‘test_set.tsv’\n", "\n", "test_set.tsv 100%[===================>] 40.75M 17.1MB/s in 2.4s \n", "\n", "2022-02-18 07:28:01 (17.1 MB/s) - ‘test_set.tsv’ saved [42730598/42730598]\n", "\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Loading dataset" ], "metadata": { "id": "UHushJ1XfUj9" } }, { "cell_type": "code", "source": [ "train_path = 'training_set.tsv'\n", "test_path = 'test_set.tsv'" ], "metadata": { "id": "Q4te2c0bfvaJ" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "nRLaQUO97zcq" }, "source": [ "df_train = pd.read_csv(train_path, sep=\"\\t\")\n" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "df_train.sample(5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "2MvHEc7zVK1N", "outputId": "e5cfc8df-4ed9-4dd4-ce64-3350a8f9f6bc" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", " <div id=\"df-3f5f81f9-5804-4f8c-8424-4b90ee905792\">\n", " <div class=\"colab-df-container\">\n", " <div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>volume</th>\n", " <th>numero</th>\n", " <th>head</th>\n", " <th>normClass</th>\n", " <th>classEDdA</th>\n", " <th>author</th>\n", " <th>id_enccre</th>\n", " <th>domaine_enccre</th>\n", " <th>ensemble_domaine_enccre</th>\n", " <th>content</th>\n", " <th>contentWithoutClass</th>\n", " <th>firstParagraph</th>\n", " <th>nb_words</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>5965</th>\n", " <td>9</td>\n", " <td>712</td>\n", " <td>KNAWEL</td>\n", " <td>Botanique</td>\n", " <td>Botan.</td>\n", " <td>Jaucourt</td>\n", " <td>v9-452-0</td>\n", " <td>botanique</td>\n", " <td>Histoire naturelle</td>\n", " <td>KNAWEL, (Botan.) genre de plante ainsi nommée ...</td>\n", " <td>knawel genre plante nommée \\n gérard ray par...</td>\n", " <td>knawel genre plante nommée \\n gérard ray par...</td>\n", " <td>169</td>\n", " </tr>\n", " <tr>\n", " <th>21406</th>\n", " <td>4</td>\n", " <td>3605</td>\n", " <td>DECRETE</td>\n", " <td>Jurisprudence</td>\n", " <td>Jurispr.</td>\n", " <td>Boucher d'Argis</td>\n", " <td>v4-1826-0</td>\n", " <td>jurisprudence</td>\n", " <td>Droit - Jurisprudence</td>\n", " <td>DECRETE, adj. (Jurispr.) 
se dit communément\nd...</td>\n", "      <td>decrete adj communément \\n contre a ordonné ...</td>\n", "      <td>decrete adj communément \\n contre a ordonné ...</td>\n", "      <td>80</td>\n", "    </tr>\n", "    <tr>\n", "      <th>46481</th>\n", "      <td>12</td>\n", "      <td>2389</td>\n", "      <td>Piece nette</td>\n", "      <td>Artillerie</td>\n", "      <td>Artillerie.</td>\n", "      <td>Jaucourt</td>\n", "      <td>v12-1440-16</td>\n", "      <td>artillerie</td>\n", "      <td>Militaire (Art) - Guerre - Arme</td>\n", "      <td>Piece nette, (Artillerie.) on appelle pieces n...</td>\n", "      <td>piece nette appelle pieces nestes pieces art...</td>\n", "      <td>piece nette appelle pieces nestes pieces art...</td>\n", "      <td>68</td>\n", "    </tr>\n", "    <tr>\n", "      <th>32540</th>\n", "      <td>7</td>\n", "      <td>1375</td>\n", "      <td>Gale</td>\n", "      <td>Manège | Maréchallerie</td>\n", "      <td>Manége & Maréchallerie.</td>\n", "      <td>Bourgelat</td>\n", "      <td>v7-622-1</td>\n", "      <td>manège</td>\n", "      <td>Maréchage - Manège</td>\n", "      <td>Gale, (Manége & Maréchallerie.) maladie prurig...</td>\n", "      <td>gale maladie prurigineuse \\n cutanée manifes...</td>\n", "      <td>gale maladie prurigineuse \\n cutanée manifes...</td>\n", "      <td>3052</td>\n", "    </tr>\n", "    <tr>\n", "      <th>27748</th>\n", "      <td>13</td>\n", "      <td>4039</td>\n", "      <td>Récit historique</td>\n", "      <td>Histoire</td>\n", "      <td>Histoire.</td>\n", "      <td>unsigned</td>\n", "      <td>v13-2396-2</td>\n", "      <td>histoire</td>\n", "      <td>Histoire</td>\n", "      <td>Récit historique, (Histoire.) le récit histori...</td>\n", "      <td>récit historique récit historique \\n exposé ...</td>\n", "      <td>récit historique récit historique \\n exposé ...</td>\n", "      <td>122</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>\n", "    </div>\n", "  </div>\n", "      " ], "text/plain": [ " volume ... nb_words\n", "5965 9 ... 169\n", "21406 4 ... 80\n", "46481 12 ... 68\n", "32540 7 ... 3052\n", "27748 13 ... 122\n", "\n", "[5 rows x 13 columns]" ] }, "metadata": {}, "execution_count": 7 } ] },
{ "cell_type": "markdown", "source": [ "## Configuration\n" ], "metadata": { "id": "-63bh_cKfN4p" } }, { "cell_type": "code", "source": [ "columnText = 'contentWithoutClass'\n", "columnClass = 'ensemble_domaine_enccre'\n", "\n", "maxOfInstancePerClass = 1500\n", "\n", "batch_size = 64\n", "validation_split = 0.20\n", "max_nb_words = 20000 # vocabulary size\n", "max_sequence_length = 512 # maximum document length, in tokens\n", "epochs = 10\n", "\n", "#embedding_name = \"fasttext\" \n", "#embedding_dim = 300 \n", "\n", "embedding_name = \"glove.6B.100d\"\n", "embedding_dim = 100 \n", "\n", "path = \"drive/MyDrive/Classification-EDdA/\"\n", "\n", "encoder_filename = \"label_encoder.pkl\"\n", "tokenizer_filename = \"tokenizer_keras.pkl\"" ], "metadata": { "id": "nsRuyzYUfOBg" }, "execution_count": 8, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Preprocessing\n" ], "metadata": { "id": "ZDz-Y1LCfQt0" } }, { "cell_type": "code", "source": [ "if maxOfInstancePerClass != 10000:\n", " df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)" ], "metadata": { "id": "4r41Z6T_yNND" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vGWAgBH87ze8" }, "source": [ "labels = df_train[columnClass]\n", "numberOfClasses = labels.nunique()\n", "\n", "if os.path.isfile(path+encoder_filename): \n", " # load existing encoder \n", " with open(path+encoder_filename, 'rb') as file:\n", " encoder = pickle.load(file)\n", "\n", "else:\n", " encoder = preprocessing.LabelEncoder()\n", " encoder.fit(labels)\n", "\n", " with open(path+encoder_filename, 'wb') as file:\n", " pickle.dump(encoder, file)\n", "\n", "\n", "labels = encoder.transform(labels)" ], "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "encoder.classes_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SME4vvVhW9Sn", "outputId": "5774cc43-ef3f-4874-bbef-c68f28ca9880" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Agriculture - Economie rustique', 'Anatomie', 'Antiquité',\n", " 'Architecture', 'Arts et métiers', 'Beaux-arts',\n", " 'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',\n", " 'Chimie', 'Commerce', 'Droit - 
Jurisprudence',\n", " 'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',\n", " 'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',\n", " 'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',\n", " 'Minéralogie', 'Monnaie', 'Musique', 'Médailles',\n", " 'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',\n", " 'Physique - [Sciences physico-mathématiques]', 'Politique',\n", " 'Pêche', 'Religion', 'Spectacle', 'Superstition'], dtype=object)" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "labels_index = dict(zip(list(encoder.classes_), encoder.transform(list(encoder.classes_))))" ], "metadata": { "id": "nIzWQ2VbW_UO" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "labels_index" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4e7ggEGiXC_W", "outputId": "77c1fdd0-1c20-479f-eb47-9473484b424e" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'Agriculture - Economie rustique': 0,\n", " 'Anatomie': 1,\n", " 'Antiquité': 2,\n", " 'Architecture': 3,\n", " 'Arts et métiers': 4,\n", " 'Beaux-arts': 5,\n", " 'Belles-lettres - Poésie': 6,\n", " 'Blason': 7,\n", " 'Caractères': 8,\n", " 'Chasse': 9,\n", " 'Chimie': 10,\n", " 'Commerce': 11,\n", " 'Droit - Jurisprudence': 12,\n", " 'Economie domestique': 13,\n", " 'Grammaire': 14,\n", " 'Géographie': 15,\n", " 'Histoire': 16,\n", " 'Histoire naturelle': 17,\n", " 'Jeu': 18,\n", " 'Marine': 19,\n", " 'Maréchage - Manège': 20,\n", " 'Mathématiques': 21,\n", " 'Mesure': 22,\n", " 'Militaire (Art) - Guerre - Arme': 23,\n", " 'Minéralogie': 24,\n", " 'Monnaie': 25,\n", " 'Musique': 26,\n", " 'Médailles': 27,\n", " 'Médecine - Chirurgie': 28,\n", " 'Métiers': 29,\n", " 'Pharmacie': 30,\n", " 'Philosophie': 31,\n", " 'Physique - [Sciences physico-mathématiques]': 32,\n", " 'Politique': 33,\n", " 'Pêche': 34,\n", " 'Religion': 35,\n", " 'Spectacle': 36,\n", " 'Superstition': 37}" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "### Loading pre-trained embeddings\n", "\n", "#### FastText" ], "metadata": { "id": "Xo47i2WdIP7M" } }, { "cell_type": "code", "source": [ "# download FastText (too large to keep on the Drive)\n", "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n", "r = requests.get(zip_file_url)\n", "z = zipfile.ZipFile(io.BytesIO(r.content))\n", "z.extractall()" ], "metadata": { "id": "1yLSez4GIPu9" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print('loading word embeddings FastText...')\n", "\n", "embeddings_index = {}\n", "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", "\n", "for line in tqdm(f):\n", " values = line.rstrip().rsplit(' ')\n", " word = values[0]\n", " coefs = np.asarray(values[1:], dtype='float32')\n", " embeddings_index[word] = coefs\n", "f.close()\n", "\n", "print('found %s word vectors' % len(embeddings_index))" ], "metadata": { "id": "3HqbPxx6IPsV" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "#### GloVe" ], "metadata": { "id": "y359qxKWIPam" } }, { "cell_type": "code", "source": [ "# download GloVe\n", "#zip_file_url = \"https://nlp.stanford.edu/data/glove.6B.zip\"\n", "#r = requests.get(zip_file_url)\n", "#z = zipfile.ZipFile(io.BytesIO(r.content))\n", "#z.extractall()" ], "metadata": { "id": "5e_yuRDFIPKL" }, "execution_count": null, "outputs": [] }, { 
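"cell_type": "markdown", "metadata": {}, "source": [ "Each line of a GloVe file holds a token followed by the components of its vector, separated by single spaces. The cell below is a minimal sketch of how one such line is parsed into a (word, vector) pair, mirroring the loading loop that follows; the sample line is illustrative, not taken from the actual file." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Minimal sketch: parse a single GloVe-style line into (word, vector).\n", "# The sample line below is made up for illustration.\n", "sample_line = 'the 0.418 0.24968 -0.41242'\n", "values = sample_line.split()\n", "word, coefs = values[0], np.asarray(values[1:], dtype='float32')\n", "print(word, coefs.shape) # -> the (3,)\n" ], "execution_count": null, "outputs": [] }, {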
"cell_type": "code", "source": [ "print('loading word embeddings GLOVE...')\n", "\n", "embeddings_index = {}\n", "f = open(path+\"embeddings/\"+embedding_name+\".txt\", encoding='utf-8')\n", "for line in tqdm(f):\n", " values = line.split()\n", " word = values[0]\n", " coefs = np.asarray(values[1:], dtype='float32')\n", " embeddings_index[word] = coefs\n", "f.close()\n", "\n", "print('Found %s word vectors.' % len(embeddings_index))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9rEI90qGIPHm", "outputId": "8e8abb83-97a8-4465-d10f-ae0e5715f9df" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "loading word embeddings GLOVE...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "400000it [00:13, 30217.64it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Found 400000 word vectors.\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "HuUVfklf-dSR" }, "source": [ "## Training models" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NTNh6kMTp_eU", "outputId": "aab501a9-16f7-47a9-bf4c-3134a431c114" }, "source": [ "\n", "raw_docs_train = df_train[columnText].tolist()\n", "\n", "\n", "print(\"pre-processing train data...\")\n", "\n", "if os.path.isfile(path+tokenizer_filename):\n", " with open(path+tokenizer_filename, 'rb') as file:\n", " tokenizer = pickle.load(file)\n", "else:\n", " tokenizer = Tokenizer(num_words = max_nb_words)\n", " tokenizer.fit_on_texts(raw_docs_train) \n", "\n", " with open(path+tokenizer_filename, 'wb') as file:\n", " pickle.dump(tokenizer, file)\n", "\n", "sequences = tokenizer.texts_to_sequences(raw_docs_train)\n", "\n", "word_index = tokenizer.word_index\n", "print(\"dictionary size: \", len(word_index))\n", "\n", "#pad sequences\n", "data = sequence.pad_sequences(sequences, maxlen=max_sequence_length)\n", "\n", "print('Shape of data tensor:', data.shape)\n", "print('Shape of label tensor:', labels.shape)\n", "print(labels)" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "pre-processing train data...\n", "dictionary size: 190508\n", "Shape of data tensor: (27381, 512)\n", "Shape of label tensor: (27381,)\n", "[ 0 0 0 ... 
37 37 37]\n" ] } ] }, { "cell_type": "code", "source": [ "# split the data into a training set and a validation set\n", "\n", "indices = np.arange(data.shape[0])\n", "np.random.shuffle(indices)\n", "data = data[indices]\n", "labels = labels[indices]\n", "\n", "nb_validation_samples = int(validation_split * data.shape[0])\n", "\n", "x_train = data[:-nb_validation_samples]\n", "y_train = labels[:-nb_validation_samples]\n", "x_val = data[-nb_validation_samples:]\n", "y_val = labels[-nb_validation_samples:]\n" ], "metadata": { "id": "sHYJ4P-YDfFb" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wGjQI0YgpQAS", "outputId": "0eb50fb4-92b2-4fca-e277-c531b7b474e9" }, "source": [ "# embedding matrix: row i holds the pre-trained vector of the token with index i\n", "\n", "print('preparing embedding matrix...')\n", "\n", "embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))\n", "\n", "for word, i in word_index.items():\n", " embedding_vector = embeddings_index.get(word)\n", " if embedding_vector is not None : \n", " embedding_matrix[i] = embedding_vector\n" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "preparing embedding matrix...\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "filter_sizes = [2, 3, 5]\n", "drop = 0.5\n", "\n", "embedding_layer = Embedding(len(word_index)+1, embedding_dim, input_length = max_sequence_length,\n", " weights=[embedding_matrix], trainable=False)\n", "inputs = Input(shape=(max_sequence_length,), dtype='int32')\n", "embedding = embedding_layer(inputs)\n", "\n", "print(embedding.shape)\n", "reshape = Reshape((max_sequence_length, embedding_dim, 1))(embedding)\n", "print(reshape.shape)\n", "\n", "# https://github.com/elvinaqa/Text-Classification-GloVe-CNN\n", "\n", "# one Conv2D branch per n-gram size; each kernel spans the full embedding width\n", "conv_0 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "conv_1 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "conv_2 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)\n", "\n", "# max-over-time pooling: each feature map collapses to a single value\n", "maxpool_0 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)\n", "maxpool_1 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)\n", "maxpool_2 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)\n", "\n", "concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])\n", "flatten = Flatten()(concatenated_tensor)\n", "dropout = Dropout(drop)(flatten)\n", "output = Dense(len(labels_index), activation='softmax')(dropout)\n", "\n", "# this creates a model mapping the token-index inputs to the softmax output\n", "model = Model(inputs=inputs, outputs=output)\n", "\n", "checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')\n", "#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", "\n", "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])\n", "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OUphqlYJCC9n", "outputId": "d51c889c-5582-411e-a039-ef5911fbeb9a" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": 
"stdout", "text": [ "(None, 512, 100)\n", "(None, 512, 100, 1)\n", "Model: \"model\"\n", "__________________________________________________________________________________________________\n", " Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", " input_1 (InputLayer) [(None, 512)] 0 [] \n", " \n", " embedding (Embedding) (None, 512, 100) 19050900 ['input_1[0][0]'] \n", " \n", " reshape (Reshape) (None, 512, 100, 1) 0 ['embedding[0][0]'] \n", " \n", " conv2d (Conv2D) (None, 511, 1, 512) 102912 ['reshape[0][0]'] \n", " \n", " conv2d_1 (Conv2D) (None, 510, 1, 512) 154112 ['reshape[0][0]'] \n", " \n", " conv2d_2 (Conv2D) (None, 508, 1, 512) 256512 ['reshape[0][0]'] \n", " \n", " max_pooling2d (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d[0][0]'] \n", " \n", " max_pooling2d_1 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_1[0][0]'] \n", " \n", " max_pooling2d_2 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_2[0][0]'] \n", " \n", " concatenate (Concatenate) (None, 3, 1, 512) 0 ['max_pooling2d[0][0]', \n", " 'max_pooling2d_1[0][0]', \n", " 'max_pooling2d_2[0][0]'] \n", " \n", " flatten (Flatten) (None, 1536) 0 ['concatenate[0][0]'] \n", " \n", " dropout (Dropout) (None, 1536) 0 ['flatten[0][0]'] \n", " \n", " dense (Dense) (None, 38) 58406 ['dropout[0][0]'] \n", " \n", "==================================================================================================\n", "Total params: 19,622,842\n", "Trainable params: 571,942\n", "Non-trainable params: 19,050,900\n", "__________________________________________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "history = model.fit(x_train, y_train, \n", " batch_size=batch_size, \n", " epochs=epochs, \n", " verbose=1,\n", " callbacks=[checkpoint],\n", " validation_data=(x_val, y_val))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3aUBdLdNCEGK", "outputId": "421839de-e503-4c04-e96f-ed3496c8e0ee" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10\n", "343/343 [==============================] - ETA: 0s - loss: 2.6879 - acc: 0.3116\n", "Epoch 1: val_acc improved from -inf to 0.54164, saving model to weights_cnn_sentece.hdf5\n", "343/343 [==============================] - 573s 2s/step - loss: 2.6879 - acc: 0.3116 - val_loss: 1.7624 - val_acc: 0.5416\n", "Epoch 2/10\n", "343/343 [==============================] - ETA: 0s - loss: 1.6911 - acc: 0.5385\n", "Epoch 2: val_acc improved from 0.54164 to 0.61158, saving model to weights_cnn_sentece.hdf5\n", "343/343 [==============================] - 559s 2s/step - loss: 1.6911 - acc: 0.5385 - val_loss: 1.4473 - val_acc: 0.6116\n", "Epoch 3/10\n", "177/343 [==============>...............] 
- ETA: 4:07 - loss: 1.3971 - acc: 0.6153" ] } ] }, { "cell_type": "code", "source": [ "# summarize history for accuracy\n", "plt.plot(history.history['acc'])\n", "plt.plot(history.history['val_acc'])\n", "plt.title('model accuracy')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'validation'], loc='lower right')\n", "plt.show()\n", "\n", "# summarize history for loss\n", "plt.plot(history.history['loss'])\n", "plt.plot(history.history['val_loss'])\n", "plt.title('model loss')\n", "plt.ylabel('loss')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'validation'], loc='upper right')\n", "plt.show()" ], "metadata": { "id": "Job-3uMvJKN_" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Uw6YR76p_AF0" }, "source": [ "## Saving models" ] }, { "cell_type": "code", "source": [ "name = \"cnn_conv2D_\"+embedding_name+\"_s\"+str(maxOfInstancePerClass)" ], "metadata": { "id": "TEzya-KCIyE7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ykTp9lyRaAma" }, "source": [ "model.save(path+name+\".h5\")\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5J4xDoqRUSfS" }, "source": [ "# save embeddings\n", "\n", "# saving embeddings index \n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "HHlEtipG_Cp0" }, "source": [ "## Loading models" ] }, { "cell_type": "code", "metadata": { "id": "fKt8ft1t_Cxx" }, "source": [ "model = load_model(path+name+\".h5\")\n", "\n", "with open(path+tokenizer_filename, 'rb') as file:\n", " tokenizer = pickle.load(file)\n", "\n", "with open(path+encoder_filename, 'rb') as file:\n", " encoder = pickle.load(file)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "zbS4poso-3k7" }, "source": [ "## Evaluation" ] }, { "cell_type": "code", "source": [ "df_test = pd.read_csv(test_path, sep=\"\\t\")\n" ], "metadata": { "id": "KWORvsadvBbr" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "test_texts = df_test[columnText].tolist()\n", "test_labels = df_test[columnClass].tolist()\n", "\n", "test_sequences = tokenizer.texts_to_sequences(test_texts)\n", "test_input = sequence.pad_sequences(test_sequences, maxlen=max_sequence_length)\n", "\n", "# Get predictions\n", "test_predictions_probas = model.predict(test_input)\n", "test_predictions = test_predictions_probas.argmax(axis=-1)" ], "metadata": { "id": "Xr0o-0i5t38G" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "test_intent_predictions = encoder.inverse_transform(test_predictions)\n", "#test_intent_original = encoder.inverse_transform(test_labels)\n", "\n", "print('accuracy: ', sum(test_intent_predictions == test_labels) / len(test_labels))\n", "print(\"Precision, Recall and F1-Score:\\n\\n\", classification_report(test_labels, test_intent_predictions))\n", "\n" ], "metadata": { "id": "lSn8yZ0gt3-d" }, "execution_count": null, "outputs": [] }, 
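{ "cell_type": "markdown", "metadata": {}, "source": [ "The next cell derives per-class FP, FN, TP and TN counts from the confusion matrix. As a sanity check, here is the same arithmetic on a toy 3-class matrix with hypothetical counts (rows = true class, columns = predicted class): the diagonal gives TP, column sums minus the diagonal give FP, and row sums minus the diagonal give FN." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Toy illustration of the per-class FP/FN/TP/TN arithmetic used in the next cell.\n", "# The 3x3 counts are hypothetical; rows = true class, columns = predicted class.\n", "toy = np.array([[5, 1, 0],\n", "                [2, 3, 1],\n", "                [0, 0, 4]])\n", "TP = np.diag(toy)               # [5 3 4]\n", "FP = toy.sum(axis=0) - TP       # predicted as c but actually another class: [2 1 1]\n", "FN = toy.sum(axis=1) - TP       # actually c but predicted as another class: [1 3 0]\n", "TN = toy.sum() - (FP + FN + TP) # everything else: [8 9 11]\n", "print(TP, FP, FN, TN)" ], "execution_count": null, "outputs": [] }, 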
{ "cell_type": "code", "source": [ "\n", "report = classification_report(test_labels, test_intent_predictions, output_dict = True)\n", "\n", "precision = []\n", "recall = []\n", "f1 = []\n", "support = []\n", "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", "for c in encoder.classes_:\n", " precision.append(report[c]['precision'])\n", " recall.append(report[c]['recall'])\n", " f1.append(report[c]['f1-score'])\n", " support.append(report[c]['support'])\n", "\n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "\n", "cnf_matrix = confusion_matrix(test_labels, test_intent_predictions)\n", "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", "TP = np.diag(cnf_matrix)\n", "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", "dff['className'] = encoder.classes_\n", "dff['precision'] = precision\n", "dff['recall'] = recall\n", "dff['f1-score'] = f1\n", "dff['support'] = support\n", "dff['FP'] = FP\n", "dff['FN'] = FN\n", "dff['TP'] = TP\n", "dff['TN'] = TN\n", "\n", "\n", "content = name + \"\\n\"\n", "print(name)\n", "content += str(weighted_avg) + \"\\n\"\n", "print(weighted_avg)\n", "print(accuracy)\n", "print(dff)\n", "\n", "dff.to_csv(path+\"reports/report_\"+name+\".csv\", index=False)\n", "\n", "# save the predictions\n", "pd.DataFrame({'labels': pd.Series(df_test[columnClass]), 'predictions': pd.Series(test_intent_predictions)}).to_csv(path+\"predictions/predictions_\"+name+\".csv\")\n", "\n", "with open(path+\"reports/report_\"+name+\".txt\", 'w') as f:\n", " f.write(content)\n" ], "metadata": { "id": "RQ0LYGuOt4A4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "G9pjdMdNW_KS" }, "source": [ "# NOTE: word_seq_validation and y_validation come from an earlier preprocessing\n", "# variant of this notebook; with the pipeline above, use x_val and y_val instead.\n", "predictions = model.predict(word_seq_validation)\n", "predictions = np.argmax(predictions,axis=1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IHpVJ79IW_M0", "outputId": "2e1657b3-04d1-42f1-ea8b-9bbcd4744108" }, "source": [ "# NOTE: classification_report expects (y_true, y_pred); the arguments below are\n", "# swapped, which exchanges precision and recall in the recorded output.\n", "report = classification_report(predictions, y_validation, output_dict = True)\n", "\n", "accuracy = 
report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.5726683109527725 {'precision': 0.6118028288513718, 'recall': 0.5726683109527725, 'f1-score': 0.5870482221489528, 'support': 10947}\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "LpgkGq-fW_RN" }, "source": [ "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", "\n", "encoder = preprocessing.LabelEncoder()\n", "y_test = encoder.fit_transform(df_test[columnClass])\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Q9eYqi5SW_Ta", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "31e45f20-583a-4ca6-eac8-21863f6fef5b" }, "source": [ "raw_docs_test = df_test[columnText].tolist()\n", "\n", "print(\"pre-processing test data...\")\n", "\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "# requires the NLTK data: nltk.download('stopwords'); nltk.download('punkt')\n", "stop_words = set(stopwords.words('french'))\n", "\n", "max_len = max_sequence_length # alias kept from an earlier variant of this notebook\n", "\n", "processed_docs_test = []\n", "for doc in tqdm(raw_docs_test):\n", " tokens = word_tokenize(doc, language='french')\n", " filtered = [word for word in tokens if word not in stop_words]\n", " processed_docs_test.append(\" \".join(filtered))\n", "\n", "print(\"tokenizing input data...\")\n", "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", "\n", "#pad sequences\n", "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "pre-processing test data...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 13137/13137 [00:09<00:00, 1331.48it/s]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "tokenizing input data...\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "_WjpJN-Bqjeb" }, "source": [ "predictions = model.predict(word_seq_test)\n", "predictions = np.argmax(predictions,axis=1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zUwjL_dQqjgx", "outputId": "912642ad-95eb-413a-d074-8d4881a57359" }, "source": [ "report = classification_report(predictions, y_test, output_dict = True)\n", "\n", "accuracy = 
report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.5698409073608891 {'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ka6DcPe7qqvg", "outputId": "0c8cfbe6-178d-4208-98ba-4ba688e32939" }, "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "classesName = encoder.classes_\n", "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", "\n", "precision = []\n", "recall = []\n", "f1 = []\n", "support = []\n", "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", "for c in classes:\n", " precision.append(report[c]['precision'])\n", " recall.append(report[c]['recall'])\n", " f1.append(report[c]['f1-score'])\n", " support.append(report[c]['support'])\n", "\n", "accuracy = report['accuracy']\n", "weighted_avg = report['weighted avg']\n", "\n", "\n", "cnf_matrix = confusion_matrix(y_test, predictions)\n", "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", "TP = np.diag(cnf_matrix)\n", "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", "dff['className'] = classesName\n", "dff['precision'] = precision\n", "dff['recall'] = recall\n", "dff['f1-score'] = f1\n", "dff['support'] = support\n", "dff['FP'] = FP\n", "dff['FN'] = FN\n", "dff['TP'] = TP\n", "dff['TN'] = TN\n", "\n", "print(\"test_cnn_s\"+str(maxOfInstancePerClass))\n", "\n", "print(weighted_avg)\n", "print(accuracy)\n", "print(dff)\n", "\n", "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_cnn_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "test_cnn_s10000\n", "{'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n", "0.5698409073608891\n", " className precision ... TP TN\n", "0 Agriculture - Economie rustique 0.216535 ... 55 12636\n", "1 Anatomie 0.459821 ... 103 12768\n", "2 Antiquité 0.287975 ... 91 12710\n", "3 Architecture 0.339623 ... 108 12722\n", "4 Arts et métiers 0.015504 ... 2 12995\n", "5 Beaux-arts 0.060000 ... 6 13018\n", "6 Belles-lettres - Poésie 0.127660 ... 30 12761\n", "7 Blason 0.228571 ... 
24 12993\n", "8 Caractères 0.037037 ... 1 13110\n", "9 Chasse 0.221311 ... 27 12962\n", "10 Chimie 0.160714 ... 18 12991\n", "11 Commerce 0.443418 ... 192 12490\n", "12 Droit - Jurisprudence 0.762879 ... 1081 11263\n", "13 Economie domestique 0.000000 ... 0 13102\n", "14 Grammaire 0.408929 ... 229 12254\n", "15 Géographie 0.917312 ... 2607 9910\n", "16 Histoire 0.405063 ... 288 11777\n", "17 Histoire naturelle 0.743292 ... 831 11661\n", "18 Jeu 0.061538 ... 4 13067\n", "19 Marine 0.590805 ... 257 12549\n", "20 Maréchage - Manège 0.620690 ... 72 13001\n", "21 Mathématiques 0.549669 ... 83 12903\n", "22 Mesure 0.095238 ... 4 13087\n", "23 Militaire (Art) - Guerre - Arme 0.476351 ... 141 12704\n", "24 Minéralogie 0.000000 ... 0 13111\n", "25 Monnaie 0.054795 ... 4 13051\n", "26 Musique 0.287500 ... 46 12904\n", "27 Médailles 0.000000 ... 0 13107\n", "28 Médecine - Chirurgie 0.376218 ... 193 12149\n", "29 Métiers 0.605634 ... 731 11047\n", "30 Pharmacie 0.070423 ... 5 13045\n", "31 Philosophie 0.071429 ... 8 12996\n", "32 Physique - [Sciences physico-mathématiques] 0.378378 ... 112 12674\n", "33 Politique 0.000000 ... 0 13110\n", "34 Pêche 0.170213 ... 8 13069\n", "35 Religion 0.326371 ... 125 12488\n", "36 Spectacle 0.000000 ... 0 13121\n", "37 Superstition 0.000000 ... 0 13112\n", "\n", "[38 rows x 9 columns]\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "R-3lBXjDD9wE" }, "source": [ "# NOTE: prepare_sequence comes from an earlier zeugma-based variant of this\n", "# notebook and is not defined above.\n", "def predict(data, max_len):\n", " \n", " pad_sequ_test, _ = prepare_sequence(data, max_len)\n", " pred_labels_ = model.predict(pad_sequ_test)\n", "\n", " return np.argmax(pred_labels_,axis=1)\n", "\n", "\n", "def eval(data, labels, max_len):\n", " \n", " pred_labels_ = predict(data, max_len)\n", " report = classification_report(pred_labels_, labels, output_dict = True)\n", "\n", " accuracy = report['accuracy']\n", " weighted_avg = report['weighted avg']\n", " \n", " print(accuracy, weighted_avg)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6T3kAvKvExgc", "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386" }, "source": [ "# evaluation on the validation set\n", "eval(df_validation[columnText], y_validation, max_len)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return np.array(self.texts_to_sequences(texts))\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pTDJA03_-8yu", "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122" }, "source": [ "# evaluation on the test set\n", "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", "\n", "y_test = df_test[columnClass]\n", "encoder = preprocessing.LabelEncoder()\n", "y_test = encoder.fit_transform(y_test)\n", "\n", "eval(df_test[columnText], y_test, max_len)\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return np.array(self.texts_to_sequences(texts))\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n" ] } ] }
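, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference on a single entry\n", "\n", "A minimal end-to-end inference sketch using the model, tokenizer, padding length and label encoder defined above. The sample text is an illustrative placeholder, not an actual EDdA entry." ] }, { "cell_type": "code", "metadata": {}, "source": [ "# Minimal sketch: classify one raw text with the pipeline above.\n", "# `sample_text` is an illustrative placeholder, not a real EDdA entry.\n", "sample_text = 'exemple de texte a classer'\n", "seq = tokenizer.texts_to_sequences([sample_text])\n", "padded = sequence.pad_sequences(seq, maxlen=max_sequence_length)\n", "probas = model.predict(padded)\n", "print(encoder.inverse_transform(probas.argmax(axis=-1)))" ], "execution_count": null, "outputs": [] } ] }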