diff --git a/.DS_Store b/.DS_Store
index 55e7fe6422480e50b15aea87086f46dda65ca172..801dbbfe341b8be3c7a21024f642288b612c928d 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index 03156e2c22bc6e140bbe03842bc500d4728df496..db1ed1dfb73882a0d0ea40c2b70301ac99f0ce06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ dataframe_with_normClass_artfl.csv
 *.pkl
 .DS_Store
 .DS_Store
+.DS_Store
diff --git a/notebooks/EDdA_Classification_BertFineTuning.ipynb b/notebooks/EDdA_Classification_BertFineTuning.ipynb
index dc0830e18213cca6d3b8ef6733586bdb1c7715b9..4058698a7665850bd055042710bcb25391cae564 100644
--- a/notebooks/EDdA_Classification_BertFineTuning.ipynb
+++ b/notebooks/EDdA_Classification_BertFineTuning.ipynb
@@ -62,17 +62,7 @@
     "id": "WF0qFN_g3ekz",
     "outputId": "445ffd96-843b-4ff1-a24d-c110964a63e4"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Your runtime has 27.3 gigabytes of available RAM\n",
-      "\n",
-      "You are using a high-RAM runtime!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from psutil import virtual_memory\n",
     "ram_gb = virtual_memory().total / 1e9\n",
@@ -94,15 +84,7 @@
     "id": "vL0S-s9Uofvn",
     "outputId": "415b7bf1-d3fd-42b6-ee03-13601c953a4f"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Mounted at /content/drive\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from google.colab import drive\n",
     "drive.mount('/content/drive')"
@@ -127,16 +109,7 @@
     "id": "dPOU-Efhf4ui",
     "outputId": "fc873e0c-1254-4928-c8e9-e3eb093acc64"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "There are 1 GPU(s) available.\n",
-      "We will use the GPU: Tesla P100-PCIE-16GB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import torch\n",
     "\n",
@@ -175,57 +148,7 @@
     "id": "pwmZ5bBvgGNh",
     "outputId": "e92404c6-af38-4bd8-8c99-20ec6b545b3f"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting transformers==4.10.3\n",
-      "  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2.8 MB 5.0 MB/s \n",
-      "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n",
-      "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3.3 MB 38.8 MB/s \n",
-      "\u001b[?25hCollecting pyyaml>=5.1\n",
-      "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 596 kB 58.6 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2019.12.20)\n",
-      "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.62.3)\n",
-      "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2.23.0)\n",
-      "Collecting huggingface-hub>=0.0.12\n",
-      "  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 61 kB 486 kB/s \n",
-      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (3.4.0)\n",
-      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (1.19.5)\n",
-      "Collecting sacremoses\n",
-      "  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 895 kB 43.3 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (21.3)\n",
-      "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.8.2)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers==4.10.3) (3.10.0.2)\n",
-      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.10.3) (3.0.6)\n",
-      "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.10.3) (3.6.0)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2.10)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (1.24.3)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2021.10.8)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (3.0.4)\n",
-      "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (7.1.2)\n",
-      "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.15.0)\n",
-      "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.1.0)\n",
-      "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n",
-      "  Attempting uninstall: pyyaml\n",
-      "    Found existing installation: PyYAML 3.13\n",
-      "    Uninstalling PyYAML-3.13:\n",
-      "      Successfully uninstalled PyYAML-3.13\n",
-      "Successfully installed huggingface-hub-0.2.1 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.10.3\n",
-      "Collecting sentencepiece\n",
-      "  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
-      "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.2 MB 5.1 MB/s \n",
-      "\u001b[?25hInstalling collected packages: sentencepiece\n",
-      "Successfully installed sentencepiece-0.1.96\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!pip install transformers==4.10.3\n",
     "!pip install sentencepiece"
@@ -384,16 +307,7 @@
     "id": "zj3JDoJNfx1f",
     "outputId": "59262e3f-5fe0-49f5-bb55-8586653498ab"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(30650, 13)\n",
-      "(10947, 13)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(df_train.shape)\n",
     "print(df_validation.shape)"
@@ -417,17 +331,6 @@
     "y_validation = encoder.fit_transform(y_validation)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "u9AxxaA_h1CM"
-   },
-   "outputs": [],
-   "source": [
-    "#train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -443,39 +346,6 @@
     "labels_validation = y_validation.tolist()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "Dq_KF5WAsbpC",
-    "outputId": "ba91b953-abcb-4bed-a5c5-9e429e68239a"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([\"\\nESTAMPEUR, s. m. en , est une\\nsorte de pilon de bois, surmontÃ© d'un manche d'environ \\ndeux piÃ©s & demi. On s'en sert pour estamper\\nles formes oÃ¹ l'on veut faire des vergeoises. Voyez\\nVergeoise & Estamper.\\n\",\n",
-       "       \"\\nOn doit Ã©bourgeonner les vignes, alors ce mot doit\\ns'entendre autrement que pour les arbres fruitiers:\\non Ã©bourgeonne les vignes. non-seulement quand on\\nsupprime les bourgeons surnumÃ©raires, mais encore\\nquand on arrÃªte par-en-haut les bourgeons. Il en est\\nde mÃªme quand on dÃ©tache en cassant les faux bourgeons \\nqui poussent d'ordinaire Ã  chaque noeud Ã \\ncÃ´tÃ© des yeux, Ã  commencer par le bas. (K)\\n\",\n",
-       "       \"\\nBois mort en piÃ©, s'il est pourri sur piÃ©, sans\\nsubstance, & bon seulement Ã  brÃ»ler.\\n\",\n",
-       "       ...,\n",
-       "       \"\\nIl y a une hydatoscopie naturelle & permise ; elle\\nconsiste Ã  prÃ©voir & Ã  prÃ©dire les orages & les tempÃªtes \\nsur certains signes qu'on remarque dans la mer,\\ndans l'air, & dans les nuages. Voyez Tems & Ouragans. Dict. de TrÃ©voux.\\n\",\n",
-       "       \"\\nMÃ‰TÃ‰OROMANCIE, s.f. () divination par\\nles mÃ©tÃ©ores ; & comme les mÃ©tÃ©ores ignÃ©s sont ceux\\nqui jettent le plus de crainte parmi les hommes, la\\nmÃ©tÃ©oromancie dÃ©signe proprement la divination par\\nle tonnerre & les Ã©clairs. Cette espece de divination\\npassa des Toscans aux Romains, sons rien perdre de\\nce qu'elle avoit de frivole. Seneque nous apprend\\nque deux auteurs graves, & qui avoient exercÃ© des\\n\\nmagistratures, Ã©crivoient Ã  Rome sur cette matiere.\\nIl semble mÃªme que l'un d'eux l'Ã©puisa entierement,\\ncar il donnoit une liste exacte des diffÃ©rentes especes\\nde tonnerres. Il circonstancioit & leurs noms & les\\nprognostics qui s'en pouvoient tirer ; le tout avec un\\nair de confiance plus surprenant encore que les choses\\nqu'il rapportoit. On eÃ»t dit, tant cette matiere mÃ©tÃ©orologique lui Ã©toit familiere, qu'il comptoit les tableaux \\nde sa galerie, ou qu'il faisoit la description\\ndes fleurs de son jardin. La plus ancienne maladie,\\nla plus invÃ©tÃ©rÃ©e, la plus incurable du genre humain,\\nc'est l'envie de connoÃ®tre ce qui doit arriver.\\nNi le voile obscur qui nous cache notre destinÃ©e, ni\\nl'expÃ©rience journaliere, ni une infinitÃ© de tentatives \\nmalheureuses, n'ont pÃ» guerir les hommes. HÃ©!\\nse dÃ©prÃ©viennent-ils jamais d'une erreur agrÃ©ablement \\nreÃ§ue? Nous sommes sur ce point aussi crÃ©dules\\nque nos ancÃªtres ; nous prÃªtons comme eux l'oreille\\nÃ  toutes les impostures flatteuses. Pour avoir trompÃ©\\ncent fois, elles n'ont point perdu le droit funeste de\\ntromper encore. (D. J.)\\n\",\n",
-       "       \"\\nPENTACLE, s. m. () c'est le nom que la\\nmagie des exorcismes donne Ã  un sceau imprimÃ© ou\\nsur du parchemin vierge fait de peau de bouc, ou\\nsur quelque mÃ©tal, or, argent, cuivre, Ã©tain, plomb,\\n&c. On ne peut faire aucune opÃ©ration magique pour\\nexorciser les esprits, sans avoir ce sceau qui contient\\nles noms de Dieu. Le pentacle se fait en renfermant\\nun triangle dans deux cercles : on lit dans ce triangle \\nces trois mots ; formatio, reformatio, transformatio. A cÃ´tÃ© du triangle est le mot agla, qui est trÃ¨s puissant \\npour arrÃªter la malice des esprits. Il faut que\\nla peau sur laquelle on applique le sceau soit exorcisÃ©e \\n& bÃ©nite. On exorcise aussi l'encre & la plume,\\ndont on se sert pour Ã©crire les mots dont on vient de\\nparler. AprÃ¨s cela on encense le pentacle ; on l'enferme \\ntrois jours & trois nuits dans un vase bien net ;\\nenfin, on le met dans un linge ou dans un livre que\\nl'on parfume & que l'on exorcise. VoilÃ  les fadaises\\nqu'on lit dans le livre intitulÃ© Encheiridion Leonis papae, ouvrage misÃ©rable, qui n'a servi qu'Ã  gÃ¢ter davantage \\nles esprits crÃ©dules & portÃ©s Ã  la superstitition.\\n(D. J.)\\n\"],\n",
-       "      dtype=object)"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sentences_train"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -548,57 +418,7 @@
     "id": "C4bigx_3ibuN",
     "outputId": "b8cef3f8-7a6c-47d1-9d37-7b3b6d08f00b"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loading CamemBERT tokenizer...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "06c6e7721b68449a9f3619ffdf18dfeb",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fba1d1d5c83b40659295a3457d74cb4e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6a29c1c28ceb415f91ec55512da981c5",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load the BERT tokenizer.\n",
     "if model_chosen == \"bert\":\n",
@@ -619,15 +439,7 @@
     "id": "5hNod5X9jDZN",
     "outputId": "93b6e633-afb7-4bcc-be00-44388f801d64"
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Token indices sequence length is longer than the specified maximum sequence length for this model (1263 > 512). Running this sequence through the model will result in indexing errors\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     " # Tokenize all of the sentences and map the tokens to thier word IDs.\n",
     "input_ids_train = []\n",
@@ -685,16 +497,7 @@
     "id": "W9EWv5JvjGH3",
     "outputId": "32cd417d-9a40-4086-d900-b81982407667"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Max sentence length train:  2253\n",
-      "Max sentence length validation:  3067\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))\n",
     "print('Max sentence length validation: ', max([len(sen) for sen in input_ids_validation])) "
@@ -862,338 +665,7 @@
     "id": "C7M2Er1ajsTf",
     "outputId": "2c3f467d-ab09-4f8f-d464-a4e738333587"
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4873cc6c9e1d493c9a67d6536e4367a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n",
-      "- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-      "Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "CamembertForSequenceClassification(\n",
-       "  (roberta): RobertaModel(\n",
-       "    (embeddings): RobertaEmbeddings(\n",
-       "      (word_embeddings): Embedding(32005, 768, padding_idx=1)\n",
-       "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
-       "      (token_type_embeddings): Embedding(1, 768)\n",
-       "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "      (dropout): Dropout(p=0.1, inplace=False)\n",
-       "    )\n",
-       "    (encoder): RobertaEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (classifier): RobertaClassificationHead(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (dropout): Dropout(p=0.1, inplace=False)\n",
-       "    (out_proj): Linear(in_features=768, out_features=38, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 51,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load BertForSequenceClassification, the pretrained BERT model with a single \n",
     "# linear classification layer on top.\n",
@@ -1267,320 +739,7 @@
     "id": "SbHBbYpwkKaA",
     "outputId": "49f7f5f4-716d-44c2-e299-505086a89061"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "======== Epoch 1 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:08.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:39.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:54.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:25.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:56.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:27.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:58.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:11.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:29.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:42.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:13.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:26.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:44.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 2.04\n",
-      "  Training epoch took: 0:20:03\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.75\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 2 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:38.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:27.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:58.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:29.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:44.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 1.03\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.79\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 3 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:38.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:09.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:26.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:57.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:28.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:43.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 0.75\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.79\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 4 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:39.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:26.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:57.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:28.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:46.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:16:59.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:17.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:30.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:48.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:43.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:01.\n",
-      "\n",
-      "  Average training loss: 0.60\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.80\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "Training complete!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# This training code is based on the `run_glue.py` script here:\n",
     "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
@@ -1938,17 +1097,6 @@
     "  return pred_labels_, true_labels_"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "AJ0suC8iMs8a"
-   },
-   "outputs": [],
-   "source": [
-    "dataset_name = [\"validation\", \"test\"]"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1957,59 +1105,82 @@
    },
    "outputs": [],
    "source": [
-    "for dataset in dataset_name:\n",
+    "\n",
+    "for dataset in [\"test\", \"validation\"]:\n",
     "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+    "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # drop rows with a missing class label\n",
+    "  \n",
     "  data_eval = df_eval[columnText].values\n",
     "\n",
     "  y = df_eval[columnClass]\n",
+    "\n",
+    "  \n",
+    "  \n",
     "  encoder = preprocessing.LabelEncoder()\n",
     "  y = encoder.fit_transform(y)\n",
     "  labels = y.tolist()\n",
     "\n",
-    "  pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n",
-    "\n",
-    "\n",
-    "  report = classification_report( pred_labels_, true_labels_, output_dict = True)\n",
+    "  # for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+    "  for maxOfInstancePerClass in [500]:\n",
+    "    # the BERT s500 model is missing ...\n",
+    "    \n",
+    "    #for model_bert in [\"camembert-base\", \"bert-base-multilingual-cased\"]:\n",
+    "    for model_bert in [\"bert-base-multilingual-cased\"]:\n",
     "      \n",
-    "  classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
-    "  classesName = encoder.classes_\n",
-    "\n",
-    "  precision = []\n",
-    "  recall = []\n",
-    "  f1 = []\n",
-    "  support = []\n",
-    "  dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-    "  for c in classes:\n",
-    "    precision.append(report[c]['precision'])\n",
-    "    recall.append(report[c]['recall'])\n",
-    "    f1.append(report[c]['f1-score'])\n",
-    "    support.append(report[c]['support'])\n",
-    "\n",
-    "  accuracy = report['accuracy']\n",
-    "  weighted_avg = report['weighted avg']\n",
-    "  cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
-    "  FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-    "  FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-    "  TP = np.diag(cnf_matrix)\n",
-    "  TN = cnf_matrix.sum() - (FP + FN + TP)\n",
-    "\n",
-    "  dff['className'] = classesName\n",
-    "  dff['precision'] = precision\n",
-    "  dff['recall'] = recall\n",
-    "  dff['f1-score'] = f1\n",
-    "  dff['support'] = support\n",
-    "  dff['FP'] = FP\n",
-    "  dff['FN'] = FN\n",
-    "  dff['TP'] = TP\n",
-    "  dff['TN'] = TN\n",
-    "\n",
-    "  print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n",
-    "\n",
-    "  print(weighted_avg)\n",
-    "  print(accuracy)\n",
-    "  print(dff)\n",
-    "\n",
-    "  dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)"
+    "      model_path = \"drive/MyDrive/Classification-EDdA/model_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".pt\"\n",
+    "      model = torch.load(model_path)\n",
+    "\n",
+    "      if model_bert == \"bert-base-multilingual-cased\":\n",
+    "        tokenizer = BertTokenizer.from_pretrained(model_bert)\n",
+    "      elif model_bert == \"camembert-base\":\n",
+    "        tokenizer = CamembertTokenizer.from_pretrained(model_bert)\n",
+    "\n",
+    "      pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n",
+    "\n",
+    "\n",
+    "      report = classification_report(true_labels_, pred_labels_,  output_dict = True)\n",
+    "          \n",
+    "      classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+    "      classesName = encoder.classes_\n",
+    "\n",
+    "      precision = []\n",
+    "      recall = []\n",
+    "      f1 = []\n",
+    "      support = []\n",
+    "      dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+    "      for c in classes:\n",
+    "        precision.append(report[c]['precision'])\n",
+    "        recall.append(report[c]['recall'])\n",
+    "        f1.append(report[c]['f1-score'])\n",
+    "        support.append(report[c]['support'])\n",
+    "\n",
+    "      accuracy = report['accuracy']\n",
+    "      weighted_avg = report['weighted avg']\n",
+    "      cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
+    "      FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+    "      FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+    "      TP = np.diag(cnf_matrix)\n",
+    "      TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+    "\n",
+    "      dff['className'] = classesName\n",
+    "      dff['precision'] = precision\n",
+    "      dff['recall'] = recall\n",
+    "      dff['f1-score'] = f1\n",
+    "      dff['support'] = support\n",
+    "      dff['FP'] = FP\n",
+    "      dff['FN'] = FN\n",
+    "      dff['TP'] = TP\n",
+    "      dff['TN'] = TN\n",
+    "\n",
+    "      print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n",
+    "\n",
+    "      print(weighted_avg)\n",
+    "      print(accuracy)\n",
+    "      print(dff)\n",
+    "\n",
+    "      dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+    "      # save the predictions (the predictions/ folder must already exist)\n",
+    "      pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n"
    ]
   },
   {
@@ -2065,957 +1236,6 @@
    },
    "outputs": [],
    "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "x_n57EvhJMQh"
-   },
-   "outputs": [],
-   "source": [
-    "model_path = \"drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "R3_9tA9MI8ju"
-   },
-   "outputs": [],
-   "source": [
-    "model = torch.load(model_path)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "_fzgS5USJeAF",
-    "outputId": "be4a5506-76ed-4eef-bb3c-fe2bb77c6e4d"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "--2021-09-30 19:38:22--  https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv\n",
-      "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n",
-      "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n",
-      "HTTP request sent, awaiting response... 200 OK\n",
-      "Length: 356197 (348K) [text/tab-separated-values]\n",
-      "Saving to: â€˜LGE_withContent.tsvâ€™\n",
-      "\n",
-      "LGE_withContent.tsv 100%[===================>] 347.85K   567KB/s    in 0.6s    \n",
-      "\n",
-      "2021-09-30 19:38:24 (567 KB/s) - â€˜LGE_withContent.tsvâ€™ saved [356197/356197]\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "8WEJjQC7I8mP"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE = pd.read_csv(\"LGE_withContent.tsv\", sep=\"\\t\")\n",
-    "data_LGE = df_LGE[\"content\"].values\n",
-    "\n",
-    "\n",
-    "#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 206
-    },
-    "id": "9qJDTU-6vzkk",
-    "outputId": "1b279f0e-7715-4d23-f524-08e8ba327f6c"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>tome</th>\n",
-       "      <th>rank</th>\n",
-       "      <th>domain</th>\n",
-       "      <th>remark</th>\n",
-       "      <th>content</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>abrabeses-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>623</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ABRABESES. Village dâ€™Espagne de la prov. de Za...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>accius-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1076</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>achenbach-2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1357</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACHENBACH(Henri), administrateur prussien, nÃ© ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>acireale-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1513</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>actÃ©e-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1731</td>\n",
-       "      <td>botany</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACTÃ‰E(ActÅ“a L.). Genre de plantes de la famill...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            id  tome  ...  remark                                            content\n",
-       "0  abrabeses-0     1  ...     NaN  ABRABESES. Village dâ€™Espagne de la prov. de Za...\n",
-       "1     accius-0     1  ...     NaN  ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...\n",
-       "2  achenbach-2     1  ...     NaN  ACHENBACH(Henri), administrateur prussien, nÃ© ...\n",
-       "3   acireale-0     1  ...     NaN  ACIREALE. Yille de Sicile, de la province et d...\n",
-       "4      actÃ©e-0     1  ...     NaN  ACTÃ‰E(ActÅ“a L.). Genre de plantes de la famill...\n",
-       "\n",
-       "[5 rows x 6 columns]"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "71-fP61-OOwQ",
-    "outputId": "ef08b49e-0a9f-4653-e303-3163250af35b"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(310, 6)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "lFFed2EAI8oq"
-   },
-   "outputs": [],
-   "source": [
-    "def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):\n",
-    "\n",
-    "    if chosen_model == 'bert-base-multilingual-cased' :\n",
-    "        print('Loading Bert Tokenizer...')\n",
-    "        tokenizer = BertTokenizer.from_pretrained(chosen_model)\n",
-    "    elif chosen_model == 'camembert-base':\n",
-    "        print('Loading Camembert Tokenizer...')\n",
-    "        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)\n",
-    "\n",
-    "    # Tokenize all of the sentences and map the tokens to thier word IDs.\n",
-    "    input_ids_test = []\n",
-    "    # For every sentence...\n",
-    "    for sent in sentences_to_predict:\n",
-    "        # `encode` will:\n",
-    "        #   (1) Tokenize the sentence.\n",
-    "        #   (2) Prepend the `[CLS]` token to the start.\n",
-    "        #   (3) Append the `[SEP]` token to the end.\n",
-    "        #   (4) Map tokens to their IDs.\n",
-    "        encoded_sent = tokenizer.encode(\n",
-    "                            sent,                      # Sentence to encode.\n",
-    "                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
-    "                    )\n",
-    "\n",
-    "        input_ids_test.append(encoded_sent)\n",
-    "\n",
-    "    # Pad our input tokens\n",
-    "    padded_test = []\n",
-    "    for i in input_ids_test:\n",
-    "\n",
-    "        if len(i) > max_len:\n",
-    "            padded_test.extend([i[:max_len]])\n",
-    "        else:\n",
-    "\n",
-    "            padded_test.extend([i + [0] * (max_len - len(i))])\n",
-    "    input_ids_test = np.array(padded_test)\n",
-    "\n",
-    "    # Create attention masks\n",
-    "    attention_masks = []\n",
-    "\n",
-    "    # Create a mask of 1s for each token followed by 0s for padding\n",
-    "    for seq in input_ids_test:\n",
-    "        seq_mask = [float(i>0) for i in seq]\n",
-    "        attention_masks.append(seq_mask)\n",
-    "\n",
-    "    # Convert to tensors.\n",
-    "    prediction_inputs = torch.tensor(input_ids_test)\n",
-    "    prediction_masks = torch.tensor(attention_masks)\n",
-    "    #set batch size\n",
-    "\n",
-    "\n",
-    "    # Create the DataLoader.\n",
-    "    prediction_data = TensorDataset(prediction_inputs, prediction_masks)\n",
-    "    prediction_sampler = SequentialSampler(prediction_data)\n",
-    "    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n",
-    "\n",
-    "    return prediction_dataloader\n",
-    "\n",
-    "\n",
-    "\n",
-    "def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):\n",
-    "\n",
-    "\n",
-    "    # If there's a GPU available...\n",
-    "    if torch.cuda.is_available():\n",
-    "\n",
-    "        # Tell PyTorch to use the GPU.\n",
-    "        device = torch.device(\"cuda\")\n",
-    "\n",
-    "        print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
-    "\n",
-    "        print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
-    "\n",
-    "        # If not...\n",
-    "    else:\n",
-    "        print('No GPU available, using the CPU instead.')\n",
-    "        device = torch.device(\"cpu\")\n",
-    "\n",
-    "    # Put model in evaluation mode\n",
-    "    model.eval()\n",
-    "\n",
-    "    # Tracking variables\n",
-    "    predictions_test , true_labels = [], []\n",
-    "    pred_labels_ = []\n",
-    "    # Predict\n",
-    "    for batch in sentences_to_predict_dataloader:\n",
-    "    # Add batch to GPU\n",
-    "        batch = tuple(t.to(device) for t in batch)\n",
-    "\n",
-    "        # Unpack the inputs from the dataloader\n",
-    "        b_input_ids, b_input_mask = batch\n",
-    "\n",
-    "        # Telling the model not to compute or store gradients, saving memory and\n",
-    "        # speeding up prediction\n",
-    "        with torch.no_grad():\n",
-    "            # Forward pass, calculate logit predictions\n",
-    "            outputs = model(b_input_ids, token_type_ids=None,\n",
-    "                            attention_mask=b_input_mask)\n",
-    "\n",
-    "        logits = outputs[0]\n",
-    "        #print(logits)\n",
-    "\n",
-    "        # Move logits and labels to CPU\n",
-    "        logits = logits.detach().cpu().numpy()\n",
-    "        #print(logits)\n",
-    "\n",
-    "        # Store predictions and true labels\n",
-    "        predictions_test.append(logits)\n",
-    "\n",
-    "        #print('    DONE.')\n",
-    "\n",
-    "        pred_labels = []\n",
-    "        \n",
-    "        for i in range(len(predictions_test)):\n",
-    "\n",
-    "            # The predictions for this batch are a 2-column ndarray (one column for \"0\"\n",
-    "            # and one column for \"1\"). Pick the label with the highest value and turn this\n",
-    "            # in to a list of 0s and 1s.\n",
-    "            pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n",
-    "            pred_labels.append(pred_labels_i)\n",
-    "\n",
-    "    pred_labels_ += [item for sublist in pred_labels for item in sublist]\n",
-    "    return pred_labels_\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "O9eer_kgI8rC",
-    "outputId": "94ea7418-14a8-4918-e210-caf0018f5989"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loading Bert Tokenizer...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors\n"
-     ]
-    }
-   ],
-   "source": [
-    "data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)\n",
-    "#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "sFpAwbrBwF2h",
-    "outputId": "8d210732-619d-41f0-b6e2-ad9d06a85069"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "There are 1 GPU(s) available.\n",
-      "We will use the GPU: Tesla P100-PCIE-16GB\n"
-     ]
-    }
-   ],
-   "source": [
-    "p = predict_class_bertFineTuning( model, data_loader )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "51HF6-8UPSTc",
-    "outputId": "26bff792-eb8d-4e1a-efa4-a7a6c9d32bf9"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "310"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(p)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "rFFGhaCvQHfh"
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "qgJ-O4rcQHiI",
-    "outputId": "bfe93dd6-4d89-4d5c-be0d-45e1c98c6b14"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "LabelEncoder()"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Il faudrait enregistrer l'encoder, \n",
-    "# sinon on est obligÃ© de le refaire Ã  partir du jeu d'entrainement pour rÃ©cupÃ©rer le noms des classes.\n",
-    "encoder"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "QuST9wJoQHnS"
-   },
-   "outputs": [],
-   "source": [
-    "p2 = list(encoder.inverse_transform(p))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "6ek7suq9QHqE",
-    "outputId": "6636983a-7eba-48c8-d884-f8fb437294dc"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'MathÃ©matiques',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Musique',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Physique - [Sciences physico-mathÃ©matiques]',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'Histoire',\n",
-       " 'Physique - [Sciences physico-mathÃ©matiques]',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'Histoire naturelle',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Architecture',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Arts et mÃ©tiers',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Marine',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Architecture',\n",
-       " 'Histoire naturelle',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'Religion',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Jeu',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire',\n",
-       " 'Histoire naturelle',\n",
-       " 'Commerce',\n",
-       " 'Histoire',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Religion',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©tiers',\n",
-       " 'Belles-lettres - PoÃ©sie',\n",
-       " 'Beaux-arts',\n",
-       " 'Religion',\n",
-       " 'Architecture',\n",
-       " 'Architecture',\n",
-       " 'Architecture',\n",
-       " 'GÃ©ographie',\n",
-       " 'Chimie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire naturelle',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MinÃ©ralogie',\n",
-       " 'Belles-lettres - PoÃ©sie',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Grammaire',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MathÃ©matiques',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'Blason',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Histoire naturelle',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'GÃ©ographie',\n",
-       " 'AntiquitÃ©',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'Chimie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Belles-lettres - PoÃ©sie',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©tiers',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Arts et mÃ©tiers',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Musique',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'Religion',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'Droit - Jurisprudence',\n",
-       " 'Histoire',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Chimie',\n",
-       " 'AntiquitÃ©',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'AntiquitÃ©',\n",
-       " 'Grammaire',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'Architecture',\n",
-       " 'Commerce',\n",
-       " 'AntiquitÃ©',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Anatomie',\n",
-       " 'Commerce',\n",
-       " 'Beaux-arts',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'Histoire naturelle',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'Architecture',\n",
-       " 'Commerce',\n",
-       " 'AntiquitÃ©',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'MÃ©decine - Chirurgie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'AntiquitÃ©',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'AntiquitÃ©',\n",
-       " 'GÃ©ographie',\n",
-       " 'Religion',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Philosophie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Chimie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'GÃ©ographie',\n",
-       " 'GÃ©ographie']"
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "p2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "XvdDj5PBQHtk"
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "t39Xs0j7QHXJ"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE['class_bert'] = p2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 206
-    },
-    "id": "-VZ7geRmQHaD",
-    "outputId": "350a4122-5b1f-43e2-e372-2f628f665c4a"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>tome</th>\n",
-       "      <th>rank</th>\n",
-       "      <th>domain</th>\n",
-       "      <th>remark</th>\n",
-       "      <th>content</th>\n",
-       "      <th>class_bert</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>abrabeses-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>623</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ABRABESES. Village dâ€™Espagne de la prov. de Za...</td>\n",
-       "      <td>GÃ©ographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>accius-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1076</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n",
-       "      <td>GÃ©ographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>achenbach-2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1357</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACHENBACH(Henri), administrateur prussien, nÃ© ...</td>\n",
-       "      <td>GÃ©ographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>acireale-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1513</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n",
-       "      <td>GÃ©ographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>actÃ©e-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1731</td>\n",
-       "      <td>botany</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACTÃ‰E(ActÅ“a L.). Genre de plantes de la famill...</td>\n",
-       "      <td>Histoire naturelle</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            id  ...          class_bert\n",
-       "0  abrabeses-0  ...          GÃ©ographie\n",
-       "1     accius-0  ...          GÃ©ographie\n",
-       "2  achenbach-2  ...          GÃ©ographie\n",
-       "3   acireale-0  ...          GÃ©ographie\n",
-       "4      actÃ©e-0  ...  Histoire naturelle\n",
-       "\n",
-       "[5 rows x 7 columns]"
-      ]
-     },
-     "execution_count": 46,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "3xkzdkrKQHwA"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE.to_csv(\"drive/MyDrive/Classification-EDdA/classification_LGE.tsv\", sep=\"\\t\")"
-   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/EDdA_Classification_ClassicModels.ipynb b/notebooks/EDdA_Classification_ClassicModels.ipynb
index fcb2ba0913c4ace29c5ba8cdda2ba0b89c1a5931..4a10ee3f243003a3359d927a5ffaf1239eddce0a 100644
--- a/notebooks/EDdA_Classification_ClassicModels.ipynb
+++ b/notebooks/EDdA_Classification_ClassicModels.ipynb
@@ -1,21 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "EDdA-Classification_ClassicModels.ipynb",
-      "provenance": [],
-      "collapsed_sections": [],
-      "machine_shape": "hm"
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
@@ -37,9 +20,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "D_uwiuJq3pAM"
       },
+      "outputs": [],
       "source": [
         "train_path = 'training_set.tsv'\n",
         "validation_path = 'validation_set.tsv'\n",
@@ -67,9 +52,7 @@
         "doc2vec_min_count = 12\n",
         "doc2vec_dm = 0\n",
         "doc2vec_workers = 8"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -82,6 +65,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -89,6 +73,7 @@
         "id": "FsAR4CsB3aUc",
         "outputId": "a5e4efde-a5c9-45f9-ef1c-9223b4d52ac6"
       },
+      "outputs": [],
       "source": [
         "from psutil import virtual_memory\n",
         "ram_gb = virtual_memory().total / 1e9\n",
@@ -98,22 +83,11 @@
         "  print('Not using a high-RAM runtime')\n",
         "else:\n",
         "  print('You are using a high-RAM runtime!')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Your runtime has 27.3 gigabytes of available RAM\n",
-            "\n",
-            "You are using a high-RAM runtime!\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -121,19 +95,10 @@
         "id": "h5MwRwL53aYY",
         "outputId": "bc4c4c16-fb20-404a-e044-550fc4ca907d"
       },
+      "outputs": [],
       "source": [
         "from google.colab import drive\n",
         "drive.mount('/content/drive')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Mounted at /content/drive\n"
-          ]
-        }
       ]
     },
     {
@@ -147,6 +112,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -154,6 +120,7 @@
         "id": "bcptSr6o3ac7",
         "outputId": "19713482-dfeb-4be3-e63c-35b4253cb9e5"
       },
+      "outputs": [],
       "source": [
         "import pandas as pd\n",
         "import numpy as np\n",
@@ -181,33 +148,11 @@
         "import string\n",
         "nltk.download('stopwords')\n",
         "nltk.download('punkt')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
-            "[nltk_data]   Package stopwords is already up-to-date!\n",
-            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
-            "[nltk_data]   Package punkt is already up-to-date!\n"
-          ]
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "True"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 3
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -215,50 +160,9 @@
         "id": "dwSVXDtWZB5H",
         "outputId": "44e2aa14-726f-43af-aa6a-1b7899e1025b"
       },
+      "outputs": [],
       "source": [
         "!python -m spacy download fr_core_news_sm"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Collecting fr_core_news_sm==2.2.5\n",
-            "  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)\n",
-            "\u001b[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 14.7 MB 5.5 MB/s \n",
-            "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from fr_core_news_sm==2.2.5) (2.2.4)\n",
-            "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.8.2)\n",
-            "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (57.4.0)\n",
-            "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.1.3)\n",
-            "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.6)\n",
-            "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.5)\n",
-            "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.23.0)\n",
-            "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.0)\n",
-            "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.4.1)\n",
-            "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.0.6)\n",
-            "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (7.4.0)\n",
-            "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.62.3)\n",
-            "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.19.5)\n",
-            "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.6)\n",
-            "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.8.2)\n",
-            "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.10.0.2)\n",
-            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.6.0)\n",
-            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.10)\n",
-            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2021.10.8)\n",
-            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.4)\n",
-            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.24.3)\n",
-            "Building wheels for collected packages: fr-core-news-sm\n",
-            "  Building wheel for fr-core-news-sm (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
-            "  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-py3-none-any.whl size=14727026 sha256=994d176b35663506dd047e65863238d29b9b60313ba0dee5997c107f116477aa\n",
-            "  Stored in directory: /tmp/pip-ephem-wheel-cache-c8y7i3ag/wheels/c9/a6/ea/0778337c34660027ee67ef3a91fb9d3600b76777a912ea1c24\n",
-            "Successfully built fr-core-news-sm\n",
-            "Installing collected packages: fr-core-news-sm\n",
-            "Successfully installed fr-core-news-sm-2.2.5\n",
-            "\u001b[38;5;2mâœ” Download and installation successful\u001b[0m\n",
-            "You can now load the model via spacy.load('fr_core_news_sm')\n"
-          ]
-        }
       ]
     },
     {
@@ -272,9 +176,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "Tunf_CYi3afO"
       },
+      "outputs": [],
       "source": [
         "def create_dict(df, classColumnName):\n",
         "    return dict(df[classColumnName].value_counts())\n",
@@ -328,9 +234,7 @@
         "  model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)\n",
         "  return model\n",
         "  #return np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -343,28 +247,28 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "ybiJYL0h3ahh"
       },
+      "outputs": [],
       "source": [
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "LRKJzWmf3pCg"
       },
+      "outputs": [],
       "source": [
         "df_train = pd.read_csv(train_path, sep=\"\\t\")\n",
         "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -377,9 +281,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "6QQXybaQ3pE9"
       },
+      "outputs": [],
       "source": [
         "data_train = df_train[columnText].tolist()\n",
         "vectorizer_dic = {}\n",
@@ -423,9 +329,7 @@
         "      pickle.dump(vectorizer, file)\n",
         "    \n",
         "  vectorizer_dic[vectorizer_name] = vectorizer    "
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -438,9 +342,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "rx_0eV-M3pHc"
       },
+      "outputs": [],
       "source": [
         "classifier_dic = {}\n",
         "grid_param = {}\n",
@@ -460,15 +366,15 @@
         "  elif classifier_name == \"rfc\":\n",
         "    classifier_dic[classifier_name] = RandomForestClassifier()\n",
         "    grid_param[classifier_name] = { 'max_features': ['sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]}\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "pO7oyeAF7KPK"
       },
+      "outputs": [],
       "source": [
         "for clf_name, clf in classifier_dic.items():\n",
         "  if clf_name != 'bayes' :\n",
@@ -488,9 +394,7 @@
         "    # saving classifier\n",
         "    with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'wb') as file:\n",
         "      pickle.dump(clf, file)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -503,17 +407,18 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "TfKAjtVFblYe"
       },
+      "outputs": [],
       "source": [
         "dataset_name = [\"validation\", \"test\"]"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -521,341 +426,108 @@
         "id": "h8vZar8c7KRq",
         "outputId": "83511c89-9219-43d1-9e5a-820e75012166"
       },
+      "outputs": [],
       "source": [
         "for dataset in dataset_name:\n",
         "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+        "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # supprimer les NaN...\n",
         "  data_eval = df_eval[columnText].tolist()\n",
         "\n",
-        "  for classifier_name in classifier_list:\n",
+        "  for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+        "    \n",
+        "\n",
+        "    for classifier_name in classifier_list:\n",
         "\n",
-        "    for vectorizer_name in vectorizer_list:\n",
+        "      for vectorizer_name in vectorizer_list:\n",
         "\n",
-        "      clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
-        "      with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n",
-        "        clf = pickle.load(file)\n",
+        "        clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
+        "        with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n",
+        "          clf = pickle.load(file)\n",
         "\n",
-        "      vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
-        "      with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n",
-        "        vectorizer = pickle.load(file)\n",
+        "        vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
+        "        with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n",
+        "          vectorizer = pickle.load(file)\n",
         "\n",
-        "      if vectorizer_name != 'doc2vec' :\n",
-        "        vec_data = vectorizer.transform(data_eval)\n",
-        "      else : \n",
-        "        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n",
-        "        vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n",
+        "        if vectorizer_name != 'doc2vec' :\n",
+        "          vec_data = vectorizer.transform(data_eval)\n",
+        "        else : \n",
+        "          tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n",
+        "          vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n",
         "\n",
         "\n",
-        "      y_pred = clf.predict(vec_data)\n",
+        "        y_pred = clf.predict(vec_data)\n",
         "\n",
         "\n",
-        "      report = classification_report(y_pred, df_eval[columnClass], output_dict = True)\n",
-        "      precision = []\n",
-        "      recall = []\n",
-        "      f1 = []\n",
-        "      support = []\n",
-        "      dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-        "      for c in df_eval[columnClass].unique() :\n",
-        "        precision.append(report[c]['precision'])\n",
-        "        recall.append(report[c]['recall'])\n",
-        "        f1.append(report[c]['f1-score'])\n",
-        "        support.append(report[c]['support'])\n",
+        "        # Sorted union of true and predicted labels. confusion_matrix orders its rows/columns this way,\n",
+        "        # so every per-class list below is built in the same order to keep metrics and counts aligned.\n",
+        "        labels = np.union1d(df_eval[columnClass], y_pred)\n",
+        "        report = classification_report(df_eval[columnClass], y_pred, output_dict = True)\n",
+        "        dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+        "        # Classes that were predicted but never occur in the gold labels are kept (support 0),\n",
+        "        # otherwise the columns below would not have the same length as the confusion matrix.\n",
+        "        precision = [report[c]['precision'] for c in labels]\n",
+        "        recall = [report[c]['recall'] for c in labels]\n",
+        "        f1 = [report[c]['f1-score'] for c in labels]\n",
+        "        support = [report[c]['support'] for c in labels]\n",
         "\n",
-        "      accuracy = report['accuracy']\n",
-        "      weighted_avg = report['weighted avg']\n",
-        "      cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred)\n",
-        "      FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-        "      FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-        "      TP = np.diag(cnf_matrix)\n",
-        "      TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
+        "        cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred, labels=labels)\n",
+        "        FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+        "        FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+        "        TP = np.diag(cnf_matrix)\n",
+        "        TN = cnf_matrix.sum() - (FP + FN + TP)\n",
         "\n",
-        "      dff['class'] = df_eval[columnClass].unique()\n",
-        "      dff['precision'] = precision\n",
-        "      dff['recall'] = recall\n",
-        "      dff['f1-score'] = f1\n",
-        "      dff['support'] = support\n",
-        "      dff['FP'] = FP\n",
-        "      dff['FN'] = FN\n",
-        "      dff['TP'] = TP\n",
-        "      dff['TN'] = TN\n",
+        "        dff['class'] = labels\n",
+        "        dff['precision'] = precision\n",
+        "        dff['recall'] = recall\n",
+        "        dff['f1-score'] = f1\n",
+        "        dff['support'] = support\n",
+        "        dff['FP'] = FP\n",
+        "        dff['FN'] = FN\n",
+        "        dff['TP'] = TP\n",
+        "        dff['TN'] = TN\n",
         "\n",
         "\n",
-        "      print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n",
+        "        print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n",
         "\n",
-        "      print(weighted_avg)\n",
-        "      print(accuracy)\n",
-        "      print(dff)\n",
+        "        print(weighted_avg)\n",
+        "        print(accuracy)\n",
+        "        print(dff)\n",
         "\n",
-        "      dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
-        "\n"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "validation_bayes_bagofwords_s10000\n",
-            "{'precision': 0.8377945389222964, 'recall': 0.619530464967571, 'f1-score': 0.6842670335331308, 'support': 10947}\n",
-            "0.619530464967571\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                         Droit - Jurisprudence   0.963590  ...     5  10735\n",
-            "1                                     Grammaire   0.321888  ...    46  10760\n",
-            "2                            Histoire naturelle   0.938776  ...    55  10665\n",
-            "3                                      Commerce   0.310249  ...    42  10679\n",
-            "4                                    GÃ©ographie   0.958193  ...     0  10839\n",
-            "5                                  Architecture   0.158491  ...     0  10863\n",
-            "6                                       Monnaie   0.000000  ...     4  10751\n",
-            "7                          MÃ©decine - Chirurgie   0.735981  ...     3  10860\n",
-            "8                                       MÃ©tiers   0.917495  ...     0  10925\n",
-            "9               Militaire (Art) - Guerre - Arme   0.182186  ...     1  10845\n",
-            "10                                     Anatomie   0.245989  ...     1  10853\n",
-            "11                                          Jeu   0.000000  ...   112  10553\n",
-            "12                                    Pharmacie   0.000000  ...  1138   9191\n",
-            "13                                    AntiquitÃ©   0.209125  ...     0  10921\n",
-            "14                      Belles-lettres - PoÃ©sie   0.020513  ...   150  10358\n",
-            "15              Agriculture - Economie rustique   0.023585  ...  2269   8114\n",
-            "16                                MathÃ©matiques   0.142857  ...   357   9728\n",
-            "17                                   Beaux-arts   0.000000  ...   874   9278\n",
-            "18  Physique - [Sciences physico-mathÃ©matiques]   0.364372  ...     0  10893\n",
-            "19                                       Marine   0.410468  ...   149  10579\n",
-            "20                                       Chasse   0.009804  ...     5  10850\n",
-            "21                              Arts et mÃ©tiers   0.000000  ...    18  10819\n",
-            "22                                     Religion   0.526646  ...     0  10912\n",
-            "23                                       Blason   0.034483  ...    45  10699\n",
-            "24                                        PÃªche   0.025641  ...     0  10926\n",
-            "25                                     Histoire   0.603041  ...     0  10886\n",
-            "26                           MarÃ©chage - ManÃ¨ge   0.051546  ...    11  10814\n",
-            "27                                       Mesure   0.000000  ...     0  10924\n",
-            "28                          Economie domestique   0.000000  ...   315  10264\n",
-            "29                                  Philosophie   0.000000  ...   923   8722\n",
-            "30                                 Superstition   0.000000  ...     0  10888\n",
-            "31                                       Chimie   0.010638  ...     0  10854\n",
-            "32                                    MÃ©dailles   0.000000  ...    90  10659\n",
-            "33                                      Musique   0.082707  ...     0  10925\n",
-            "34                                   CaractÃ¨res   0.000000  ...     1  10908\n",
-            "35                                    Spectacle   0.000000  ...   168  10570\n",
-            "36                                  MinÃ©ralogie   0.000000  ...     0  10938\n",
-            "37                                    Politique   0.000000  ...     0  10926\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "validation_bayes_tf_idf_s10000\n",
-            "{'precision': 0.9361172330822201, 'recall': 0.48853567187357266, 'f1-score': 0.6289575972884817, 'support': 10947}\n",
-            "0.48853567187357266\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                         Droit - Jurisprudence   0.922100  ...     0  10735\n",
-            "1                                     Grammaire   0.000000  ...     7  10760\n",
-            "2                            Histoire naturelle   0.888292  ...     0  10684\n",
-            "3                                      Commerce   0.036011  ...     1  10682\n",
-            "4                                    GÃ©ographie   0.995777  ...     0  10839\n",
-            "5                                  Architecture   0.003774  ...     0  10863\n",
-            "6                                       Monnaie   0.000000  ...     0  10752\n",
-            "7                          MÃ©decine - Chirurgie   0.221963  ...     0  10860\n",
-            "8                                       MÃ©tiers   0.903579  ...     0  10925\n",
-            "9               Militaire (Art) - Guerre - Arme   0.004049  ...     0  10845\n",
-            "10                                     Anatomie   0.037433  ...     0  10853\n",
-            "11                                          Jeu   0.000000  ...    13  10585\n",
-            "12                                    Pharmacie   0.000000  ...  1089   9047\n",
-            "13                                    AntiquitÃ©   0.000000  ...     0  10921\n",
-            "14                      Belles-lettres - PoÃ©sie   0.000000  ...     0  10481\n",
-            "15              Agriculture - Economie rustique   0.000000  ...  2358   5636\n",
-            "16                                MathÃ©matiques   0.000000  ...    14  10349\n",
-            "17                                   Beaux-arts   0.000000  ...   827   9314\n",
-            "18  Physique - [Sciences physico-mathÃ©matiques]   0.004049  ...     0  10893\n",
-            "19                                       Marine   0.088154  ...    32  10583\n",
-            "20                                       Chasse   0.000000  ...     0  10850\n",
-            "21                              Arts et mÃ©tiers   0.000000  ...     0  10821\n",
-            "22                                     Religion   0.003135  ...     0  10912\n",
-            "23                                       Blason   0.000000  ...     1  10700\n",
-            "24                                        PÃªche   0.000000  ...     0  10926\n",
-            "25                                     Histoire   0.023649  ...     0  10886\n",
-            "26                           MarÃ©chage - ManÃ¨ge   0.000000  ...     0  10814\n",
-            "27                                       Mesure   0.000000  ...     0  10924\n",
-            "28                          Economie domestique   0.000000  ...    95  10502\n",
-            "29                                  Philosophie   0.000000  ...   909   8731\n",
-            "30                                 Superstition   0.000000  ...     0  10888\n",
-            "31                                       Chimie   0.000000  ...     0  10854\n",
-            "32                                    MÃ©dailles   0.000000  ...     1  10700\n",
-            "33                                      Musique   0.000000  ...     0  10925\n",
-            "34                                   CaractÃ¨res   0.000000  ...     0  10908\n",
-            "35                                    Spectacle   0.000000  ...     1  10628\n",
-            "36                                  MinÃ©ralogie   0.000000  ...     0  10938\n",
-            "37                                    Politique   0.000000  ...     0  10926\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_bayes_bagofwords_s10000\n",
-            "{'precision': 0.8343333806034451, 'recall': 0.6158940397350994, 'f1-score': 0.6801987597575112, 'support': 13137}\n",
-            "0.6158940397350994\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                                      Histoire   0.579466  ...     3  12882\n",
-            "1                         Droit - Jurisprudence   0.953423  ...    44  12913\n",
-            "2                                    GÃ©ographie   0.953906  ...    58  12804\n",
-            "3                                       MÃ©tiers   0.922949  ...    48  12815\n",
-            "4                                  Architecture   0.150943  ...     0  13008\n",
-            "5                          MÃ©decine - Chirurgie   0.744639  ...     0  13037\n",
-            "6                                 MathÃ©matiques   0.225166  ...     2  12900\n",
-            "7                                     Grammaire   0.305357  ...     4  13032\n",
-            "8                                       Monnaie   0.000000  ...     0  13110\n",
-            "9                                      Commerce   0.327945  ...     1  13015\n",
-            "10                                     Anatomie   0.196429  ...     2  13025\n",
-            "11  Physique - [Sciences physico-mathÃ©matiques]   0.331081  ...   142  12652\n",
-            "12                                  Philosophie   0.000000  ...  1351  11028\n",
-            "13                      Belles-lettres - PoÃ©sie   0.008511  ...     0  13106\n",
-            "14              Militaire (Art) - Guerre - Arme   0.199324  ...   171  12399\n",
-            "15                                    AntiquitÃ©   0.183544  ...  2711   9779\n",
-            "16                           MarÃ©chage - ManÃ¨ge   0.008621  ...   412  11633\n",
-            "17                                       Chasse   0.008197  ...  1054  11199\n",
-            "18              Agriculture - Economie rustique   0.011811  ...     0  13072\n",
-            "19                           Histoire naturelle   0.942755  ...   185  12697\n",
-            "20                                     Religion   0.535248  ...     1  13021\n",
-            "21                                       Mesure   0.000000  ...    34  12983\n",
-            "22                                      Musique   0.062500  ...     0  13095\n",
-            "23                              Arts et mÃ©tiers   0.000000  ...    59  12838\n",
-            "24                                       Marine   0.425287  ...     0  13111\n",
-            "25                                       Blason   0.038095  ...     0  13064\n",
-            "26                                       Chimie   0.017857  ...    10  12976\n",
-            "27                          Economie domestique   0.000000  ...     0  13109\n",
-            "28                                   Beaux-arts   0.000000  ...   382  12312\n",
-            "29                                          Jeu   0.000000  ...  1114  10375\n",
-            "30                                        PÃªche   0.000000  ...     0  13066\n",
-            "31                                    Politique   0.000000  ...     0  13025\n",
-            "32                                  MinÃ©ralogie   0.000000  ...    98  12817\n",
-            "33                                    Pharmacie   0.000000  ...     0  13111\n",
-            "34                                 Superstition   0.000000  ...     0  13090\n",
-            "35                                   CaractÃ¨res   0.000000  ...   205  12686\n",
-            "36                                    MÃ©dailles   0.000000  ...     0  13126\n",
-            "37                                    Spectacle   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_bayes_tf_idf_s10000\n",
-            "{'precision': 0.9374431375624079, 'recall': 0.4883915658065007, 'f1-score': 0.6291194809131295, 'support': 13137}\n",
-            "0.4883915658065007\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                                      Histoire   0.018284  ...     0  12883\n",
-            "1                         Droit - Jurisprudence   0.928017  ...     3  12913\n",
-            "2                                    GÃ©ographie   0.997185  ...     0  12821\n",
-            "3                                       MÃ©tiers   0.906379  ...     0  12819\n",
-            "4                                  Architecture   0.000000  ...     0  13008\n",
-            "5                          MÃ©decine - Chirurgie   0.230019  ...     0  13037\n",
-            "6                                 MathÃ©matiques   0.000000  ...     0  12902\n",
-            "7                                     Grammaire   0.000000  ...     0  13032\n",
-            "8                                       Monnaie   0.000000  ...     0  13110\n",
-            "9                                      Commerce   0.036952  ...     0  13015\n",
-            "10                                     Anatomie   0.013393  ...     0  13025\n",
-            "11  Physique - [Sciences physico-mathÃ©matiques]   0.003378  ...    16  12701\n",
-            "12                                  Philosophie   0.000000  ...  1315  10852\n",
-            "13                      Belles-lettres - PoÃ©sie   0.000000  ...     0  13106\n",
-            "14              Militaire (Art) - Guerre - Arme   0.003378  ...     0  12577\n",
-            "15                                    AntiquitÃ©   0.000000  ...  2834   6749\n",
-            "16                           MarÃ©chage - ManÃ¨ge   0.000000  ...    13  12422\n",
-            "17                                       Chasse   0.000000  ...   978  11227\n",
-            "18              Agriculture - Economie rustique   0.000000  ...     0  13072\n",
-            "19                           Histoire naturelle   0.874776  ...    42  12702\n",
-            "20                                     Religion   0.002611  ...     0  13021\n",
-            "21                                       Mesure   0.000000  ...     0  12986\n",
-            "22                                      Musique   0.000000  ...     0  13095\n",
-            "23                              Arts et mÃ©tiers   0.000000  ...     1  12841\n",
-            "24                                       Marine   0.096552  ...     0  13111\n",
-            "25                                       Blason   0.000000  ...     0  13064\n",
-            "26                                       Chimie   0.000000  ...     0  12977\n",
-            "27                          Economie domestique   0.000000  ...     0  13109\n",
-            "28                                   Beaux-arts   0.000000  ...   118  12608\n",
-            "29                                          Jeu   0.000000  ...  1094  10439\n",
-            "30                                        PÃªche   0.000000  ...     0  13066\n",
-            "31                                    Politique   0.000000  ...     0  13025\n",
-            "32                                  MinÃ©ralogie   0.000000  ...     1  12840\n",
-            "33                                    Pharmacie   0.000000  ...     0  13111\n",
-            "34                                 Superstition   0.000000  ...     0  13090\n",
-            "35                                   CaractÃ¨res   0.000000  ...     1  12754\n",
-            "36                                    MÃ©dailles   0.000000  ...     0  13126\n",
-            "37                                    Spectacle   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
+        "        dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+        "\n",
+        "        # enregistrer les predictions\n",
+        "        pd.DataFrame({'labels': pd.Series(df_eval[columnClass]), 'predictions': pd.Series(y_pred)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n",
+        "\n",
+        "  \n"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "mMiQo_sR7KWn"
       },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": []
     }
-  ]
-}
\ No newline at end of file
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "machine_shape": "hm",
+      "name": "EDdA-Classification_ClassicModels.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/notebooks/EDdA_Classification_DeepLearning.ipynb b/notebooks/EDdA_Classification_DeepLearning.ipynb
index d8e9ea64dd3f8eb0d5d3fa12bf8f3f9ee8fa4466..4bdd58e6756dc6b72fda0e6378d9986a9323e3c7 100644
--- a/notebooks/EDdA_Classification_DeepLearning.ipynb
+++ b/notebooks/EDdA_Classification_DeepLearning.ipynb
@@ -1,20 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "EDdA-Classification_DeepLearning.ipynb",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
@@ -36,9 +20,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "G5LT5n9O7SLt"
       },
+      "outputs": [],
       "source": [
         "train_path = 'training_set.tsv'\n",
         "validation_path = 'validation_set.tsv'\n",
@@ -55,9 +41,7 @@
         "max_len = 512 # \n",
         "epochs = 20\n",
         "embedding_dim = 300 "
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -70,6 +54,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -77,19 +62,18 @@
         "id": "Sp8d_Uus7SHJ",
         "outputId": "82929364-d0a1-4962-fcb4-47224a48e6cf"
       },
-      "source": [
-        "from google.colab import drive\n",
-        "drive.mount('/content/drive')"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Mounted at /content/drive\n"
           ]
         }
+      ],
+      "source": [
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
       ]
     },
     {
@@ -103,15 +87,15 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "bTIXsF6kBUdh"
       },
+      "outputs": [],
       "source": [
         "#!pip install zeugma\n",
         "#!pip install plot_model"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -124,6 +108,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -131,6 +116,18 @@
         "id": "HwWkSznz7SEv",
         "outputId": "02ecbbf8-556f-4567-b57d-6e13a4ca28ff"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+          ]
+        }
+      ],
       "source": [
         "from nltk.tokenize import word_tokenize\n",
         "import nltk\n",
@@ -164,19 +161,6 @@
         "from tqdm import tqdm\n",
         "import requests, zipfile, io\n",
         "import os, re, csv, math, codecs"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
-            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
-            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
-            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
-          ]
-        }
       ]
     },
     {
@@ -190,9 +174,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "4LJ5blQR7PUe"
       },
+      "outputs": [],
       "source": [
         "\n",
         "def resample_classes(df, classColumnName, numberOfInstances):\n",
@@ -201,20 +187,16 @@
         "  fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n",
         "  return df.groupby(classColumnName, as_index=False).apply(fn)\n",
         "    \n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "-Rh3JMDh7zYd"
       },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "markdown",
@@ -227,34 +209,35 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "FnbNT4NF7zal"
       },
+      "outputs": [],
       "source": [
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "WNqDms64lfaS"
       },
+      "outputs": [],
       "source": [
         "# download FastText\n",
         "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n",
         "r = requests.get(zip_file_url)\n",
         "z = zipfile.ZipFile(io.BytesIO(r.content))\n",
         "z.extractall()"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -262,73 +245,74 @@
         "id": "PGMIi0CAmqSd",
         "outputId": "09c034fd-f689-43a9-fd75-5923906d89bf"
       },
-      "source": [
-        "print('loading word embeddings...')\n",
-        "\n",
-        "embeddings_index = {}\n",
-        "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n",
-        "\n",
-        "for line in tqdm(f):\n",
-        "    values = line.rstrip().rsplit(' ')\n",
-        "    word = values[0]\n",
-        "    coefs = np.asarray(values[1:], dtype='float32')\n",
-        "    embeddings_index[word] = coefs\n",
-        "f.close()\n",
-        "\n",
-        "print('found %s word vectors' % len(embeddings_index))"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "loading word embeddings...\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "1999996it [03:40, 9087.22it/s]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "found 1999996 word vectors\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "\n"
           ]
         }
+      ],
+      "source": [
+        "print('loading word embeddings...')\n",
+        "\n",
+        "embeddings_index = {}\n",
+        "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n",
+        "\n",
+        "for line in tqdm(f):\n",
+        "    values = line.rstrip().rsplit(' ')\n",
+        "    word = values[0]\n",
+        "    coefs = np.asarray(values[1:], dtype='float32')\n",
+        "    embeddings_index[word] = coefs\n",
+        "f.close()\n",
+        "\n",
+        "print('found %s word vectors' % len(embeddings_index))"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "nRLaQUO97zcq"
       },
+      "outputs": [],
       "source": [
         "df_train = pd.read_csv(train_path, sep=\"\\t\")\n",
         "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n",
         "\n",
         "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n",
         "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "vGWAgBH87ze8"
       },
+      "outputs": [],
       "source": [
         "y_train  = df_train[columnClass]\n",
         "y_validation = df_validation[columnClass]\n",
@@ -338,12 +322,11 @@
         "\n",
         "y_train = encoder.fit_transform(y_train)\n",
         "y_validation = encoder.fit_transform(y_validation)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -352,13 +335,8 @@
         "id": "7OYjo_uhoqcX",
         "outputId": "79c4ff25-0476-4e12-d6ff-a8e073ee3f6c"
       },
-      "source": [
-        "df_validation.head()"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
             "text/html": [
               "<div>\n",
@@ -492,9 +470,13 @@
               "[5 rows x 13 columns]"
             ]
           },
+          "execution_count": 10,
           "metadata": {},
-          "execution_count": 10
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "df_validation.head()"
       ]
     },
     {
@@ -508,6 +490,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -515,6 +498,31 @@
         "id": "NTNh6kMTp_eU",
         "outputId": "3c1eb88c-7f1d-48f1-92bc-bc671f5e1bc1"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "pre-processing train data...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n",
+            "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tokenizing input data...\n",
+            "dictionary size:  95254\n"
+          ]
+        }
+      ],
       "source": [
         "#https://github.com/emmanuellaanggi/disaster_tweet_sentiment/blob/master/(Medium)_Text_Classification_Disaster_Tweet_.ipynb\n",
         "\n",
@@ -551,36 +559,11 @@
         "#pad sequences\n",
         "word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_len)\n",
         "word_seq_validation = sequence.pad_sequences(word_seq_validation, maxlen=max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "pre-processing train data...\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n",
-            "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "tokenizing input data...\n",
-            "dictionary size:  95254\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -588,13 +571,8 @@
         "id": "Wj8RkOhT_e2c",
         "outputId": "56152da7-47b7-4b07-84e7-8c499671d53e"
       },
-      "source": [
-        "word_seq_validation"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
             "text/plain": [
               "array([[  0,   0,   0, ..., 293,   8,   7],\n",
@@ -606,13 +584,18 @@
               "       [  0,   0,   0, ..., 188, 213,  37]], dtype=int32)"
             ]
           },
+          "execution_count": 12,
           "metadata": {},
-          "execution_count": 12
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "word_seq_validation"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -620,6 +603,16 @@
         "id": "wGjQI0YgpQAS",
         "outputId": "43a3d902-5a8d-4159-a21e-419b5ee35d7d"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "preparing embedding matrix...\n",
+            "number of null word embeddings: 70\n"
+          ]
+        }
+      ],
       "source": [
         "#embedding matrix\n",
         "\n",
@@ -639,21 +632,11 @@
         "    else:\n",
         "        words_not_found.append(word)\n",
         "print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "preparing embedding matrix...\n",
-            "number of null word embeddings: 70\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -661,23 +644,23 @@
         "id": "hjaeYIZCtGca",
         "outputId": "5ab4dd1a-a500-479f-e289-892242c83de8"
       },
-      "source": [
-        "print(\"sample words not found: \", np.random.choice(words_not_found, 10))"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "sample words not found:  ['especes' \"d'argent\" \"d'où\" \"d'argent\" \"qu'elle\" \"qu'elle\" \"c'étoit\"\n",
             " 'différens' 'faisoit' 'faisoit']\n"
           ]
         }
+      ],
+      "source": [
+        "print(\"sample words not found: \", np.random.choice(words_not_found, 10))"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -685,25 +668,10 @@
         "id": "4O0gnsX8pNVU",
         "outputId": "46feba64-b608-4b53-de15-b586dc24b880"
       },
-      "source": [
-        "from keras.layers import BatchNormalization\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "model = tf.keras.Sequential()\n",
-        "\n",
-        "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n",
-        "model.add(Bidirectional(LSTM(100)))\n",
-        "model.add(Dense(64,activation='relu'))\n",
-        "model.add(Dropout(0.2))\n",
-        "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n",
-        "model.add(Dense(numberOfClasses,activation='softmax'))\n",
-        "model.summary()"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Model: \"sequential\"\n",
             "_________________________________________________________________\n",
@@ -727,25 +695,40 @@
             "_________________________________________________________________\n"
           ]
         }
+      ],
+      "source": [
+        "from keras.layers import BatchNormalization\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "model = tf.keras.Sequential()\n",
+        "\n",
+        "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n",
+        "model.add(Bidirectional(LSTM(100)))\n",
+        "model.add(Dense(64,activation='relu'))\n",
+        "model.add(Dropout(0.2))\n",
+        "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n",
+        "model.add(Dense(numberOfClasses,activation='softmax'))\n",
+        "model.summary()"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "GcfMJl8f-cBA"
       },
+      "outputs": [],
       "source": [
         "\n",
         "#model = NN_withEmbeddings(longueur_dict, embedding_dim, max_len, numberOfClasses)\n",
         "\n",
         "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
         "#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=True)])"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -753,15 +736,10 @@
         "id": "OTQTH5VDuA3I",
         "outputId": "b8286232-4938-4591-b483-6b6d1bdc015e"
       },
-      "source": [
-        "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n",
-        "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Epoch 1/20\n",
             "83/83 [==============================] - 530s 6s/step - loss: 3.0575 - accuracy: 0.1886 - val_loss: 2.2493 - val_accuracy: 0.4315\n",
@@ -806,15 +784,19 @@
           ]
         },
         {
-          "output_type": "execute_result",
           "data": {
             "text/plain": [
               "<keras.callbacks.History at 0x7f4269526a90>"
             ]
           },
+          "execution_count": 17,
           "metadata": {},
-          "execution_count": 17
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n",
+        "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)"
       ]
     },
     {
@@ -828,27 +810,27 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "ykTp9lyRaAma"
       },
+      "outputs": [],
       "source": [
         "model.save(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "5J4xDoqRUSfS"
       },
+      "outputs": [],
       "source": [
         "# save embeddings\n",
         "\n",
         "# saving embeddings index \n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -861,14 +843,14 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "fKt8ft1t_Cxx"
       },
+      "outputs": [],
       "source": [
         "model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -881,471 +863,150 @@
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "G9pjdMdNW_KS"
-      },
-      "source": [
-        "predictions = model.predict(word_seq_validation)\n",
-        "predictions = np.argmax(predictions,axis=1)"
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "IHpVJ79IW_M0",
-        "outputId": "78e2a1aa-d35c-428c-e6c3-0ad332abcdfd"
-      },
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
       "source": [
-        "report = classification_report(predictions, y_validation, output_dict = True)\n",
+        "from sklearn.metrics import confusion_matrix\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
         "\n",
-        "print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "0.5773390217283461 {'precision': 0.5977985581006744, 'recall': 0.5773390217283461, 'f1-score': 0.5808733866443131, 'support': 10079}\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "9SKjWffUW_PC"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "LpgkGq-fW_RN"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "4gGNaPY1iuXD"
-      },
-      "source": [
-        "df_test = pd.read_csv(test_path, sep=\"\\t\")\n",
+        "for dataset in [\"test\", \"validation\"]:\n",
         "\n",
-        "encoder = preprocessing.LabelEncoder()\n",
-        "y_test = encoder.fit_transform(df_test[columnClass])\n"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "P67p7BUZiuZV",
-        "outputId": "f958a063-ee95-4157-fcd9-796991615f03"
-      },
-      "source": [
-        "raw_docs_test = df_test[columnText].tolist()\n",
+        "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+        "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # supprimer les NaN...\n",
+        "    \n",
+        "  encoder = preprocessing.LabelEncoder()\n",
+        "  y_test = encoder.fit_transform(df_eval[columnClass])\n",
         "\n",
-        "print(\"pre-processing test data...\")\n",
         "\n",
-        "stop_words = set(stopwords.words('french'))\n",
+        "  raw_docs_test = df_eval[columnText].tolist()\n",
         "\n",
-        "processed_docs_test = []\n",
-        "for doc in tqdm(raw_docs_test):\n",
-        "    tokens = word_tokenize(doc, language='french')\n",
-        "    filtered = [word for word in tokens if word not in stop_words]\n",
-        "    processed_docs_test.append(\" \".join(filtered))\n",
-        "#end for\n",
+        "  print(\"pre-processing test data...\")\n",
         "\n",
-        "print(\"tokenizing input data...\")\n",
-        "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n",
-        "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation)  #leaky\n",
-        "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n",
+        "  stop_words = set(stopwords.words('french'))\n",
         "\n",
-        "#pad sequences\n",
-        "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "pre-processing test data...\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "100%|██████████| 13137/13137 [00:09<00:00, 1317.07it/s]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "tokenizing input data...\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "czeIqlD5iudH"
-      },
-      "source": [
-        "predictions = model.predict(word_seq_test)\n",
-        "predictions = np.argmax(predictions,axis=1)"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "Q9eYqi5SW_Ta",
-        "outputId": "3682a42a-7c07-446e-d913-3d20640fb2bf"
-      },
-      "source": [
-        "report = classification_report(predictions, y_test, output_dict = True)\n",
+        "  processed_docs_test = []\n",
+        "  for doc in tqdm(raw_docs_test):\n",
+        "      tokens = word_tokenize(doc, language='french')\n",
+        "      filtered = [word for word in tokens if word not in stop_words]\n",
+        "      processed_docs_test.append(\" \".join(filtered))\n",
+        "  #end for\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
+        "  print(\"tokenizing input data...\")\n",
+        "  #tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n",
+        "  #tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation)  #leaky\n",
+        "  word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n",
         "\n",
-        "print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "0.5957220065463956 {'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "ra4FOHVniwUI",
-        "outputId": "cbe576f6-ce14-49ef-9aba-2d26f76cab92"
-      },
-      "source": [
-        "from sklearn.metrics import confusion_matrix\n",
+        "  #pad sequences\n",
+        "  word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)\n",
         "\n",
-        "classesName = encoder.classes_\n",
-        "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "  for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+        "      # il manque le model BERT s500 ...\n",
+        "      \n",
+        "      for classifier_name in [\"lstm\", 'cnn']:\n",
         "\n",
-        "precision = []\n",
-        "recall = []\n",
-        "f1 = []\n",
-        "support = []\n",
-        "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-        "for c in classes:\n",
-        "  precision.append(report[c]['precision'])\n",
-        "  recall.append(report[c]['recall'])\n",
-        "  f1.append(report[c]['f1-score'])\n",
-        "  support.append(report[c]['support'])\n",
+        "        model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/\"+classifier_name+\"_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
         "\n",
+        "        predictions = model.predict(word_seq_test)\n",
+        "        predictions = np.argmax(predictions,axis=1)\n",
         "\n",
-        "cnf_matrix = confusion_matrix(y_test, predictions)\n",
-        "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-        "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-        "TP = np.diag(cnf_matrix)\n",
-        "TN = cnf_matrix.sum() - (FP + FN + TP)\n",
         "\n",
-        "dff['className'] = classesName\n",
-        "dff['precision'] = precision\n",
-        "dff['recall'] = recall\n",
-        "dff['f1-score'] = f1\n",
-        "dff['support'] = support\n",
-        "dff['FP'] = FP\n",
-        "dff['FN'] = FN\n",
-        "dff['TP'] = TP\n",
-        "dff['TN'] = TN\n",
+        "        report = classification_report(y_test, predictions, output_dict = True)\n",
         "\n",
-        "print(\"test_lstm_s\"+str(maxOfInstancePerClass))\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
         "\n",
-        "print(weighted_avg)\n",
-        "print(accuracy)\n",
-        "print(dff)\n",
+        "        print(accuracy, weighted_avg)\n",
         "\n",
-        "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_lstm_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_lstm_s1500\n",
-            "{'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n",
-            "0.5957220065463956\n",
-            "                                      className  precision  ...    TP     TN\n",
-            "0               Agriculture - Economie rustique   0.259843  ...    66  12780\n",
-            "1                                      Anatomie   0.446429  ...   100  12818\n",
-            "2                                     Antiquité   0.525316  ...   166  12425\n",
-            "3                                  Architecture   0.518868  ...   165  12597\n",
-            "4                               Arts et métiers   0.007752  ...     1  13002\n",
-            "5                                    Beaux-arts   0.020000  ...     2  13016\n",
-            "6                       Belles-lettres - Poésie   0.200000  ...    47  12667\n",
-            "7                                        Blason   0.466667  ...    49  12908\n",
-            "8                                    CaractÃ¨res   0.074074  ...     2  13110\n",
-            "9                                        Chasse   0.262295  ...    32  12929\n",
-            "10                                       Chimie   0.348214  ...    39  12952\n",
-            "11                                     Commerce   0.524249  ...   227  12442\n",
-            "12                        Droit - Jurisprudence   0.750176  ...  1063  11473\n",
-            "13                          Economie domestique   0.000000  ...     0  13106\n",
-            "14                                    Grammaire   0.587500  ...   329  12094\n",
-            "15                                   GÃ©ographie   0.830753  ...  2361  10167\n",
-            "16                                     Histoire   0.459916  ...   327  11749\n",
-            "17                           Histoire naturelle   0.687835  ...   769  11871\n",
-            "18                                          Jeu   0.415385  ...    27  13034\n",
-            "19                                       Marine   0.708046  ...   308  12497\n",
-            "20                           MarÃ©chage - ManÃ¨ge   0.784483  ...    91  12991\n",
-            "21                                MathÃ©matiques   0.450331  ...    68  12922\n",
-            "22                                       Mesure   0.333333  ...    14  13078\n",
-            "23              Militaire (Art) - Guerre - Arme   0.510135  ...   151  12719\n",
-            "24                                  MinÃ©ralogie   0.000000  ...     0  13111\n",
-            "25                                      Monnaie   0.041096  ...     3  13057\n",
-            "26                                      Musique   0.525000  ...    84  12922\n",
-            "27                                    MÃ©dailles   0.000000  ...     0  13109\n",
-            "28                         MÃ©decine - Chirurgie   0.584795  ...   300  12279\n",
-            "29                                      MÃ©tiers   0.592378  ...   715  11248\n",
-            "30                                    Pharmacie   0.014085  ...     1  13065\n",
-            "31                                  Philosophie   0.160714  ...    18  12934\n",
-            "32  Physique - [Sciences physico-mathÃ©matiques]   0.533784  ...   158  12690\n",
-            "33                                    Politique   0.000000  ...     0  13111\n",
-            "34                                        PÃªche   0.127660  ...     6  13067\n",
-            "35                                     Religion   0.357702  ...   137  12580\n",
-            "36                                    Spectacle   0.000000  ...     0  13126\n",
-            "37                                 Superstition   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        }
+        "        classesName = encoder.classes_\n",
+        "        classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "\n",
+        "        precision = []\n",
+        "        recall = []\n",
+        "        f1 = []\n",
+        "        support = []\n",
+        "        dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+        "        for c in classes:\n",
+        "          precision.append(report[c]['precision'])\n",
+        "          recall.append(report[c]['recall'])\n",
+        "          f1.append(report[c]['f1-score'])\n",
+        "          support.append(report[c]['support'])\n",
+        "\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
+        "\n",
+        "\n",
+        "        cnf_matrix = confusion_matrix(y_test, predictions)\n",
+        "        FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+        "        FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+        "        TP = np.diag(cnf_matrix)\n",
+        "        TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "\n",
+        "        dff['className'] = classesName\n",
+        "        dff['precision'] = precision\n",
+        "        dff['recall'] = recall\n",
+        "        dff['f1-score'] = f1\n",
+        "        dff['support'] = support\n",
+        "        dff['FP'] = FP\n",
+        "        dff['FN'] = FN\n",
+        "        dff['TP'] = TP\n",
+        "        dff['TN'] = TN\n",
+        "\n",
+        "        print(dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass))\n",
+        "\n",
+        "        print(weighted_avg)\n",
+        "        print(accuracy)\n",
+        "        print(dff)\n",
+        "\n",
+        "        dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+        "        # enregistrer les predictions\n",
+        "        pd.DataFrame({'labels': pd.Series(y_test), 'predictions': pd.Series(predictions)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n"
       ]
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "x03FC0D-iwWP"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "gSVqcywgiwYH"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "-T5LfFtwiwaV"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "Yjd5c70_iwcY"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "2UNjiHYliwes"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "vLGTnit_W_V8"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "R-3lBXjDD9wE"
-      },
-      "source": [
-        "def predict(data, max_len):\n",
-        "  \n",
-        "  pad_sequ_test, _ = prepare_sequence(data, max_len)\n",
-        "  pred_labels_ = model.predict(pad_sequ_test)\n",
-        "\n",
-        "  return np.argmax(pred_labels_,axis=1)\n",
-        "\n",
-        "\n",
-        "def eval(data, labels, max_len):\n",
-        "  \n",
-        "  pred_labels_ = predict(data, max_len)\n",
-        "  report = classification_report(pred_labels_, labels, output_dict = True)\n",
-        "\n",
-        "  accuracy = report['accuracy']\n",
-        "  weighted_avg = report['weighted avg']\n",
-        "  \n",
-        "  print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "EDdA-Classification_DeepLearning.ipynb",
+      "provenance": []
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "6T3kAvKvExgc",
-        "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386"
-      },
-      "source": [
-        "# evaluation sur le jeu de validation\n",
-        "eval(df_validation[columnText], y_validation, max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
-            "  return np.array(self.texts_to_sequences(texts))\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n"
-          ]
-        }
-      ]
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "pTDJA03_-8yu",
-        "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122"
-      },
-      "source": [
-        "# evaluation sur le jeu de test\n",
-        "df_test = pd.read_csv(test_path, sep=\"\\t\")\n",
-        "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n",
-        "\n",
-        "y_test = df_test[columnClass]\n",
-        "encoder = preprocessing.LabelEncoder()\n",
-        "y_test = encoder.fit_transform(y_test)\n",
-        "\n",
-        "eval(df_test[columnText], y_test, max_len)\n"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
-            "  return np.array(self.texts_to_sequences(texts))\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n"
-          ]
-        }
-      ]
+    "language_info": {
+      "name": "python"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}