diff --git a/.gitignore b/.gitignore
index 03156e2c22bc6e140bbe03842bc500d4728df496..db1ed1dfb73882a0d0ea40c2b70301ac99f0ce06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,2 @@ dataframe_with_normClass_artfl.csv
 *.pkl
 .DS_Store
-.DS_Store
diff --git a/notebooks/EDdA_Classification_BertFineTuning.ipynb b/notebooks/EDdA_Classification_BertFineTuning.ipynb
index dc0830e18213cca6d3b8ef6733586bdb1c7715b9..4058698a7665850bd055042710bcb25391cae564 100644
--- a/notebooks/EDdA_Classification_BertFineTuning.ipynb
+++ b/notebooks/EDdA_Classification_BertFineTuning.ipynb
@@ -62,17 +62,7 @@
     "id": "WF0qFN_g3ekz",
     "outputId": "445ffd96-843b-4ff1-a24d-c110964a63e4"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Your runtime has 27.3 gigabytes of available RAM\n",
-      "\n",
-      "You are using a high-RAM runtime!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from psutil import virtual_memory\n",
     "ram_gb = virtual_memory().total / 1e9\n",
@@ -94,15 +84,7 @@
     "id": "vL0S-s9Uofvn",
     "outputId": "415b7bf1-d3fd-42b6-ee03-13601c953a4f"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Mounted at /content/drive\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from google.colab import drive\n",
     "drive.mount('/content/drive')"
@@ -127,16 +109,7 @@
     "id": "dPOU-Efhf4ui",
     "outputId": "fc873e0c-1254-4928-c8e9-e3eb093acc64"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "There are 1 GPU(s) available.\n",
-      "We will use the GPU: Tesla P100-PCIE-16GB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import torch\n",
     "\n",
@@ -175,57 +148,7 @@
     "id": "pwmZ5bBvgGNh",
     "outputId": "e92404c6-af38-4bd8-8c99-20ec6b545b3f"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting transformers==4.10.3\n",
-      "  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)\n",
-      "\u001b[K     |████████████████████████████████| 2.8 MB 5.0 MB/s \n",
-      "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n",
-      "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
-      "\u001b[K     |████████████████████████████████| 3.3 MB 38.8 MB/s \n",
-      "\u001b[?25hCollecting pyyaml>=5.1\n",
-      "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
-      "\u001b[K     |████████████████████████████████| 596 kB 58.6 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2019.12.20)\n",
-      "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.62.3)\n",
-      "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2.23.0)\n",
-      "Collecting huggingface-hub>=0.0.12\n",
-      "  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n",
-      "\u001b[K     |████████████████████████████████| 61 kB 486 kB/s \n",
-      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (3.4.0)\n",
-      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (1.19.5)\n",
-      "Collecting sacremoses\n",
-      "  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
-      "\u001b[K     |████████████████████████████████| 895 kB 43.3 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (21.3)\n",
-      "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.8.2)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers==4.10.3) (3.10.0.2)\n",
-      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.10.3) (3.0.6)\n",
-      "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.10.3) (3.6.0)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2.10)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (1.24.3)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2021.10.8)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (3.0.4)\n",
-      "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (7.1.2)\n",
-      "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.15.0)\n",
-      "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.1.0)\n",
-      "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n",
-      "  Attempting uninstall: pyyaml\n",
-      "    Found existing installation: PyYAML 3.13\n",
-      "    Uninstalling PyYAML-3.13:\n",
-      "      Successfully uninstalled PyYAML-3.13\n",
-      "Successfully installed huggingface-hub-0.2.1 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.10.3\n",
-      "Collecting sentencepiece\n",
-      "  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
-      "\u001b[K     |████████████████████████████████| 1.2 MB 5.1 MB/s \n",
-      "\u001b[?25hInstalling collected packages: sentencepiece\n",
-      "Successfully installed sentencepiece-0.1.96\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!pip install transformers==4.10.3\n",
     "!pip install sentencepiece"
@@ -384,16 +307,7 @@
     "id": "zj3JDoJNfx1f",
     "outputId": "59262e3f-5fe0-49f5-bb55-8586653498ab"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(30650, 13)\n",
-      "(10947, 13)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(df_train.shape)\n",
     "print(df_validation.shape)"
@@ -417,17 +331,6 @@
     "y_validation = encoder.fit_transform(y_validation)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "u9AxxaA_h1CM"
-   },
-   "outputs": [],
-   "source": [
-    "#train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -443,39 +346,6 @@
     "labels_validation = y_validation.tolist()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "Dq_KF5WAsbpC",
-    "outputId": "ba91b953-abcb-4bed-a5c5-9e429e68239a"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([\"\\nESTAMPEUR, s. m. en , est une\\nsorte de pilon de bois, surmonté d'un manche d'environ \\ndeux piés & demi. On s'en sert pour estamper\\nles formes où l'on veut faire des vergeoises. Voyez\\nVergeoise & Estamper.\\n\",\n",
-       "       \"\\nOn doit ébourgeonner les vignes, alors ce mot doit\\ns'entendre autrement que pour les arbres fruitiers:\\non ébourgeonne les vignes. non-seulement quand on\\nsupprime les bourgeons surnuméraires, mais encore\\nquand on arrête par-en-haut les bourgeons. Il en est\\nde même quand on détache en cassant les faux bourgeons \\nqui poussent d'ordinaire à chaque noeud à\\ncôté des yeux, à commencer par le bas. (K)\\n\",\n",
-       "       \"\\nBois mort en pié, s'il est pourri sur pié, sans\\nsubstance, & bon seulement à brûler.\\n\",\n",
-       "       ...,\n",
-       "       \"\\nIl y a une hydatoscopie naturelle & permise ; elle\\nconsiste à prévoir & à prédire les orages & les tempêtes \\nsur certains signes qu'on remarque dans la mer,\\ndans l'air, & dans les nuages. Voyez Tems & Ouragans. Dict. de Trévoux.\\n\",\n",
-       "       \"\\nMÉTÉOROMANCIE, s.f. () divination par\\nles météores ; & comme les météores ignés sont ceux\\nqui jettent le plus de crainte parmi les hommes, la\\nmétéoromancie désigne proprement la divination par\\nle tonnerre & les éclairs. Cette espece de divination\\npassa des Toscans aux Romains, sons rien perdre de\\nce qu'elle avoit de frivole. Seneque nous apprend\\nque deux auteurs graves, & qui avoient exercé des\\n\\nmagistratures, écrivoient à Rome sur cette matiere.\\nIl semble même que l'un d'eux l'épuisa entierement,\\ncar il donnoit une liste exacte des différentes especes\\nde tonnerres. Il circonstancioit & leurs noms & les\\nprognostics qui s'en pouvoient tirer ; le tout avec un\\nair de confiance plus surprenant encore que les choses\\nqu'il rapportoit. On eût dit, tant cette matiere météorologique lui étoit familiere, qu'il comptoit les tableaux \\nde sa galerie, ou qu'il faisoit la description\\ndes fleurs de son jardin. La plus ancienne maladie,\\nla plus invétérée, la plus incurable du genre humain,\\nc'est l'envie de connoître ce qui doit arriver.\\nNi le voile obscur qui nous cache notre destinée, ni\\nl'expérience journaliere, ni une infinité de tentatives \\nmalheureuses, n'ont pû guerir les hommes. Hé!\\nse dépréviennent-ils jamais d'une erreur agréablement \\nreçue? Nous sommes sur ce point aussi crédules\\nque nos ancêtres ; nous prêtons comme eux l'oreille\\nà toutes les impostures flatteuses. Pour avoir trompé\\ncent fois, elles n'ont point perdu le droit funeste de\\ntromper encore. (D. J.)\\n\",\n",
-       "       \"\\nPENTACLE, s. m. () c'est le nom que la\\nmagie des exorcismes donne à un sceau imprimé ou\\nsur du parchemin vierge fait de peau de bouc, ou\\nsur quelque métal, or, argent, cuivre, étain, plomb,\\n&c. On ne peut faire aucune opération magique pour\\nexorciser les esprits, sans avoir ce sceau qui contient\\nles noms de Dieu. Le pentacle se fait en renfermant\\nun triangle dans deux cercles : on lit dans ce triangle \\nces trois mots ; formatio, reformatio, transformatio. A côté du triangle est le mot agla, qui est très puissant \\npour arrêter la malice des esprits. Il faut que\\nla peau sur laquelle on applique le sceau soit exorcisée \\n& bénite. On exorcise aussi l'encre & la plume,\\ndont on se sert pour écrire les mots dont on vient de\\nparler. Après cela on encense le pentacle ; on l'enferme \\ntrois jours & trois nuits dans un vase bien net ;\\nenfin, on le met dans un linge ou dans un livre que\\nl'on parfume & que l'on exorcise. Voilà les fadaises\\nqu'on lit dans le livre intitulé Encheiridion Leonis papae, ouvrage misérable, qui n'a servi qu'à gâter davantage \\nles esprits crédules & portés à la superstitition.\\n(D. J.)\\n\"],\n",
-       "      dtype=object)"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sentences_train"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -548,57 +418,7 @@
     "id": "C4bigx_3ibuN",
     "outputId": "b8cef3f8-7a6c-47d1-9d37-7b3b6d08f00b"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loading CamemBERT tokenizer...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "06c6e7721b68449a9f3619ffdf18dfeb",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fba1d1d5c83b40659295a3457d74cb4e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6a29c1c28ceb415f91ec55512da981c5",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load the BERT tokenizer.\n",
     "if model_chosen == \"bert\":\n",
@@ -619,15 +439,7 @@
     "id": "5hNod5X9jDZN",
     "outputId": "93b6e633-afb7-4bcc-be00-44388f801d64"
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Token indices sequence length is longer than the specified maximum sequence length for this model (1263 > 512). Running this sequence through the model will result in indexing errors\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     " # Tokenize all of the sentences and map the tokens to thier word IDs.\n",
     "input_ids_train = []\n",
@@ -685,16 +497,7 @@
     "id": "W9EWv5JvjGH3",
     "outputId": "32cd417d-9a40-4086-d900-b81982407667"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Max sentence length train:  2253\n",
-      "Max sentence length validation:  3067\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))\n",
     "print('Max sentence length validation: ', max([len(sen) for sen in input_ids_validation])) "
@@ -862,338 +665,7 @@
     "id": "C7M2Er1ajsTf",
     "outputId": "2c3f467d-ab09-4f8f-d464-a4e738333587"
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4873cc6c9e1d493c9a67d6536e4367a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n",
-      "- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-      "Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "CamembertForSequenceClassification(\n",
-       "  (roberta): RobertaModel(\n",
-       "    (embeddings): RobertaEmbeddings(\n",
-       "      (word_embeddings): Embedding(32005, 768, padding_idx=1)\n",
-       "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
-       "      (token_type_embeddings): Embedding(1, 768)\n",
-       "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "      (dropout): Dropout(p=0.1, inplace=False)\n",
-       "    )\n",
-       "    (encoder): RobertaEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): RobertaLayer(\n",
-       "          (attention): RobertaAttention(\n",
-       "            (self): RobertaSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "            (output): RobertaSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "              (dropout): Dropout(p=0.1, inplace=False)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): RobertaIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): RobertaOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
-       "            (dropout): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (classifier): RobertaClassificationHead(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (dropout): Dropout(p=0.1, inplace=False)\n",
-       "    (out_proj): Linear(in_features=768, out_features=38, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 51,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load BertForSequenceClassification, the pretrained BERT model with a single \n",
     "# linear classification layer on top.\n",
@@ -1267,320 +739,7 @@
     "id": "SbHBbYpwkKaA",
     "outputId": "49f7f5f4-716d-44c2-e299-505086a89061"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "======== Epoch 1 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:08.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:39.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:54.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:25.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:56.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:27.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:58.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:11.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:29.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:42.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:13.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:26.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:44.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 2.04\n",
-      "  Training epoch took: 0:20:03\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.75\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 2 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:38.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:27.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:58.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:29.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:44.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 1.03\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.79\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 3 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:38.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:09.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:26.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:57.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:28.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:47.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:17:00.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:18.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:31.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:49.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:43.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:02.\n",
-      "\n",
-      "  Average training loss: 0.75\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.79\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "======== Epoch 4 / 4 ========\n",
-      "Training...\n",
-      "  Batch    40  of  2,642.    Elapsed: 0:00:18.\n",
-      "  Batch    80  of  2,642.    Elapsed: 0:00:36.\n",
-      "  Batch   120  of  2,642.    Elapsed: 0:00:55.\n",
-      "  Batch   160  of  2,642.    Elapsed: 0:01:13.\n",
-      "  Batch   200  of  2,642.    Elapsed: 0:01:31.\n",
-      "  Batch   240  of  2,642.    Elapsed: 0:01:49.\n",
-      "  Batch   280  of  2,642.    Elapsed: 0:02:07.\n",
-      "  Batch   320  of  2,642.    Elapsed: 0:02:26.\n",
-      "  Batch   360  of  2,642.    Elapsed: 0:02:44.\n",
-      "  Batch   400  of  2,642.    Elapsed: 0:03:02.\n",
-      "  Batch   440  of  2,642.    Elapsed: 0:03:20.\n",
-      "  Batch   480  of  2,642.    Elapsed: 0:03:39.\n",
-      "  Batch   520  of  2,642.    Elapsed: 0:03:57.\n",
-      "  Batch   560  of  2,642.    Elapsed: 0:04:15.\n",
-      "  Batch   600  of  2,642.    Elapsed: 0:04:33.\n",
-      "  Batch   640  of  2,642.    Elapsed: 0:04:51.\n",
-      "  Batch   680  of  2,642.    Elapsed: 0:05:10.\n",
-      "  Batch   720  of  2,642.    Elapsed: 0:05:28.\n",
-      "  Batch   760  of  2,642.    Elapsed: 0:05:46.\n",
-      "  Batch   800  of  2,642.    Elapsed: 0:06:04.\n",
-      "  Batch   840  of  2,642.    Elapsed: 0:06:22.\n",
-      "  Batch   880  of  2,642.    Elapsed: 0:06:41.\n",
-      "  Batch   920  of  2,642.    Elapsed: 0:06:59.\n",
-      "  Batch   960  of  2,642.    Elapsed: 0:07:17.\n",
-      "  Batch 1,000  of  2,642.    Elapsed: 0:07:35.\n",
-      "  Batch 1,040  of  2,642.    Elapsed: 0:07:53.\n",
-      "  Batch 1,080  of  2,642.    Elapsed: 0:08:12.\n",
-      "  Batch 1,120  of  2,642.    Elapsed: 0:08:30.\n",
-      "  Batch 1,160  of  2,642.    Elapsed: 0:08:48.\n",
-      "  Batch 1,200  of  2,642.    Elapsed: 0:09:06.\n",
-      "  Batch 1,240  of  2,642.    Elapsed: 0:09:24.\n",
-      "  Batch 1,280  of  2,642.    Elapsed: 0:09:43.\n",
-      "  Batch 1,320  of  2,642.    Elapsed: 0:10:01.\n",
-      "  Batch 1,360  of  2,642.    Elapsed: 0:10:19.\n",
-      "  Batch 1,400  of  2,642.    Elapsed: 0:10:37.\n",
-      "  Batch 1,440  of  2,642.    Elapsed: 0:10:55.\n",
-      "  Batch 1,480  of  2,642.    Elapsed: 0:11:14.\n",
-      "  Batch 1,520  of  2,642.    Elapsed: 0:11:32.\n",
-      "  Batch 1,560  of  2,642.    Elapsed: 0:11:50.\n",
-      "  Batch 1,600  of  2,642.    Elapsed: 0:12:08.\n",
-      "  Batch 1,640  of  2,642.    Elapsed: 0:12:26.\n",
-      "  Batch 1,680  of  2,642.    Elapsed: 0:12:45.\n",
-      "  Batch 1,720  of  2,642.    Elapsed: 0:13:03.\n",
-      "  Batch 1,760  of  2,642.    Elapsed: 0:13:21.\n",
-      "  Batch 1,800  of  2,642.    Elapsed: 0:13:39.\n",
-      "  Batch 1,840  of  2,642.    Elapsed: 0:13:57.\n",
-      "  Batch 1,880  of  2,642.    Elapsed: 0:14:16.\n",
-      "  Batch 1,920  of  2,642.    Elapsed: 0:14:34.\n",
-      "  Batch 1,960  of  2,642.    Elapsed: 0:14:52.\n",
-      "  Batch 2,000  of  2,642.    Elapsed: 0:15:10.\n",
-      "  Batch 2,040  of  2,642.    Elapsed: 0:15:28.\n",
-      "  Batch 2,080  of  2,642.    Elapsed: 0:15:46.\n",
-      "  Batch 2,120  of  2,642.    Elapsed: 0:16:05.\n",
-      "  Batch 2,160  of  2,642.    Elapsed: 0:16:23.\n",
-      "  Batch 2,200  of  2,642.    Elapsed: 0:16:41.\n",
-      "  Batch 2,240  of  2,642.    Elapsed: 0:16:59.\n",
-      "  Batch 2,280  of  2,642.    Elapsed: 0:17:17.\n",
-      "  Batch 2,320  of  2,642.    Elapsed: 0:17:36.\n",
-      "  Batch 2,360  of  2,642.    Elapsed: 0:17:54.\n",
-      "  Batch 2,400  of  2,642.    Elapsed: 0:18:12.\n",
-      "  Batch 2,440  of  2,642.    Elapsed: 0:18:30.\n",
-      "  Batch 2,480  of  2,642.    Elapsed: 0:18:48.\n",
-      "  Batch 2,520  of  2,642.    Elapsed: 0:19:07.\n",
-      "  Batch 2,560  of  2,642.    Elapsed: 0:19:25.\n",
-      "  Batch 2,600  of  2,642.    Elapsed: 0:19:43.\n",
-      "  Batch 2,640  of  2,642.    Elapsed: 0:20:01.\n",
-      "\n",
-      "  Average training loss: 0.60\n",
-      "  Training epoch took: 0:20:02\n",
-      "\n",
-      "Running Validation...\n",
-      "  Accuracy: 0.80\n",
-      "  Validation took: 0:03:09\n",
-      "\n",
-      "Training complete!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# This training code is based on the `run_glue.py` script here:\n",
     "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
@@ -1938,17 +1097,6 @@
     "  return pred_labels_, true_labels_"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "AJ0suC8iMs8a"
-   },
-   "outputs": [],
-   "source": [
-    "dataset_name = [\"validation\", \"test\"]"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1957,59 +1105,82 @@
    },
    "outputs": [],
    "source": [
-    "for dataset in dataset_name:\n",
+    "\n",
+    "for dataset in [\"test\", \"validation\"]:\n",
     "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+    "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # supprimer les NaN...\n",
+    "  \n",
     "  data_eval = df_eval[columnText].values\n",
     "\n",
     "  y = df_eval[columnClass]\n",
+    "\n",
+    "  \n",
+    "  \n",
     "  encoder = preprocessing.LabelEncoder()\n",
     "  y = encoder.fit_transform(y)\n",
     "  labels = y.tolist()\n",
     "\n",
-    "  pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n",
-    "\n",
-    "\n",
-    "  report = classification_report( pred_labels_, true_labels_, output_dict = True)\n",
+    "  # for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+    "  for maxOfInstancePerClass in [500]:\n",
+    "    # il manque le model BERT s500 ...\n",
+    "    \n",
+    "    #for model_bert in [\"camembert-base\", \"bert-base-multilingual-cased\"]:\n",
+    "    for model_bert in [\"bert-base-multilingual-cased\"]:\n",
     "      \n",
-    "  classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
-    "  classesName = encoder.classes_\n",
-    "\n",
-    "  precision = []\n",
-    "  recall = []\n",
-    "  f1 = []\n",
-    "  support = []\n",
-    "  dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-    "  for c in classes:\n",
-    "    precision.append(report[c]['precision'])\n",
-    "    recall.append(report[c]['recall'])\n",
-    "    f1.append(report[c]['f1-score'])\n",
-    "    support.append(report[c]['support'])\n",
-    "\n",
-    "  accuracy = report['accuracy']\n",
-    "  weighted_avg = report['weighted avg']\n",
-    "  cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
-    "  FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-    "  FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-    "  TP = np.diag(cnf_matrix)\n",
-    "  TN = cnf_matrix.sum() - (FP + FN + TP)\n",
-    "\n",
-    "  dff['className'] = classesName\n",
-    "  dff['precision'] = precision\n",
-    "  dff['recall'] = recall\n",
-    "  dff['f1-score'] = f1\n",
-    "  dff['support'] = support\n",
-    "  dff['FP'] = FP\n",
-    "  dff['FN'] = FN\n",
-    "  dff['TP'] = TP\n",
-    "  dff['TN'] = TN\n",
-    "\n",
-    "  print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n",
-    "\n",
-    "  print(weighted_avg)\n",
-    "  print(accuracy)\n",
-    "  print(dff)\n",
-    "\n",
-    "  dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)"
+    "      model_path = \"drive/MyDrive/Classification-EDdA/model_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".pt\"\n",
+    "      model = torch.load(model_path)\n",
+    "\n",
+    "      if model_bert == \"bert-base-multilingual-cased\":\n",
+    "        tokenizer = BertTokenizer.from_pretrained(model_bert)\n",
+    "      elif model_bert == \"camembert-base\":\n",
+    "        tokenizer = CamembertTokenizer.from_pretrained(model_bert)\n",
+    "\n",
+    "      pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n",
+    "\n",
+    "\n",
+    "      report = classification_report(true_labels_, pred_labels_,  output_dict = True)\n",
+    "          \n",
+    "      classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+    "      classesName = encoder.classes_\n",
+    "\n",
+    "      precision = []\n",
+    "      recall = []\n",
+    "      f1 = []\n",
+    "      support = []\n",
+    "      dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+    "      for c in classes:\n",
+    "        precision.append(report[c]['precision'])\n",
+    "        recall.append(report[c]['recall'])\n",
+    "        f1.append(report[c]['f1-score'])\n",
+    "        support.append(report[c]['support'])\n",
+    "\n",
+    "      accuracy = report['accuracy']\n",
+    "      weighted_avg = report['weighted avg']\n",
+    "      cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n",
+    "      FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+    "      FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+    "      TP = np.diag(cnf_matrix)\n",
+    "      TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+    "\n",
+    "      dff['className'] = classesName\n",
+    "      dff['precision'] = precision\n",
+    "      dff['recall'] = recall\n",
+    "      dff['f1-score'] = f1\n",
+    "      dff['support'] = support\n",
+    "      dff['FP'] = FP\n",
+    "      dff['FN'] = FN\n",
+    "      dff['TP'] = TP\n",
+    "      dff['TN'] = TN\n",
+    "\n",
+    "      print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n",
+    "\n",
+    "      print(weighted_avg)\n",
+    "      print(accuracy)\n",
+    "      print(dff)\n",
+    "\n",
+    "      dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+    "      # enregistrer les predictions\n",
+    "      pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n"
    ]
   },
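The per-class FP/FN/TP/TN columns written to the report above come straight from the multiclass confusion matrix. A minimal standalone sketch of that derivation, using a hypothetical three-class toy input (the labels below are illustrative, not from the EDdA data):

```python
import numpy as np
from sklearn.metrics import confusion_matrix

# hypothetical 3-class toy labels, for illustration only
y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 1, 1, 1, 2, 0]

cm = confusion_matrix(y_true, y_pred)  # rows = true class, cols = predicted
TP = np.diag(cm)                       # correctly predicted per class
FP = cm.sum(axis=0) - TP               # predicted as the class, but wrong
FN = cm.sum(axis=1) - TP               # truly the class, but missed
TN = cm.sum() - (FP + FN + TP)         # everything else

print(TP, FP, FN, TN)                  # one entry per class
```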
   {
@@ -2065,957 +1236,6 @@
    },
    "outputs": [],
    "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "x_n57EvhJMQh"
-   },
-   "outputs": [],
-   "source": [
-    "model_path = \"drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "R3_9tA9MI8ju"
-   },
-   "outputs": [],
-   "source": [
-    "model = torch.load(model_path)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "_fzgS5USJeAF",
-    "outputId": "be4a5506-76ed-4eef-bb3c-fe2bb77c6e4d"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "--2021-09-30 19:38:22--  https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv\n",
-      "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n",
-      "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n",
-      "HTTP request sent, awaiting response... 200 OK\n",
-      "Length: 356197 (348K) [text/tab-separated-values]\n",
-      "Saving to: ‘LGE_withContent.tsv’\n",
-      "\n",
-      "LGE_withContent.tsv 100%[===================>] 347.85K   567KB/s    in 0.6s    \n",
-      "\n",
-      "2021-09-30 19:38:24 (567 KB/s) - ‘LGE_withContent.tsv’ saved [356197/356197]\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "8WEJjQC7I8mP"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE = pd.read_csv(\"LGE_withContent.tsv\", sep=\"\\t\")\n",
-    "data_LGE = df_LGE[\"content\"].values\n",
-    "\n",
-    "\n",
-    "#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 206
-    },
-    "id": "9qJDTU-6vzkk",
-    "outputId": "1b279f0e-7715-4d23-f524-08e8ba327f6c"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>tome</th>\n",
-       "      <th>rank</th>\n",
-       "      <th>domain</th>\n",
-       "      <th>remark</th>\n",
-       "      <th>content</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>abrabeses-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>623</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>accius-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1076</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>achenbach-2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1357</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>acireale-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1513</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>actée-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1731</td>\n",
-       "      <td>botany</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACTÉE(Actœa L.). Genre de plantes de la famill...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            id  tome  ...  remark                                            content\n",
-       "0  abrabeses-0     1  ...     NaN  ABRABESES. Village d’Espagne de la prov. de Za...\n",
-       "1     accius-0     1  ...     NaN  ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...\n",
-       "2  achenbach-2     1  ...     NaN  ACHENBACH(Henri), administrateur prussien, né ...\n",
-       "3   acireale-0     1  ...     NaN  ACIREALE. Yille de Sicile, de la province et d...\n",
-       "4      actée-0     1  ...     NaN  ACTÉE(Actœa L.). Genre de plantes de la famill...\n",
-       "\n",
-       "[5 rows x 6 columns]"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "71-fP61-OOwQ",
-    "outputId": "ef08b49e-0a9f-4653-e303-3163250af35b"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(310, 6)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "lFFed2EAI8oq"
-   },
-   "outputs": [],
-   "source": [
-    "def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):\n",
-    "\n",
-    "    if chosen_model == 'bert-base-multilingual-cased' :\n",
-    "        print('Loading Bert Tokenizer...')\n",
-    "        tokenizer = BertTokenizer.from_pretrained(chosen_model)\n",
-    "    elif chosen_model == 'camembert-base':\n",
-    "        print('Loading Camembert Tokenizer...')\n",
-    "        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)\n",
-    "\n",
-    "    # Tokenize all of the sentences and map the tokens to thier word IDs.\n",
-    "    input_ids_test = []\n",
-    "    # For every sentence...\n",
-    "    for sent in sentences_to_predict:\n",
-    "        # `encode` will:\n",
-    "        #   (1) Tokenize the sentence.\n",
-    "        #   (2) Prepend the `[CLS]` token to the start.\n",
-    "        #   (3) Append the `[SEP]` token to the end.\n",
-    "        #   (4) Map tokens to their IDs.\n",
-    "        encoded_sent = tokenizer.encode(\n",
-    "                            sent,                      # Sentence to encode.\n",
-    "                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
-    "                    )\n",
-    "\n",
-    "        input_ids_test.append(encoded_sent)\n",
-    "\n",
-    "    # Pad our input tokens\n",
-    "    padded_test = []\n",
-    "    for i in input_ids_test:\n",
-    "\n",
-    "        if len(i) > max_len:\n",
-    "            padded_test.extend([i[:max_len]])\n",
-    "        else:\n",
-    "\n",
-    "            padded_test.extend([i + [0] * (max_len - len(i))])\n",
-    "    input_ids_test = np.array(padded_test)\n",
-    "\n",
-    "    # Create attention masks\n",
-    "    attention_masks = []\n",
-    "\n",
-    "    # Create a mask of 1s for each token followed by 0s for padding\n",
-    "    for seq in input_ids_test:\n",
-    "        seq_mask = [float(i>0) for i in seq]\n",
-    "        attention_masks.append(seq_mask)\n",
-    "\n",
-    "    # Convert to tensors.\n",
-    "    prediction_inputs = torch.tensor(input_ids_test)\n",
-    "    prediction_masks = torch.tensor(attention_masks)\n",
-    "    #set batch size\n",
-    "\n",
-    "\n",
-    "    # Create the DataLoader.\n",
-    "    prediction_data = TensorDataset(prediction_inputs, prediction_masks)\n",
-    "    prediction_sampler = SequentialSampler(prediction_data)\n",
-    "    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n",
-    "\n",
-    "    return prediction_dataloader\n",
-    "\n",
-    "\n",
-    "\n",
-    "def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):\n",
-    "\n",
-    "\n",
-    "    # If there's a GPU available...\n",
-    "    if torch.cuda.is_available():\n",
-    "\n",
-    "        # Tell PyTorch to use the GPU.\n",
-    "        device = torch.device(\"cuda\")\n",
-    "\n",
-    "        print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
-    "\n",
-    "        print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
-    "\n",
-    "        # If not...\n",
-    "    else:\n",
-    "        print('No GPU available, using the CPU instead.')\n",
-    "        device = torch.device(\"cpu\")\n",
-    "\n",
-    "    # Put model in evaluation mode\n",
-    "    model.eval()\n",
-    "\n",
-    "    # Tracking variables\n",
-    "    predictions_test , true_labels = [], []\n",
-    "    pred_labels_ = []\n",
-    "    # Predict\n",
-    "    for batch in sentences_to_predict_dataloader:\n",
-    "    # Add batch to GPU\n",
-    "        batch = tuple(t.to(device) for t in batch)\n",
-    "\n",
-    "        # Unpack the inputs from the dataloader\n",
-    "        b_input_ids, b_input_mask = batch\n",
-    "\n",
-    "        # Telling the model not to compute or store gradients, saving memory and\n",
-    "        # speeding up prediction\n",
-    "        with torch.no_grad():\n",
-    "            # Forward pass, calculate logit predictions\n",
-    "            outputs = model(b_input_ids, token_type_ids=None,\n",
-    "                            attention_mask=b_input_mask)\n",
-    "\n",
-    "        logits = outputs[0]\n",
-    "        #print(logits)\n",
-    "\n",
-    "        # Move logits and labels to CPU\n",
-    "        logits = logits.detach().cpu().numpy()\n",
-    "        #print(logits)\n",
-    "\n",
-    "        # Store predictions and true labels\n",
-    "        predictions_test.append(logits)\n",
-    "\n",
-    "        #print('    DONE.')\n",
-    "\n",
-    "        pred_labels = []\n",
-    "        \n",
-    "        for i in range(len(predictions_test)):\n",
-    "\n",
-    "            # The predictions for this batch are a 2-column ndarray (one column for \"0\"\n",
-    "            # and one column for \"1\"). Pick the label with the highest value and turn this\n",
-    "            # in to a list of 0s and 1s.\n",
-    "            pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n",
-    "            pred_labels.append(pred_labels_i)\n",
-    "\n",
-    "    pred_labels_ += [item for sublist in pred_labels for item in sublist]\n",
-    "    return pred_labels_\n"
-   ]
-  },
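The padding and attention-mask loops in `generate_prediction_dataloader` can be expressed more compactly. A vectorized sketch under the same assumption the original makes (pad id 0, so a nonzero id marks a real token); this is an illustration, not a drop-in replacement:

```python
import numpy as np

def pad_and_mask(input_ids, max_len=512, pad_id=0):
    """Pad/truncate tokenized sequences and build matching attention masks."""
    ids = np.full((len(input_ids), max_len), pad_id, dtype=np.int64)
    for row, seq in enumerate(input_ids):
        seq = seq[:max_len]                  # truncate overlong sequences
        ids[row, :len(seq)] = seq            # left-aligned, padded on the right
    mask = (ids != pad_id).astype(np.float32)  # 1.0 = real token, 0.0 = padding
    return ids, mask
```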
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "O9eer_kgI8rC",
-    "outputId": "94ea7418-14a8-4918-e210-caf0018f5989"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loading Bert Tokenizer...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors\n"
-     ]
-    }
-   ],
-   "source": [
-    "data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)\n",
-    "#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)"
-   ]
-  },
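The "Token indices sequence length is longer than the specified maximum" warning above is emitted because `tokenizer.encode` runs without a length cap and the sequences are only sliced afterwards. A sketch of letting the tokenizer truncate up front instead (transformers 4.x API, matching the pinned 4.10.3), which also keeps the final `[SEP]` inside the window; the sample sentence is one of the LGE documents shown earlier:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
sent = "ABRABESES. Village d’Espagne de la prov. de Za..."  # one document from data_LGE

encoded_sent = tokenizer.encode(
    sent,
    add_special_tokens=True,  # prepend [CLS], append [SEP]
    max_length=512,           # the model's positional-embedding limit
    truncation=True,          # truncate here instead of slicing afterwards
)
```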
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "sFpAwbrBwF2h",
-    "outputId": "8d210732-619d-41f0-b6e2-ad9d06a85069"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "There are 1 GPU(s) available.\n",
-      "We will use the GPU: Tesla P100-PCIE-16GB\n"
-     ]
-    }
-   ],
-   "source": [
-    "p = predict_class_bertFineTuning( model, data_loader )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "51HF6-8UPSTc",
-    "outputId": "26bff792-eb8d-4e1a-efa4-a7a6c9d32bf9"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "310"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(p)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "rFFGhaCvQHfh"
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "qgJ-O4rcQHiI",
-    "outputId": "bfe93dd6-4d89-4d5c-be0d-45e1c98c6b14"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "LabelEncoder()"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Il faudrait enregistrer l'encoder, \n",
-    "# sinon on est obligé de le refaire à partir du jeu d'entrainement pour récupérer le noms des classes.\n",
-    "encoder"
-   ]
-  },
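As the comment notes, the fitted `LabelEncoder` is needed at prediction time to map integer ids back to class names. A minimal sketch of persisting it with pickle, in the spirit of the `.pkl` artifacts the classic-models notebook already writes; the Drive path is hypothetical, mirroring the model and report paths used elsewhere, and `p` is the list of integer predictions from the cells above:

```python
import pickle

# after fitting on the training set:
with open("drive/MyDrive/Classification-EDdA/label_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

# later, in a fresh session, without touching the training data:
with open("drive/MyDrive/Classification-EDdA/label_encoder.pkl", "rb") as f:
    encoder = pickle.load(f)

class_names = encoder.inverse_transform(p)  # integer ids -> class names
```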
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "QuST9wJoQHnS"
-   },
-   "outputs": [],
-   "source": [
-    "p2 = list(encoder.inverse_transform(p))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "6ek7suq9QHqE",
-    "outputId": "6636983a-7eba-48c8-d884-f8fb437294dc"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Mathématiques',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Musique',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Physique - [Sciences physico-mathématiques]',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'Histoire',\n",
-       " 'Physique - [Sciences physico-mathématiques]',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Histoire naturelle',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Géographie',\n",
-       " 'Architecture',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Arts et métiers',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Marine',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Architecture',\n",
-       " 'Histoire naturelle',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Chimie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Religion',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Jeu',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire',\n",
-       " 'Histoire naturelle',\n",
-       " 'Commerce',\n",
-       " 'Histoire',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Religion',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Métiers',\n",
-       " 'Belles-lettres - Poésie',\n",
-       " 'Beaux-arts',\n",
-       " 'Religion',\n",
-       " 'Architecture',\n",
-       " 'Architecture',\n",
-       " 'Architecture',\n",
-       " 'Géographie',\n",
-       " 'Chimie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire naturelle',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Minéralogie',\n",
-       " 'Belles-lettres - Poésie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Grammaire',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Mathématiques',\n",
-       " 'Géographie',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Blason',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Histoire naturelle',\n",
-       " 'Militaire (Art) - Guerre - Arme',\n",
-       " 'Géographie',\n",
-       " 'Antiquité',\n",
-       " 'Agriculture - Economie rustique',\n",
-       " 'Chimie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Belles-lettres - Poésie',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Métiers',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Arts et métiers',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Musique',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Religion',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Droit - Jurisprudence',\n",
-       " 'Histoire',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Chimie',\n",
-       " 'Antiquité',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Beaux-arts',\n",
-       " 'Histoire',\n",
-       " 'Géographie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Antiquité',\n",
-       " 'Grammaire',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Architecture',\n",
-       " 'Commerce',\n",
-       " 'Antiquité',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Histoire naturelle',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Anatomie',\n",
-       " 'Commerce',\n",
-       " 'Beaux-arts',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Histoire naturelle',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'Architecture',\n",
-       " 'Commerce',\n",
-       " 'Antiquité',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Médecine - Chirurgie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Antiquité',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Histoire',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Antiquité',\n",
-       " 'Géographie',\n",
-       " 'Religion',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Philosophie',\n",
-       " 'Géographie',\n",
-       " 'Chimie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Géographie',\n",
-       " 'Beaux-arts',\n",
-       " 'Commerce',\n",
-       " 'Commerce',\n",
-       " 'Géographie',\n",
-       " 'Géographie']"
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "p2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "XvdDj5PBQHtk"
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "t39Xs0j7QHXJ"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE['class_bert'] = p2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 206
-    },
-    "id": "-VZ7geRmQHaD",
-    "outputId": "350a4122-5b1f-43e2-e372-2f628f665c4a"
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>tome</th>\n",
-       "      <th>rank</th>\n",
-       "      <th>domain</th>\n",
-       "      <th>remark</th>\n",
-       "      <th>content</th>\n",
-       "      <th>class_bert</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>abrabeses-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>623</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n",
-       "      <td>Géographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>accius-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1076</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n",
-       "      <td>Géographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>achenbach-2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1357</td>\n",
-       "      <td>biography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n",
-       "      <td>Géographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>acireale-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1513</td>\n",
-       "      <td>geography</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n",
-       "      <td>Géographie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>actée-0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1731</td>\n",
-       "      <td>botany</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ACTÉE(Actœa L.). Genre de plantes de la famill...</td>\n",
-       "      <td>Histoire naturelle</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            id  ...          class_bert\n",
-       "0  abrabeses-0  ...          Géographie\n",
-       "1     accius-0  ...          Géographie\n",
-       "2  achenbach-2  ...          Géographie\n",
-       "3   acireale-0  ...          Géographie\n",
-       "4      actée-0  ...  Histoire naturelle\n",
-       "\n",
-       "[5 rows x 7 columns]"
-      ]
-     },
-     "execution_count": 46,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_LGE.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "3xkzdkrKQHwA"
-   },
-   "outputs": [],
-   "source": [
-    "df_LGE.to_csv(\"drive/MyDrive/Classification-EDdA/classification_LGE.tsv\", sep=\"\\t\")"
-   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/EDdA_Classification_ClassicModels.ipynb b/notebooks/EDdA_Classification_ClassicModels.ipynb
index fcb2ba0913c4ace29c5ba8cdda2ba0b89c1a5931..4a10ee3f243003a3359d927a5ffaf1239eddce0a 100644
--- a/notebooks/EDdA_Classification_ClassicModels.ipynb
+++ b/notebooks/EDdA_Classification_ClassicModels.ipynb
@@ -1,21 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "EDdA-Classification_ClassicModels.ipynb",
-      "provenance": [],
-      "collapsed_sections": [],
-      "machine_shape": "hm"
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
@@ -37,9 +20,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "D_uwiuJq3pAM"
       },
+      "outputs": [],
       "source": [
         "train_path = 'training_set.tsv'\n",
         "validation_path = 'validation_set.tsv'\n",
@@ -67,9 +52,7 @@
         "doc2vec_min_count = 12\n",
         "doc2vec_dm = 0\n",
         "doc2vec_workers = 8"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -82,6 +65,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -89,6 +73,7 @@
         "id": "FsAR4CsB3aUc",
         "outputId": "a5e4efde-a5c9-45f9-ef1c-9223b4d52ac6"
       },
+      "outputs": [],
       "source": [
         "from psutil import virtual_memory\n",
         "ram_gb = virtual_memory().total / 1e9\n",
@@ -98,22 +83,11 @@
         "  print('Not using a high-RAM runtime')\n",
         "else:\n",
         "  print('You are using a high-RAM runtime!')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Your runtime has 27.3 gigabytes of available RAM\n",
-            "\n",
-            "You are using a high-RAM runtime!\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -121,19 +95,10 @@
         "id": "h5MwRwL53aYY",
         "outputId": "bc4c4c16-fb20-404a-e044-550fc4ca907d"
       },
+      "outputs": [],
       "source": [
         "from google.colab import drive\n",
         "drive.mount('/content/drive')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Mounted at /content/drive\n"
-          ]
-        }
       ]
     },
     {
@@ -147,6 +112,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -154,6 +120,7 @@
         "id": "bcptSr6o3ac7",
         "outputId": "19713482-dfeb-4be3-e63c-35b4253cb9e5"
       },
+      "outputs": [],
       "source": [
         "import pandas as pd\n",
         "import numpy as np\n",
@@ -181,33 +148,11 @@
         "import string\n",
         "nltk.download('stopwords')\n",
         "nltk.download('punkt')"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
-            "[nltk_data]   Package stopwords is already up-to-date!\n",
-            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
-            "[nltk_data]   Package punkt is already up-to-date!\n"
-          ]
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "True"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 3
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -215,50 +160,9 @@
         "id": "dwSVXDtWZB5H",
         "outputId": "44e2aa14-726f-43af-aa6a-1b7899e1025b"
       },
+      "outputs": [],
       "source": [
         "!python -m spacy download fr_core_news_sm"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Collecting fr_core_news_sm==2.2.5\n",
-            "  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)\n",
-            "\u001b[K     |████████████████████████████████| 14.7 MB 5.5 MB/s \n",
-            "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from fr_core_news_sm==2.2.5) (2.2.4)\n",
-            "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.8.2)\n",
-            "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (57.4.0)\n",
-            "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.1.3)\n",
-            "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.6)\n",
-            "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.5)\n",
-            "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.23.0)\n",
-            "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.0)\n",
-            "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.4.1)\n",
-            "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.0.6)\n",
-            "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (7.4.0)\n",
-            "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.62.3)\n",
-            "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.19.5)\n",
-            "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.6)\n",
-            "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.8.2)\n",
-            "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.10.0.2)\n",
-            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.6.0)\n",
-            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.10)\n",
-            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2021.10.8)\n",
-            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.4)\n",
-            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.24.3)\n",
-            "Building wheels for collected packages: fr-core-news-sm\n",
-            "  Building wheel for fr-core-news-sm (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
-            "  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-py3-none-any.whl size=14727026 sha256=994d176b35663506dd047e65863238d29b9b60313ba0dee5997c107f116477aa\n",
-            "  Stored in directory: /tmp/pip-ephem-wheel-cache-c8y7i3ag/wheels/c9/a6/ea/0778337c34660027ee67ef3a91fb9d3600b76777a912ea1c24\n",
-            "Successfully built fr-core-news-sm\n",
-            "Installing collected packages: fr-core-news-sm\n",
-            "Successfully installed fr-core-news-sm-2.2.5\n",
-            "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
-            "You can now load the model via spacy.load('fr_core_news_sm')\n"
-          ]
-        }
       ]
     },
     {
@@ -272,9 +176,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "Tunf_CYi3afO"
       },
+      "outputs": [],
       "source": [
         "def create_dict(df, classColumnName):\n",
         "    return dict(df[classColumnName].value_counts())\n",
@@ -328,9 +234,7 @@
         "  model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)\n",
         "  return model\n",
         "  #return np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -343,28 +247,28 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "ybiJYL0h3ahh"
       },
+      "outputs": [],
       "source": [
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "LRKJzWmf3pCg"
       },
+      "outputs": [],
       "source": [
         "df_train = pd.read_csv(train_path, sep=\"\\t\")\n",
         "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -377,9 +281,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "6QQXybaQ3pE9"
       },
+      "outputs": [],
       "source": [
         "data_train = df_train[columnText].tolist()\n",
         "vectorizer_dic = {}\n",
@@ -423,9 +329,7 @@
         "      pickle.dump(vectorizer, file)\n",
         "    \n",
         "  vectorizer_dic[vectorizer_name] = vectorizer    "
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -438,9 +342,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "rx_0eV-M3pHc"
       },
+      "outputs": [],
       "source": [
         "classifier_dic = {}\n",
         "grid_param = {}\n",
@@ -460,15 +366,15 @@
         "  elif classifier_name == \"rfc\":\n",
         "    classifier_dic[classifier_name] = RandomForestClassifier()\n",
         "    grid_param[classifier_name] = { 'max_features': ['sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]}\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "pO7oyeAF7KPK"
       },
+      "outputs": [],
       "source": [
         "for clf_name, clf in classifier_dic.items():\n",
         "  if clf_name != 'bayes' :\n",
@@ -488,9 +394,7 @@
         "    # saving classifier\n",
         "    with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'wb') as file:\n",
         "      pickle.dump(clf, file)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -503,17 +407,18 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "TfKAjtVFblYe"
       },
+      "outputs": [],
       "source": [
         "dataset_name = [\"validation\", \"test\"]"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -521,341 +426,108 @@
         "id": "h8vZar8c7KRq",
         "outputId": "83511c89-9219-43d1-9e5a-820e75012166"
       },
+      "outputs": [],
       "source": [
         "for dataset in dataset_name:\n",
         "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+        "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # supprimer les NaN...\n",
         "  data_eval = df_eval[columnText].tolist()\n",
         "\n",
-        "  for classifier_name in classifier_list:\n",
+        "  for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+        "    \n",
+        "\n",
+        "    for classifier_name in classifier_list:\n",
         "\n",
-        "    for vectorizer_name in vectorizer_list:\n",
+        "      for vectorizer_name in vectorizer_list:\n",
         "\n",
-        "      clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
-        "      with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n",
-        "        clf = pickle.load(file)\n",
+        "        clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
+        "        with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n",
+        "          clf = pickle.load(file)\n",
         "\n",
-        "      vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
-        "      with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n",
-        "        vectorizer = pickle.load(file)\n",
+        "        vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n",
+        "        with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n",
+        "          vectorizer = pickle.load(file)\n",
         "\n",
-        "      if vectorizer_name != 'doc2vec' :\n",
-        "        vec_data = vectorizer.transform(data_eval)\n",
-        "      else : \n",
-        "        tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n",
-        "        vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n",
+        "        if vectorizer_name != 'doc2vec' :\n",
+        "          vec_data = vectorizer.transform(data_eval)\n",
+        "        else : \n",
+        "          tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n",
+        "          vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n",
         "\n",
         "\n",
-        "      y_pred = clf.predict(vec_data)\n",
+        "        y_pred = clf.predict(vec_data)\n",
         "\n",
         "\n",
-        "      report = classification_report(y_pred, df_eval[columnClass], output_dict = True)\n",
-        "      precision = []\n",
-        "      recall = []\n",
-        "      f1 = []\n",
-        "      support = []\n",
-        "      dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-        "      for c in df_eval[columnClass].unique() :\n",
-        "        precision.append(report[c]['precision'])\n",
-        "        recall.append(report[c]['recall'])\n",
-        "        f1.append(report[c]['f1-score'])\n",
-        "        support.append(report[c]['support'])\n",
+        "        report = classification_report(df_eval[columnClass], y_pred, output_dict = True)\n",
+        "        precision = []\n",
+        "        recall = []\n",
+        "        f1 = []\n",
+        "        support = []\n",
+        "        dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+        "        for c in df_eval[columnClass].unique() :\n",
+        "          precision.append(report[c]['precision'])\n",
+        "          recall.append(report[c]['recall'])\n",
+        "          f1.append(report[c]['f1-score'])\n",
+        "          support.append(report[c]['support'])\n",
         "\n",
-        "      accuracy = report['accuracy']\n",
-        "      weighted_avg = report['weighted avg']\n",
-        "      cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred)\n",
-        "      FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-        "      FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-        "      TP = np.diag(cnf_matrix)\n",
-        "      TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
+        "        cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred)\n",
+        "        FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+        "        FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+        "        TP = np.diag(cnf_matrix)\n",
+        "        TN = cnf_matrix.sum() - (FP + FN + TP)\n",
         "\n",
-        "      dff['class'] = df_eval[columnClass].unique()\n",
-        "      dff['precision'] = precision\n",
-        "      dff['recall'] = recall\n",
-        "      dff['f1-score'] = f1\n",
-        "      dff['support'] = support\n",
-        "      dff['FP'] = FP\n",
-        "      dff['FN'] = FN\n",
-        "      dff['TP'] = TP\n",
-        "      dff['TN'] = TN\n",
+        "        dff['class'] = df_eval[columnClass].unique()\n",
+        "        dff['precision'] = precision\n",
+        "        dff['recall'] = recall\n",
+        "        dff['f1-score'] = f1\n",
+        "        dff['support'] = support\n",
+        "        dff['FP'] = FP\n",
+        "        dff['FN'] = FN\n",
+        "        dff['TP'] = TP\n",
+        "        dff['TN'] = TN\n",
         "\n",
         "\n",
-        "      print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n",
+        "        print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n",
         "\n",
-        "      print(weighted_avg)\n",
-        "      print(accuracy)\n",
-        "      print(dff)\n",
+        "        print(weighted_avg)\n",
+        "        print(accuracy)\n",
+        "        print(dff)\n",
         "\n",
-        "      dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
-        "\n"
-      ],
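The substantive fix in this hunk is the argument order of `classification_report`: scikit-learn expects `(y_true, y_pred)`, so the old call `classification_report(y_pred, df_eval[columnClass], ...)` silently swapped the roles, transposing per-class precision and recall in every saved report. A toy illustration of the effect (labels are hypothetical, for demonstration only):

```python
from sklearn.metrics import classification_report

# toy labels, for illustration only
y_true = ["a", "a", "a", "b"]
y_pred = ["a", "b", "b", "b"]

right = classification_report(y_true, y_pred, output_dict=True)
wrong = classification_report(y_pred, y_true, output_dict=True)  # swapped

# swapping the arguments transposes precision and recall for each class
assert right["a"]["recall"] == wrong["a"]["precision"]
assert right["a"]["precision"] == wrong["a"]["recall"]
```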
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "validation_bayes_bagofwords_s10000\n",
-            "{'precision': 0.8377945389222964, 'recall': 0.619530464967571, 'f1-score': 0.6842670335331308, 'support': 10947}\n",
-            "0.619530464967571\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                         Droit - Jurisprudence   0.963590  ...     5  10735\n",
-            "1                                     Grammaire   0.321888  ...    46  10760\n",
-            "2                            Histoire naturelle   0.938776  ...    55  10665\n",
-            "3                                      Commerce   0.310249  ...    42  10679\n",
-            "4                                    Géographie   0.958193  ...     0  10839\n",
-            "5                                  Architecture   0.158491  ...     0  10863\n",
-            "6                                       Monnaie   0.000000  ...     4  10751\n",
-            "7                          Médecine - Chirurgie   0.735981  ...     3  10860\n",
-            "8                                       Métiers   0.917495  ...     0  10925\n",
-            "9               Militaire (Art) - Guerre - Arme   0.182186  ...     1  10845\n",
-            "10                                     Anatomie   0.245989  ...     1  10853\n",
-            "11                                          Jeu   0.000000  ...   112  10553\n",
-            "12                                    Pharmacie   0.000000  ...  1138   9191\n",
-            "13                                    Antiquité   0.209125  ...     0  10921\n",
-            "14                      Belles-lettres - Poésie   0.020513  ...   150  10358\n",
-            "15              Agriculture - Economie rustique   0.023585  ...  2269   8114\n",
-            "16                                Mathématiques   0.142857  ...   357   9728\n",
-            "17                                   Beaux-arts   0.000000  ...   874   9278\n",
-            "18  Physique - [Sciences physico-mathématiques]   0.364372  ...     0  10893\n",
-            "19                                       Marine   0.410468  ...   149  10579\n",
-            "20                                       Chasse   0.009804  ...     5  10850\n",
-            "21                              Arts et métiers   0.000000  ...    18  10819\n",
-            "22                                     Religion   0.526646  ...     0  10912\n",
-            "23                                       Blason   0.034483  ...    45  10699\n",
-            "24                                        Pêche   0.025641  ...     0  10926\n",
-            "25                                     Histoire   0.603041  ...     0  10886\n",
-            "26                           Maréchage - Manège   0.051546  ...    11  10814\n",
-            "27                                       Mesure   0.000000  ...     0  10924\n",
-            "28                          Economie domestique   0.000000  ...   315  10264\n",
-            "29                                  Philosophie   0.000000  ...   923   8722\n",
-            "30                                 Superstition   0.000000  ...     0  10888\n",
-            "31                                       Chimie   0.010638  ...     0  10854\n",
-            "32                                    Médailles   0.000000  ...    90  10659\n",
-            "33                                      Musique   0.082707  ...     0  10925\n",
-            "34                                   Caractères   0.000000  ...     1  10908\n",
-            "35                                    Spectacle   0.000000  ...   168  10570\n",
-            "36                                  Minéralogie   0.000000  ...     0  10938\n",
-            "37                                    Politique   0.000000  ...     0  10926\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "validation_bayes_tf_idf_s10000\n",
-            "{'precision': 0.9361172330822201, 'recall': 0.48853567187357266, 'f1-score': 0.6289575972884817, 'support': 10947}\n",
-            "0.48853567187357266\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                         Droit - Jurisprudence   0.922100  ...     0  10735\n",
-            "1                                     Grammaire   0.000000  ...     7  10760\n",
-            "2                            Histoire naturelle   0.888292  ...     0  10684\n",
-            "3                                      Commerce   0.036011  ...     1  10682\n",
-            "4                                    Géographie   0.995777  ...     0  10839\n",
-            "5                                  Architecture   0.003774  ...     0  10863\n",
-            "6                                       Monnaie   0.000000  ...     0  10752\n",
-            "7                          Médecine - Chirurgie   0.221963  ...     0  10860\n",
-            "8                                       Métiers   0.903579  ...     0  10925\n",
-            "9               Militaire (Art) - Guerre - Arme   0.004049  ...     0  10845\n",
-            "10                                     Anatomie   0.037433  ...     0  10853\n",
-            "11                                          Jeu   0.000000  ...    13  10585\n",
-            "12                                    Pharmacie   0.000000  ...  1089   9047\n",
-            "13                                    Antiquité   0.000000  ...     0  10921\n",
-            "14                      Belles-lettres - Poésie   0.000000  ...     0  10481\n",
-            "15              Agriculture - Economie rustique   0.000000  ...  2358   5636\n",
-            "16                                Mathématiques   0.000000  ...    14  10349\n",
-            "17                                   Beaux-arts   0.000000  ...   827   9314\n",
-            "18  Physique - [Sciences physico-mathématiques]   0.004049  ...     0  10893\n",
-            "19                                       Marine   0.088154  ...    32  10583\n",
-            "20                                       Chasse   0.000000  ...     0  10850\n",
-            "21                              Arts et métiers   0.000000  ...     0  10821\n",
-            "22                                     Religion   0.003135  ...     0  10912\n",
-            "23                                       Blason   0.000000  ...     1  10700\n",
-            "24                                        Pêche   0.000000  ...     0  10926\n",
-            "25                                     Histoire   0.023649  ...     0  10886\n",
-            "26                           Maréchage - Manège   0.000000  ...     0  10814\n",
-            "27                                       Mesure   0.000000  ...     0  10924\n",
-            "28                          Economie domestique   0.000000  ...    95  10502\n",
-            "29                                  Philosophie   0.000000  ...   909   8731\n",
-            "30                                 Superstition   0.000000  ...     0  10888\n",
-            "31                                       Chimie   0.000000  ...     0  10854\n",
-            "32                                    Médailles   0.000000  ...     1  10700\n",
-            "33                                      Musique   0.000000  ...     0  10925\n",
-            "34                                   Caractères   0.000000  ...     0  10908\n",
-            "35                                    Spectacle   0.000000  ...     1  10628\n",
-            "36                                  Minéralogie   0.000000  ...     0  10938\n",
-            "37                                    Politique   0.000000  ...     0  10926\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_bayes_bagofwords_s10000\n",
-            "{'precision': 0.8343333806034451, 'recall': 0.6158940397350994, 'f1-score': 0.6801987597575112, 'support': 13137}\n",
-            "0.6158940397350994\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                                      Histoire   0.579466  ...     3  12882\n",
-            "1                         Droit - Jurisprudence   0.953423  ...    44  12913\n",
-            "2                                    Géographie   0.953906  ...    58  12804\n",
-            "3                                       Métiers   0.922949  ...    48  12815\n",
-            "4                                  Architecture   0.150943  ...     0  13008\n",
-            "5                          Médecine - Chirurgie   0.744639  ...     0  13037\n",
-            "6                                 Mathématiques   0.225166  ...     2  12900\n",
-            "7                                     Grammaire   0.305357  ...     4  13032\n",
-            "8                                       Monnaie   0.000000  ...     0  13110\n",
-            "9                                      Commerce   0.327945  ...     1  13015\n",
-            "10                                     Anatomie   0.196429  ...     2  13025\n",
-            "11  Physique - [Sciences physico-mathématiques]   0.331081  ...   142  12652\n",
-            "12                                  Philosophie   0.000000  ...  1351  11028\n",
-            "13                      Belles-lettres - Poésie   0.008511  ...     0  13106\n",
-            "14              Militaire (Art) - Guerre - Arme   0.199324  ...   171  12399\n",
-            "15                                    Antiquité   0.183544  ...  2711   9779\n",
-            "16                           Maréchage - Manège   0.008621  ...   412  11633\n",
-            "17                                       Chasse   0.008197  ...  1054  11199\n",
-            "18              Agriculture - Economie rustique   0.011811  ...     0  13072\n",
-            "19                           Histoire naturelle   0.942755  ...   185  12697\n",
-            "20                                     Religion   0.535248  ...     1  13021\n",
-            "21                                       Mesure   0.000000  ...    34  12983\n",
-            "22                                      Musique   0.062500  ...     0  13095\n",
-            "23                              Arts et métiers   0.000000  ...    59  12838\n",
-            "24                                       Marine   0.425287  ...     0  13111\n",
-            "25                                       Blason   0.038095  ...     0  13064\n",
-            "26                                       Chimie   0.017857  ...    10  12976\n",
-            "27                          Economie domestique   0.000000  ...     0  13109\n",
-            "28                                   Beaux-arts   0.000000  ...   382  12312\n",
-            "29                                          Jeu   0.000000  ...  1114  10375\n",
-            "30                                        Pêche   0.000000  ...     0  13066\n",
-            "31                                    Politique   0.000000  ...     0  13025\n",
-            "32                                  Minéralogie   0.000000  ...    98  12817\n",
-            "33                                    Pharmacie   0.000000  ...     0  13111\n",
-            "34                                 Superstition   0.000000  ...     0  13090\n",
-            "35                                   Caractères   0.000000  ...   205  12686\n",
-            "36                                    Médailles   0.000000  ...     0  13126\n",
-            "37                                    Spectacle   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_bayes_tf_idf_s10000\n",
-            "{'precision': 0.9374431375624079, 'recall': 0.4883915658065007, 'f1-score': 0.6291194809131295, 'support': 13137}\n",
-            "0.4883915658065007\n",
-            "                                          class  precision  ...    TP     TN\n",
-            "0                                      Histoire   0.018284  ...     0  12883\n",
-            "1                         Droit - Jurisprudence   0.928017  ...     3  12913\n",
-            "2                                    Géographie   0.997185  ...     0  12821\n",
-            "3                                       Métiers   0.906379  ...     0  12819\n",
-            "4                                  Architecture   0.000000  ...     0  13008\n",
-            "5                          Médecine - Chirurgie   0.230019  ...     0  13037\n",
-            "6                                 Mathématiques   0.000000  ...     0  12902\n",
-            "7                                     Grammaire   0.000000  ...     0  13032\n",
-            "8                                       Monnaie   0.000000  ...     0  13110\n",
-            "9                                      Commerce   0.036952  ...     0  13015\n",
-            "10                                     Anatomie   0.013393  ...     0  13025\n",
-            "11  Physique - [Sciences physico-mathématiques]   0.003378  ...    16  12701\n",
-            "12                                  Philosophie   0.000000  ...  1315  10852\n",
-            "13                      Belles-lettres - Poésie   0.000000  ...     0  13106\n",
-            "14              Militaire (Art) - Guerre - Arme   0.003378  ...     0  12577\n",
-            "15                                    Antiquité   0.000000  ...  2834   6749\n",
-            "16                           Maréchage - Manège   0.000000  ...    13  12422\n",
-            "17                                       Chasse   0.000000  ...   978  11227\n",
-            "18              Agriculture - Economie rustique   0.000000  ...     0  13072\n",
-            "19                           Histoire naturelle   0.874776  ...    42  12702\n",
-            "20                                     Religion   0.002611  ...     0  13021\n",
-            "21                                       Mesure   0.000000  ...     0  12986\n",
-            "22                                      Musique   0.000000  ...     0  13095\n",
-            "23                              Arts et métiers   0.000000  ...     1  12841\n",
-            "24                                       Marine   0.096552  ...     0  13111\n",
-            "25                                       Blason   0.000000  ...     0  13064\n",
-            "26                                       Chimie   0.000000  ...     0  12977\n",
-            "27                          Economie domestique   0.000000  ...     0  13109\n",
-            "28                                   Beaux-arts   0.000000  ...   118  12608\n",
-            "29                                          Jeu   0.000000  ...  1094  10439\n",
-            "30                                        Pêche   0.000000  ...     0  13066\n",
-            "31                                    Politique   0.000000  ...     0  13025\n",
-            "32                                  Minéralogie   0.000000  ...     1  12840\n",
-            "33                                    Pharmacie   0.000000  ...     0  13111\n",
-            "34                                 Superstition   0.000000  ...     0  13090\n",
-            "35                                   Caractères   0.000000  ...     1  12754\n",
-            "36                                    Médailles   0.000000  ...     0  13126\n",
-            "37                                    Spectacle   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
+        "        dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+        "\n",
+        "        # enregistrer les predictions\n",
+        "        pd.DataFrame({'labels': pd.Series(df_eval[columnClass]), 'predictions': pd.Series(y_pred)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n",
+        "\n",
+        "  \n"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "mMiQo_sR7KWn"
       },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": []
     }
-  ]
-}
\ No newline at end of file
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "machine_shape": "hm",
+      "name": "EDdA-Classification_ClassicModels.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
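
The hunk above writes the per-class report and the predictions with paths built by chained string concatenation. A minimal sketch of the same layout with f-strings and os.path.join, assuming the notebook's variable names; OUT_DIR is a hypothetical stand-in for the Drive prefix:

    import os

    OUT_DIR = "drive/MyDrive/Classification-EDdA"  # assumed base directory

    def report_path(dataset, classifier_name, vectorizer_name, max_per_class):
        # e.g. .../reports/report_test_bayes_tf_idf_s10000.csv
        name = f"report_{dataset}_{classifier_name}_{vectorizer_name}_s{max_per_class}.csv"
        return os.path.join(OUT_DIR, "reports", name)

    def predictions_path(dataset, classifier_name, vectorizer_name, max_per_class):
        name = f"predictions_{dataset}_{classifier_name}_{vectorizer_name}_s{max_per_class}.csv"
        return os.path.join(OUT_DIR, "predictions", name)

    print(report_path("test", "bayes", "tf_idf", 10000))
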
diff --git a/notebooks/EDdA_Classification_DeepLearning.ipynb b/notebooks/EDdA_Classification_DeepLearning.ipynb
index d8e9ea64dd3f8eb0d5d3fa12bf8f3f9ee8fa4466..4bdd58e6756dc6b72fda0e6378d9986a9323e3c7 100644
--- a/notebooks/EDdA_Classification_DeepLearning.ipynb
+++ b/notebooks/EDdA_Classification_DeepLearning.ipynb
@@ -1,20 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "EDdA-Classification_DeepLearning.ipynb",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
@@ -36,9 +20,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "G5LT5n9O7SLt"
       },
+      "outputs": [],
       "source": [
         "train_path = 'training_set.tsv'\n",
         "validation_path = 'validation_set.tsv'\n",
@@ -55,9 +41,7 @@
         "max_len = 512 # \n",
         "epochs = 20\n",
         "embedding_dim = 300 "
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -70,6 +54,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -77,19 +62,18 @@
         "id": "Sp8d_Uus7SHJ",
         "outputId": "82929364-d0a1-4962-fcb4-47224a48e6cf"
       },
-      "source": [
-        "from google.colab import drive\n",
-        "drive.mount('/content/drive')"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Mounted at /content/drive\n"
           ]
         }
+      ],
+      "source": [
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
       ]
     },
     {
@@ -103,15 +87,15 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "bTIXsF6kBUdh"
       },
+      "outputs": [],
       "source": [
         "#!pip install zeugma\n",
         "#!pip install plot_model"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -124,6 +108,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -131,6 +116,18 @@
         "id": "HwWkSznz7SEv",
         "outputId": "02ecbbf8-556f-4567-b57d-6e13a4ca28ff"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+          ]
+        }
+      ],
       "source": [
         "from nltk.tokenize import word_tokenize\n",
         "import nltk\n",
@@ -164,19 +161,6 @@
         "from tqdm import tqdm\n",
         "import requests, zipfile, io\n",
         "import os, re, csv, math, codecs"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
-            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
-            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
-            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
-          ]
-        }
       ]
     },
     {
@@ -190,9 +174,11 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "4LJ5blQR7PUe"
       },
+      "outputs": [],
       "source": [
         "\n",
         "def resample_classes(df, classColumnName, numberOfInstances):\n",
@@ -201,20 +187,16 @@
         "  fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n",
         "  return df.groupby(classColumnName, as_index=False).apply(fn)\n",
         "    \n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "-Rh3JMDh7zYd"
       },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "markdown",
@@ -227,34 +209,35 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "FnbNT4NF7zal"
       },
+      "outputs": [],
       "source": [
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n",
         "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "WNqDms64lfaS"
       },
+      "outputs": [],
       "source": [
         "# download FastText\n",
         "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n",
         "r = requests.get(zip_file_url)\n",
         "z = zipfile.ZipFile(io.BytesIO(r.content))\n",
         "z.extractall()"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -262,73 +245,74 @@
         "id": "PGMIi0CAmqSd",
         "outputId": "09c034fd-f689-43a9-fd75-5923906d89bf"
       },
-      "source": [
-        "print('loading word embeddings...')\n",
-        "\n",
-        "embeddings_index = {}\n",
-        "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n",
-        "\n",
-        "for line in tqdm(f):\n",
-        "    values = line.rstrip().rsplit(' ')\n",
-        "    word = values[0]\n",
-        "    coefs = np.asarray(values[1:], dtype='float32')\n",
-        "    embeddings_index[word] = coefs\n",
-        "f.close()\n",
-        "\n",
-        "print('found %s word vectors' % len(embeddings_index))"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "loading word embeddings...\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "1999996it [03:40, 9087.22it/s]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "found 1999996 word vectors\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "\n"
           ]
         }
+      ],
+      "source": [
+        "print('loading word embeddings...')\n",
+        "\n",
+        "embeddings_index = {}\n",
+        "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n",
+        "\n",
+        "for line in tqdm(f):\n",
+        "    values = line.rstrip().rsplit(' ')\n",
+        "    word = values[0]\n",
+        "    coefs = np.asarray(values[1:], dtype='float32')\n",
+        "    embeddings_index[word] = coefs\n",
+        "f.close()\n",
+        "\n",
+        "print('found %s word vectors' % len(embeddings_index))"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "nRLaQUO97zcq"
       },
+      "outputs": [],
       "source": [
         "df_train = pd.read_csv(train_path, sep=\"\\t\")\n",
         "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n",
         "\n",
         "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n",
         "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "vGWAgBH87ze8"
       },
+      "outputs": [],
       "source": [
         "y_train  = df_train[columnClass]\n",
         "y_validation = df_validation[columnClass]\n",
@@ -338,12 +322,11 @@
         "\n",
         "y_train = encoder.fit_transform(y_train)\n",
         "y_validation = encoder.fit_transform(y_validation)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -352,13 +335,8 @@
         "id": "7OYjo_uhoqcX",
         "outputId": "79c4ff25-0476-4e12-d6ff-a8e073ee3f6c"
       },
-      "source": [
-        "df_validation.head()"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
             "text/html": [
               "<div>\n",
@@ -492,9 +470,13 @@
               "[5 rows x 13 columns]"
             ]
           },
+          "execution_count": 10,
           "metadata": {},
-          "execution_count": 10
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "df_validation.head()"
       ]
     },
     {
@@ -508,6 +490,7 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -515,6 +498,31 @@
         "id": "NTNh6kMTp_eU",
         "outputId": "3c1eb88c-7f1d-48f1-92bc-bc671f5e1bc1"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "pre-processing train data...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n",
+            "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tokenizing input data...\n",
+            "dictionary size:  95254\n"
+          ]
+        }
+      ],
       "source": [
         "#https://github.com/emmanuellaanggi/disaster_tweet_sentiment/blob/master/(Medium)_Text_Classification_Disaster_Tweet_.ipynb\n",
         "\n",
@@ -551,36 +559,11 @@
         "#pad sequences\n",
         "word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_len)\n",
         "word_seq_validation = sequence.pad_sequences(word_seq_validation, maxlen=max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "pre-processing train data...\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n",
-            "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "tokenizing input data...\n",
-            "dictionary size:  95254\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -588,13 +571,8 @@
         "id": "Wj8RkOhT_e2c",
         "outputId": "56152da7-47b7-4b07-84e7-8c499671d53e"
       },
-      "source": [
-        "word_seq_validation"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
             "text/plain": [
               "array([[  0,   0,   0, ..., 293,   8,   7],\n",
@@ -606,13 +584,18 @@
               "       [  0,   0,   0, ..., 188, 213,  37]], dtype=int32)"
             ]
           },
+          "execution_count": 12,
           "metadata": {},
-          "execution_count": 12
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "word_seq_validation"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -620,6 +603,16 @@
         "id": "wGjQI0YgpQAS",
         "outputId": "43a3d902-5a8d-4159-a21e-419b5ee35d7d"
       },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "preparing embedding matrix...\n",
+            "number of null word embeddings: 70\n"
+          ]
+        }
+      ],
       "source": [
         "#embedding matrix\n",
         "\n",
@@ -639,21 +632,11 @@
         "    else:\n",
         "        words_not_found.append(word)\n",
         "print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "preparing embedding matrix...\n",
-            "number of null word embeddings: 70\n"
-          ]
-        }
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -661,23 +644,23 @@
         "id": "hjaeYIZCtGca",
         "outputId": "5ab4dd1a-a500-479f-e289-892242c83de8"
       },
-      "source": [
-        "print(\"sample words not found: \", np.random.choice(words_not_found, 10))"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "sample words not found:  ['especes' \"d'argent\" \"d'où\" \"d'argent\" \"qu'elle\" \"qu'elle\" \"c'étoit\"\n",
             " 'différens' 'faisoit' 'faisoit']\n"
           ]
         }
+      ],
+      "source": [
+        "print(\"sample words not found: \", np.random.choice(words_not_found, 10))"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -685,25 +668,10 @@
         "id": "4O0gnsX8pNVU",
         "outputId": "46feba64-b608-4b53-de15-b586dc24b880"
       },
-      "source": [
-        "from keras.layers import BatchNormalization\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "model = tf.keras.Sequential()\n",
-        "\n",
-        "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n",
-        "model.add(Bidirectional(LSTM(100)))\n",
-        "model.add(Dense(64,activation='relu'))\n",
-        "model.add(Dropout(0.2))\n",
-        "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n",
-        "model.add(Dense(numberOfClasses,activation='softmax'))\n",
-        "model.summary()"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Model: \"sequential\"\n",
             "_________________________________________________________________\n",
@@ -727,25 +695,40 @@
             "_________________________________________________________________\n"
           ]
         }
+      ],
+      "source": [
+        "from keras.layers import BatchNormalization\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "model = tf.keras.Sequential()\n",
+        "\n",
+        "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n",
+        "model.add(Bidirectional(LSTM(100)))\n",
+        "model.add(Dense(64,activation='relu'))\n",
+        "model.add(Dropout(0.2))\n",
+        "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n",
+        "model.add(Dense(numberOfClasses,activation='softmax'))\n",
+        "model.summary()"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "GcfMJl8f-cBA"
       },
+      "outputs": [],
       "source": [
         "\n",
         "#model = NN_withEmbeddings(longueur_dict, embedding_dim, max_len, numberOfClasses)\n",
         "\n",
         "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
         "#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=True)])"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -753,15 +736,10 @@
         "id": "OTQTH5VDuA3I",
         "outputId": "b8286232-4938-4591-b483-6b6d1bdc015e"
       },
-      "source": [
-        "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n",
-        "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Epoch 1/20\n",
             "83/83 [==============================] - 530s 6s/step - loss: 3.0575 - accuracy: 0.1886 - val_loss: 2.2493 - val_accuracy: 0.4315\n",
@@ -806,15 +784,19 @@
           ]
         },
         {
-          "output_type": "execute_result",
           "data": {
             "text/plain": [
               "<keras.callbacks.History at 0x7f4269526a90>"
             ]
           },
+          "execution_count": 17,
           "metadata": {},
-          "execution_count": 17
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n",
+        "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)"
       ]
     },
     {
@@ -828,27 +810,27 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "ykTp9lyRaAma"
       },
+      "outputs": [],
       "source": [
         "model.save(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "5J4xDoqRUSfS"
       },
+      "outputs": [],
       "source": [
         "# save embeddings\n",
         "\n",
         "# saving embeddings index \n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -861,14 +843,14 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "fKt8ft1t_Cxx"
       },
+      "outputs": [],
       "source": [
         "model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -881,471 +863,150 @@
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "G9pjdMdNW_KS"
-      },
-      "source": [
-        "predictions = model.predict(word_seq_validation)\n",
-        "predictions = np.argmax(predictions,axis=1)"
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "IHpVJ79IW_M0",
-        "outputId": "78e2a1aa-d35c-428c-e6c3-0ad332abcdfd"
-      },
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
       "source": [
-        "report = classification_report(predictions, y_validation, output_dict = True)\n",
+        "from sklearn.metrics import confusion_matrix\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
         "\n",
-        "print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "0.5773390217283461 {'precision': 0.5977985581006744, 'recall': 0.5773390217283461, 'f1-score': 0.5808733866443131, 'support': 10079}\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "9SKjWffUW_PC"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "LpgkGq-fW_RN"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "4gGNaPY1iuXD"
-      },
-      "source": [
-        "df_test = pd.read_csv(test_path, sep=\"\\t\")\n",
+        "for dataset in [\"test\", \"validation\"]:\n",
         "\n",
-        "encoder = preprocessing.LabelEncoder()\n",
-        "y_test = encoder.fit_transform(df_test[columnClass])\n"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "P67p7BUZiuZV",
-        "outputId": "f958a063-ee95-4157-fcd9-796991615f03"
-      },
-      "source": [
-        "raw_docs_test = df_test[columnText].tolist()\n",
+        "  df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n",
+        "  df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True)   # supprimer les NaN...\n",
+        "    \n",
+        "  encoder = preprocessing.LabelEncoder()\n",
+        "  y_test = encoder.fit_transform(df_eval[columnClass])\n",
         "\n",
-        "print(\"pre-processing test data...\")\n",
         "\n",
-        "stop_words = set(stopwords.words('french'))\n",
+        "  raw_docs_test = df_eval[columnText].tolist()\n",
         "\n",
-        "processed_docs_test = []\n",
-        "for doc in tqdm(raw_docs_test):\n",
-        "    tokens = word_tokenize(doc, language='french')\n",
-        "    filtered = [word for word in tokens if word not in stop_words]\n",
-        "    processed_docs_test.append(\" \".join(filtered))\n",
-        "#end for\n",
+        "  print(\"pre-processing test data...\")\n",
         "\n",
-        "print(\"tokenizing input data...\")\n",
-        "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n",
-        "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation)  #leaky\n",
-        "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n",
+        "  stop_words = set(stopwords.words('french'))\n",
         "\n",
-        "#pad sequences\n",
-        "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "pre-processing test data...\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "100%|██████████| 13137/13137 [00:09<00:00, 1317.07it/s]\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "tokenizing input data...\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "czeIqlD5iudH"
-      },
-      "source": [
-        "predictions = model.predict(word_seq_test)\n",
-        "predictions = np.argmax(predictions,axis=1)"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "Q9eYqi5SW_Ta",
-        "outputId": "3682a42a-7c07-446e-d913-3d20640fb2bf"
-      },
-      "source": [
-        "report = classification_report(predictions, y_test, output_dict = True)\n",
+        "  processed_docs_test = []\n",
+        "  for doc in tqdm(raw_docs_test):\n",
+        "      tokens = word_tokenize(doc, language='french')\n",
+        "      filtered = [word for word in tokens if word not in stop_words]\n",
+        "      processed_docs_test.append(\" \".join(filtered))\n",
+        "  #end for\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
+        "  print(\"tokenizing input data...\")\n",
+        "  #tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n",
+        "  #tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation)  #leaky\n",
+        "  word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n",
         "\n",
-        "print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "0.5957220065463956 {'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n",
-            "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
-            "  _warn_prf(average, modifier, msg_start, len(result))\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "ra4FOHVniwUI",
-        "outputId": "cbe576f6-ce14-49ef-9aba-2d26f76cab92"
-      },
-      "source": [
-        "from sklearn.metrics import confusion_matrix\n",
+        "  #pad sequences\n",
+        "  word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)\n",
         "\n",
-        "classesName = encoder.classes_\n",
-        "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "  for maxOfInstancePerClass in [500, 1500, 10000]:\n",
+        "      # il manque le model BERT s500 ...\n",
+        "      \n",
+        "      for classifier_name in [\"lstm\", 'cnn']:\n",
         "\n",
-        "precision = []\n",
-        "recall = []\n",
-        "f1 = []\n",
-        "support = []\n",
-        "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
-        "for c in classes:\n",
-        "  precision.append(report[c]['precision'])\n",
-        "  recall.append(report[c]['recall'])\n",
-        "  f1.append(report[c]['f1-score'])\n",
-        "  support.append(report[c]['support'])\n",
+        "        model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/\"+classifier_name+\"_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n",
         "\n",
-        "accuracy = report['accuracy']\n",
-        "weighted_avg = report['weighted avg']\n",
         "\n",
+        "        predictions = model.predict(word_seq_test)\n",
+        "        predictions = np.argmax(predictions,axis=1)\n",
         "\n",
-        "cnf_matrix = confusion_matrix(y_test, predictions)\n",
-        "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
-        "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
-        "TP = np.diag(cnf_matrix)\n",
-        "TN = cnf_matrix.sum() - (FP + FN + TP)\n",
         "\n",
-        "dff['className'] = classesName\n",
-        "dff['precision'] = precision\n",
-        "dff['recall'] = recall\n",
-        "dff['f1-score'] = f1\n",
-        "dff['support'] = support\n",
-        "dff['FP'] = FP\n",
-        "dff['FN'] = FN\n",
-        "dff['TP'] = TP\n",
-        "dff['TN'] = TN\n",
+        "        report = classification_report(y_test, predictions, output_dict = True)\n",
         "\n",
-        "print(\"test_lstm_s\"+str(maxOfInstancePerClass))\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
         "\n",
-        "print(weighted_avg)\n",
-        "print(accuracy)\n",
-        "print(dff)\n",
+        "        print(accuracy, weighted_avg)\n",
         "\n",
-        "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_lstm_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "test_lstm_s1500\n",
-            "{'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n",
-            "0.5957220065463956\n",
-            "                                      className  precision  ...    TP     TN\n",
-            "0               Agriculture - Economie rustique   0.259843  ...    66  12780\n",
-            "1                                      Anatomie   0.446429  ...   100  12818\n",
-            "2                                     Antiquité   0.525316  ...   166  12425\n",
-            "3                                  Architecture   0.518868  ...   165  12597\n",
-            "4                               Arts et métiers   0.007752  ...     1  13002\n",
-            "5                                    Beaux-arts   0.020000  ...     2  13016\n",
-            "6                       Belles-lettres - Poésie   0.200000  ...    47  12667\n",
-            "7                                        Blason   0.466667  ...    49  12908\n",
-            "8                                    Caractères   0.074074  ...     2  13110\n",
-            "9                                        Chasse   0.262295  ...    32  12929\n",
-            "10                                       Chimie   0.348214  ...    39  12952\n",
-            "11                                     Commerce   0.524249  ...   227  12442\n",
-            "12                        Droit - Jurisprudence   0.750176  ...  1063  11473\n",
-            "13                          Economie domestique   0.000000  ...     0  13106\n",
-            "14                                    Grammaire   0.587500  ...   329  12094\n",
-            "15                                   Géographie   0.830753  ...  2361  10167\n",
-            "16                                     Histoire   0.459916  ...   327  11749\n",
-            "17                           Histoire naturelle   0.687835  ...   769  11871\n",
-            "18                                          Jeu   0.415385  ...    27  13034\n",
-            "19                                       Marine   0.708046  ...   308  12497\n",
-            "20                           Maréchage - Manège   0.784483  ...    91  12991\n",
-            "21                                Mathématiques   0.450331  ...    68  12922\n",
-            "22                                       Mesure   0.333333  ...    14  13078\n",
-            "23              Militaire (Art) - Guerre - Arme   0.510135  ...   151  12719\n",
-            "24                                  Minéralogie   0.000000  ...     0  13111\n",
-            "25                                      Monnaie   0.041096  ...     3  13057\n",
-            "26                                      Musique   0.525000  ...    84  12922\n",
-            "27                                    Médailles   0.000000  ...     0  13109\n",
-            "28                         Médecine - Chirurgie   0.584795  ...   300  12279\n",
-            "29                                      Métiers   0.592378  ...   715  11248\n",
-            "30                                    Pharmacie   0.014085  ...     1  13065\n",
-            "31                                  Philosophie   0.160714  ...    18  12934\n",
-            "32  Physique - [Sciences physico-mathématiques]   0.533784  ...   158  12690\n",
-            "33                                    Politique   0.000000  ...     0  13111\n",
-            "34                                        Pêche   0.127660  ...     6  13067\n",
-            "35                                     Religion   0.357702  ...   137  12580\n",
-            "36                                    Spectacle   0.000000  ...     0  13126\n",
-            "37                                 Superstition   0.000000  ...     0  13112\n",
-            "\n",
-            "[38 rows x 9 columns]\n"
-          ]
-        }
+        "        classesName = encoder.classes_\n",
+        "        classes = [str(e) for e in encoder.transform(encoder.classes_)]\n",
+        "\n",
+        "        precision = []\n",
+        "        recall = []\n",
+        "        f1 = []\n",
+        "        support = []\n",
+        "        dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n",
+        "        for c in classes:\n",
+        "          precision.append(report[c]['precision'])\n",
+        "          recall.append(report[c]['recall'])\n",
+        "          f1.append(report[c]['f1-score'])\n",
+        "          support.append(report[c]['support'])\n",
+        "\n",
+        "        accuracy = report['accuracy']\n",
+        "        weighted_avg = report['weighted avg']\n",
+        "\n",
+        "\n",
+        "        cnf_matrix = confusion_matrix(y_test, predictions)\n",
+        "        FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n",
+        "        FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n",
+        "        TP = np.diag(cnf_matrix)\n",
+        "        TN = cnf_matrix.sum() - (FP + FN + TP)\n",
+        "\n",
+        "        dff['className'] = classesName\n",
+        "        dff['precision'] = precision\n",
+        "        dff['recall'] = recall\n",
+        "        dff['f1-score'] = f1\n",
+        "        dff['support'] = support\n",
+        "        dff['FP'] = FP\n",
+        "        dff['FN'] = FN\n",
+        "        dff['TP'] = TP\n",
+        "        dff['TN'] = TN\n",
+        "\n",
+        "        print(dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass))\n",
+        "\n",
+        "        print(weighted_avg)\n",
+        "        print(accuracy)\n",
+        "        print(dff)\n",
+        "\n",
+        "        dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n",
+        "        # enregistrer les predictions\n",
+        "        pd.DataFrame({'labels': pd.Series(y_test), 'predictions': pd.Series(predictions)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n"
       ]
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "x03FC0D-iwWP"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "gSVqcywgiwYH"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "-T5LfFtwiwaV"
-      },
-      "source": [
-        ""
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "Yjd5c70_iwcY"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "2UNjiHYliwes"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     },
     {
       "cell_type": "code",
-      "metadata": {
-        "id": "vLGTnit_W_V8"
-      },
-      "source": [
-        ""
-      ],
       "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "R-3lBXjDD9wE"
-      },
-      "source": [
-        "def predict(data, max_len):\n",
-        "  \n",
-        "  pad_sequ_test, _ = prepare_sequence(data, max_len)\n",
-        "  pred_labels_ = model.predict(pad_sequ_test)\n",
-        "\n",
-        "  return np.argmax(pred_labels_,axis=1)\n",
-        "\n",
-        "\n",
-        "def eval(data, labels, max_len):\n",
-        "  \n",
-        "  pred_labels_ = predict(data, max_len)\n",
-        "  report = classification_report(pred_labels_, labels, output_dict = True)\n",
-        "\n",
-        "  accuracy = report['accuracy']\n",
-        "  weighted_avg = report['weighted avg']\n",
-        "  \n",
-        "  print(accuracy, weighted_avg)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "EDdA-Classification_DeepLearning.ipynb",
+      "provenance": []
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "6T3kAvKvExgc",
-        "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386"
-      },
-      "source": [
-        "# evaluation sur le jeu de validation\n",
-        "eval(df_validation[columnText], y_validation, max_len)"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
-            "  return np.array(self.texts_to_sequences(texts))\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n"
-          ]
-        }
-      ]
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "pTDJA03_-8yu",
-        "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122"
-      },
-      "source": [
-        "# evaluation sur le jeu de test\n",
-        "df_test = pd.read_csv(test_path, sep=\"\\t\")\n",
-        "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n",
-        "\n",
-        "y_test = df_test[columnClass]\n",
-        "encoder = preprocessing.LabelEncoder()\n",
-        "y_test = encoder.fit_transform(y_test)\n",
-        "\n",
-        "eval(df_test[columnText], y_test, max_len)\n"
-      ],
-      "execution_count": null,
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
-            "  return np.array(self.texts_to_sequences(texts))\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n"
-          ]
-        }
-      ]
+    "language_info": {
+      "name": "python"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
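
One cross-check on the evaluation loop added above: the FP/FN/TP/TN columns it derives from confusion_matrix agree with sklearn's multilabel_confusion_matrix, as this toy example verifies:

    import numpy as np
    from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

    y_true = np.array([0, 0, 1, 2, 2, 2])
    y_pred = np.array([0, 1, 1, 2, 0, 2])

    cnf = confusion_matrix(y_true, y_pred)
    TP = np.diag(cnf)
    FP = cnf.sum(axis=0) - TP  # column sums minus the diagonal
    FN = cnf.sum(axis=1) - TP  # row sums minus the diagonal
    TN = cnf.sum() - (FP + FN + TP)

    mcm = multilabel_confusion_matrix(y_true, y_pred)  # per class: [[TN, FP], [FN, TP]]
    assert (mcm[:, 1, 1] == TP).all() and (mcm[:, 0, 0] == TN).all()
    assert (mcm[:, 0, 1] == FP).all() and (mcm[:, 1, 0] == FN).all()
    print(TP, FP, FN, TN)
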