diff --git a/notebooks/Preprocessing.ipynb b/notebooks/Preprocessing.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..13d04f15852cab1e85a73ab0dccca39cb026eb9e
--- /dev/null
+++ b/notebooks/Preprocessing.ipynb
@@ -0,0 +1,2991 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "metallic-shelf",
+   "metadata": {},
+   "source": [
+    "# Préparation du corpus EDdA pour la classification en domaine"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "designing-advice",
+   "metadata": {},
+   "source": [
+    "## Preparing data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "floppy-fleet",
+   "metadata": {},
+   "source": [
+    "### Import des librairies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "appreciated-victim",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from bs4 import BeautifulSoup\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "import urllib, json\n",
+    "from urllib.request import urlopen"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7fc80b7",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "framed-fossil",
+   "metadata": {},
+   "source": [
+    "### Parsing des articles TEI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "suburban-honduras",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Root directory of the TEI corpus; the parsing cell lists its sub-directories.\n",
+    "# Set the EDDA_TEI_DIR environment variable to use another copy of the corpus;\n",
+    "# when it is unset the original local path is used unchanged.\n",
+    "# os.path.join(..., \"\") keeps a trailing separator so the value can still be\n",
+    "# concatenated directly with sub-paths.\n",
+    "input_path = os.path.join(\n",
+    "    os.environ.get(\"EDDA_TEI_DIR\", \"/Users/lmoncla/Documents/Data/Corpus/EDDA/Alice/EDdA/\"),\n",
+    "    \"\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "scenic-vermont",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Volume :  11\n",
+      "Volume :  16\n",
+      "Volume :  17\n",
+      "Volume :  10\n",
+      "Volume :  5\n",
+      "Volume :  2\n",
+      "Volume :  3\n",
+      "Volume :  4\n",
+      "Volume :  15\n",
+      "Volume :  12\n",
+      "Volume :  13\n",
+      "Volume :  14\n",
+      "Volume :  1\n",
+      "Volume :  6\n",
+      "Volume :  8\n",
+      "Volume :  9\n",
+      "Volume :  7\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Collect the metadata of every article (volume, number, headword, normalised\n",
+    "# class, original class and author) from the TEI files into a list of rows.\n",
+    "data = []\n",
+    "# (volume, numero) of the .tei files skipped because a metadata element, or its\n",
+    "# 'value' attribute, is missing; kept so the drops can be inspected afterwards.\n",
+    "skipped = []\n",
+    "\n",
+    "for tome in os.listdir(input_path):\n",
+    "    tome_path = os.path.join(input_path, tome)\n",
+    "    # Ignore stray files sitting next to the volume directories (e.g. a macOS\n",
+    "    # .DS_Store): calling os.listdir on them raises NotADirectoryError.\n",
+    "    if not os.path.isdir(tome_path):\n",
+    "        continue\n",
+    "\n",
+    "    # Directory name minus its first character (presumably 'T<n>' -> '<n>').\n",
+    "    volume = tome[1:]\n",
+    "    print(\"Volume : \", volume)\n",
+    "\n",
+    "    for article in os.listdir(tome_path):\n",
+    "        if article[-4:] != '.tei':\n",
+    "            continue\n",
+    "        # Drop a 7-character prefix (presumably 'article') and the '.tei' suffix.\n",
+    "        numero = article[7:-4]\n",
+    "\n",
+    "        try:\n",
+    "            # The context manager closes the handle as soon as the file is parsed;\n",
+    "            # a bare open() would leave one handle open per article.\n",
+    "            with open(os.path.join(tome_path, article)) as tei_file:\n",
+    "                soup = BeautifulSoup(tei_file)\n",
+    "\n",
+    "            # Each metadata item is the first element whose 'type' attribute matches.\n",
+    "            head = soup.find(type=\"head\")\n",
+    "            author = soup.find(type=\"author\")\n",
+    "            normclass = soup.find(type=\"normclass\")\n",
+    "            classEDdA = soup.find(type=\"class\")\n",
+    "\n",
+    "            data.append([\n",
+    "                int(volume),\n",
+    "                int(numero),\n",
+    "                head.get('value').strip(),\n",
+    "                normclass.get('value').strip(),\n",
+    "                classEDdA.get('value').strip(),\n",
+    "                author.get('value').strip(),\n",
+    "            ])\n",
+    "        except AttributeError:\n",
+    "            # find() or get('value') returned None: the article lacks one of the\n",
+    "            # expected metadata items. Skip it (best effort) but keep a record.\n",
+    "            skipped.append((volume, numero))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "excess-waterproof",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the article table from the parsed rows, ordered by volume and then by\n",
+    "# article number, with a fresh 0..n-1 index.\n",
+    "column_names = ['volume', 'numero', 'head', 'normClass', 'classEDdA', 'author']\n",
+    "df = (\n",
+    "    pd.DataFrame(data, columns=column_names)\n",
+    "    .sort_values(by=['volume', 'numero'])\n",
+    "    .reset_index(drop=True)\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "blocked-reading",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>41327</th>\n",
+       "      <td>10</td>\n",
+       "      <td>2211</td>\n",
+       "      <td>MILIAIRE fievre</td>\n",
+       "      <td>Médecine</td>\n",
+       "      <td>Medecine.</td>\n",
+       "      <td>Jaucourt</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>69509</th>\n",
+       "      <td>16</td>\n",
+       "      <td>3317</td>\n",
+       "      <td>TRIMONTIUM</td>\n",
+       "      <td>Géographie ancienne</td>\n",
+       "      <td>Géog. anc.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32448</th>\n",
+       "      <td>8</td>\n",
+       "      <td>1600</td>\n",
+       "      <td>HRADSCHIN</td>\n",
+       "      <td>Géographie</td>\n",
+       "      <td>Géog.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>60365</th>\n",
+       "      <td>14</td>\n",
+       "      <td>4069</td>\n",
+       "      <td>Sauveur</td>\n",
+       "      <td>Art numismatique</td>\n",
+       "      <td>Art numismat.</td>\n",
+       "      <td>Jaucourt</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15763</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3846</td>\n",
+       "      <td>CONFORMISTES, (non-)</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33076</th>\n",
+       "      <td>8</td>\n",
+       "      <td>2228</td>\n",
+       "      <td>Jardin</td>\n",
+       "      <td>Fauconnerie</td>\n",
+       "      <td>Fauconnerie.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6245</th>\n",
+       "      <td>2</td>\n",
+       "      <td>985</td>\n",
+       "      <td>Bassin</td>\n",
+       "      <td>Boulangerie</td>\n",
+       "      <td>terme de Boulanger</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1967</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1969</td>\n",
+       "      <td>ALTIN</td>\n",
+       "      <td>Commerce</td>\n",
+       "      <td>Commerce.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11512</th>\n",
+       "      <td>2</td>\n",
+       "      <td>6252</td>\n",
+       "      <td>CAVER</td>\n",
+       "      <td>Escrime</td>\n",
+       "      <td>en Escrime.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23387</th>\n",
+       "      <td>5</td>\n",
+       "      <td>2106</td>\n",
+       "      <td>EMPANNON</td>\n",
+       "      <td>Charpenterie</td>\n",
+       "      <td>Charpent.</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       volume  numero                  head            normClass  \\\n",
+       "41327      10    2211       MILIAIRE fievre             Médecine   \n",
+       "69509      16    3317            TRIMONTIUM  Géographie ancienne   \n",
+       "32448       8    1600             HRADSCHIN           Géographie   \n",
+       "60365      14    4069               Sauveur     Art numismatique   \n",
+       "15763       3    3846  CONFORMISTES, (non-)         unclassified   \n",
+       "33076       8    2228                Jardin          Fauconnerie   \n",
+       "6245        2     985                Bassin          Boulangerie   \n",
+       "1967        1    1969                 ALTIN             Commerce   \n",
+       "11512       2    6252                 CAVER              Escrime   \n",
+       "23387       5    2106              EMPANNON         Charpenterie   \n",
+       "\n",
+       "                classEDdA    author  \n",
+       "41327           Medecine.  Jaucourt  \n",
+       "69509          Géog. anc.  unsigned  \n",
+       "32448               Géog.  unsigned  \n",
+       "60365       Art numismat.  Jaucourt  \n",
+       "15763        unclassified  unsigned  \n",
+       "33076        Fauconnerie.  unsigned  \n",
+       "6245   terme de Boulanger  unsigned  \n",
+       "1967            Commerce.  unsigned  \n",
+       "11512         en Escrime.  unsigned  \n",
+       "23387           Charpent.  unsigned  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# display a random sample of 10 rows of the dataframe\n",
+    "df.sample(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "expired-click",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "74190"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# nombre d'articles dans le dataframe\n",
+    "len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "considered-adjustment",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>normClass</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>44</td>\n",
+       "      <td>44</td>\n",
+       "      <td>44</td>\n",
+       "      <td>44</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>17</td>\n",
+       "      <td>17</td>\n",
+       "      <td>17</td>\n",
+       "      <td>17</td>\n",
+       "      <td>17</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Abus des langues</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Accord de sons</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Acoustique</th>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  volume  numero  head  classEDdA  author\n",
+       "normClass                                                \n",
+       "                      44      44    44         44      44\n",
+       "0                     17      17    17         17      17\n",
+       "Abus des langues       1       1     1          1       1\n",
+       "Accord de sons         1       1     1          1       1\n",
+       "Acoustique             6       6     6          6       6"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# On regroupe les lignes du dataframe en fonction du normclass\n",
+    "classes = df.groupby(['normClass']).count()\n",
+    "classes.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "instructional-variation",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2908"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Nombre de classes \n",
+    "len(classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "handmade-contest",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "12685"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# nombre d'articles 'unclassified'\n",
+    "len(df.loc[df['normClass']==\"unclassified\",:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "crude-olympus",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1614"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# nombre de classes avec un seul article\n",
+    "len(classes.loc[classes['volume']==1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "sized-barrier",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2656"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# nombre de classes avec moins de 20 articles\n",
+    "len(classes.loc[classes['volume']<20])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "indian-selection",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "weighted-hanging",
+   "metadata": {},
+   "source": [
+    "### Enregistrement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "stainless-stewart",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# enregistrement du résultat du groupby\n",
+    "#classes['volume'].to_csv('classesEDdA.tsv',sep='\\t',header=False)  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "hearing-olive",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# enregistrement du dataframe (permet de ne pas reparser tous les fichiers TEI pour recharger ce dataframe)\n",
+    "df.to_csv('../../../Data/EDdA-Classification/EDdA_dataframe_orginal.tsv',sep='\\t', index=False)  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "stuck-courage",
+   "metadata": {},
+   "source": [
+    "### Lecture"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "thick-destiny",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('../../../Data/EDdA-Classification/EDdA_dataframe_orginal.tsv', sep='\\t')  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "typical-munich",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Title Page</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unsigned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Diderot &amp; d'Alembert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>d'Alembert</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>A, a &amp; a</td>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n",
+       "      <td>Dumarsais5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>A</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Dumarsais5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   volume  numero                                head     normClass  \\\n",
+       "0       1       1                          Title Page  unclassified   \n",
+       "1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  unclassified   \n",
+       "2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS  unclassified   \n",
+       "3       1       5                            A, a & a     Grammaire   \n",
+       "4       1       6                                   A  unclassified   \n",
+       "\n",
+       "                                           classEDdA                author  \n",
+       "0                                       unclassified              unsigned  \n",
+       "1                                       unclassified  Diderot & d'Alembert  \n",
+       "2                                       unclassified            d'Alembert  \n",
+       "3  ordre Encyclopéd. Entend. Science de l'homme, ...            Dumarsais5  \n",
+       "4                                       unclassified            Dumarsais5  "
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "individual-protection",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ensembles de domaines et domaines ENCCRE\n",
+    "domaines_regroupes = {}\n",
+    "domaines_regroupes['Agriculture - Economie rustique'] = ['Agriculture', 'Economie rustique', 'Fontainier', 'Graines', 'Jardinage', 'Moulin', 'Sucre', 'Tabac', 'Vigne', 'Vin']\n",
+    "domaines_regroupes['Anatomie'] = ['Anatomie', 'Economie animale']\n",
+    "domaines_regroupes['Antiquité'] = ['Antiquité', 'Iconologie', 'Mythologie']\n",
+    "domaines_regroupes['Architecture'] = ['Architecture', 'Carreleur', 'Carrier', 'Coupe des pierres', 'Couvreur', 'Décoration', 'Maçonnerie']\n",
+    "domaines_regroupes['Arts et métiers'] = ['Arts et métiers', 'Arts mécaniques', 'Manufacture']\n",
+    "# Member spelled 'Gravure' (it read 'Gravue'), so that a lookup on the normalised name 'gravure' can match this group.\n",
+    "domaines_regroupes['Beaux-arts'] = ['Beaux-arts', 'Dessin', 'Gravure', 'Peinture', 'Sculpture']\n",
+    "domaines_regroupes['Belles-lettres - Poésie'] = ['Belles-lettres', 'Eloquence', 'Littérature', 'Poésie', 'Rhétorique']\n",
+    "domaines_regroupes['Blason'] = ['Blason']\n",
+    "domaines_regroupes['Caractères'] = ['Caractères', 'Ecriture']\n",
+    "domaines_regroupes['Chasse'] = ['Chasse', 'Fauconnerie', 'Oisellerie', 'Vénerie']\n",
+    "domaines_regroupes['Chimie'] = ['Alchimie', 'Chimie', 'Docimasie']\n",
+    "domaines_regroupes['Commerce'] = ['Commerce', 'Marchand', 'Voiturier']\n",
+    "domaines_regroupes['Droit - Jurisprudence'] = ['Chancellerie', 'Corporation', 'Douane', 'Droit', 'Eaux et Forêts', 'Finance', 'Jurisprudence', 'Palais']\n",
+    "domaines_regroupes['Economie domestique'] = ['Cuisine','Economie domestique']\n",
+    "#domaines_regroupes['Géographie'] = ['Géographie', 'Géographie Histoire naturelle', 'Géographie ancienne', 'Géographie des Arabes', 'Géographie du moyen âge',\n",
+    "#                                   'Géographie ecclésiastique', 'Géographie historique', 'Géographie maritime ancienne', 'Géographie des Romains', 'Géographie morderne',\n",
+    "#                                   'Géographie naturelle', 'Géographie physique', 'Géographie sacrée', 'Géographie sainte', 'Géographie transcendante', 'Géographie transcendantee']\n",
+    "domaines_regroupes['Géographie'] = ['Géographie', 'Topographie']\n",
+    "domaines_regroupes['Grammaire'] = ['Grammaire', 'Langues', 'Synonymes']\n",
+    "domaines_regroupes['Histoire'] = ['Calendrier','Chevalerie','Chronologie','Coutumes','Généalogie','Histoire','Inscriptions','Inventions', 'Voyage']\n",
+    "domaines_regroupes['Histoire naturelle'] = ['Botanique','Conchyliologie','Fossiles','Histoire naturelle', 'Ichtyologie','Insectologie','Ophiologie','Ornithologie','Zoologie']\n",
+    "domaines_regroupes['Jeu'] = ['Jeu']\n",
+    "domaines_regroupes['Maréchage - Manège'] = ['Maréchage', 'Manège']\n",
+    "domaines_regroupes['Marine'] = ['Galère','Marine', 'Navigation', 'Rivière']\n",
+    "domaines_regroupes['Mathématiques'] = ['Algèbre','Analyse des hasards', 'Arithmétique', 'Arpentage','Géométrie', 'Mathématiques', 'Trigonométrie']\n",
+    "domaines_regroupes['Médailles'] = ['Médailles','Numismatique']\n",
+    "domaines_regroupes['Médecine - Chirurgie'] = ['Chirurgie', 'Diète', 'Gymnastique', 'Maladie', 'Matière médicale', 'Médecine', 'Pathologie', 'Physiologie', 'Séméiotique', 'Thérapeutique']\n",
+    "domaines_regroupes['Mesure'] = ['Balancier', 'Jaugeage', 'Mesure', 'Poids']\n",
+    "domaines_regroupes[\"Métiers\"] = ['Boucherie', 'Boulangerie', 'Brasserie', 'Charcuterie', 'Confiserie', 'Distillation', 'Epicerie', 'Pâtisserie', 'Rôtisserie', 'Vinaigrier']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Bois', 'Boissellerie', 'Charpenterie', 'Charronnage', 'Coffretier', 'Ebénisterie', 'Formier', 'Layeterie', 'Menuiserie', 'Tonnelier', 'Vannerie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Bourrelier', 'Boyaudier', 'Cardier', 'Chamoiseur', 'Corroierie', 'Cuir', 'Gainier', 'Hongroyeur', 'Maroquinier', 'Mégisserie', 'Parcheminerie', 'Peausserie', 'Pelleterie', 'Sellier', 'Tannerie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Aiguilletier-Epinglier', 'Ardoiserie', 'Argent', \"Batteur d'or\", 'Bijouterie', 'Bimblotier', 'Chaînetier', 'Chaudronnerie', 'Ciselure', 'Cloche', 'Cloutier', 'Coutellerie', 'Cuivre', 'Diamantaire', 'Dorure', 'Eperonnier', 'Fer']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Ferblanterie', 'Fonderie', 'Forge', 'Fourbisseur', 'Glaces', 'Joaillier', 'Lapidaire', 'Lunetier', 'Marbrier', 'Maréchal-grossier', 'Métal', 'Metteur en oeuvre', 'Miroiterie', 'Or', 'Orfèvrerie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Pierres', 'Plomberie', \"Potier d'étain\", 'Serrurerie', 'Taillanderie', \"Tireur d'or\", 'Verrerie', 'Vitrerie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Cartier', 'Cartonnier', 'Imprimerie', 'Librairie', 'Marbreur de papier', 'Papeterie', 'Reliure']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Bas au métier', 'Blanchissage des toiles', 'Blondier', 'Bonneterie', 'Bottier', 'Bourserie', 'Boutonnier', 'Broderie', 'Cardeur', 'Ceinturier', 'Chapellerie', 'Cordonnerie','Coton', 'Couture', 'Découpeur', 'Dentelle', 'Draperie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Etoffe', 'Fil', 'Friseur', 'Ganterie', 'Gazier', 'Laine', 'Lingerie', 'Mode', 'Ourdissage', 'Passementerie', 'Perruquier', 'Plumasserie', 'Rubanerie', 'Soierie', 'Tailleur', 'Tapisserie', 'Teinturerie', 'Tisserand', 'Toilerie', 'Tonderie de drap']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Amidonnier', 'Blanchisserie de cire', 'Chandelier', 'Cirerie', 'Corderie', 'Emailleur', 'Eventailliste', 'Faïencier', 'Filassier', 'Fleuriste', 'Horlogerie', 'Marqueterie', 'Métiers peu attestés', 'Parfumeur', 'Paumier', 'Poterie']\n",
+    "domaines_regroupes[\"Métiers\"] += ['Salpêtrerie', 'Savonnerie', 'Sel', 'Tabatière', 'Tabletier-Cornetier', 'Tourneur', 'Vergetier', 'Vernisseur']\n",
+    "domaines_regroupes['Militaire (Art) - Guerre - Arme'] = ['Armurerie', 'Artificier', 'Artillerie', 'Canon','Escrime','Fortification','Guerre','Milice','Militaire']\n",
+    "domaines_regroupes['Minéralogie'] = ['Lithologie','Métallurgie','Minéralogie']\n",
+    "domaines_regroupes['Monnaie'] = ['Monnaie']\n",
+    "domaines_regroupes['Musique'] = ['Danse', 'Lutherie','Musique','Orgue', 'Voix']\n",
+    "domaines_regroupes['Pêche'] = ['Pêche']\n",
+    "domaines_regroupes['Pharmacie'] = ['Drogues', 'Pharmacie']\n",
+    "domaines_regroupes['Philosophie'] = ['Education', 'Logique', 'Métaphysique', 'Morale', 'Philologie','Philosophie', 'Sciences']\n",
+    "domaines_regroupes['Physique - [Sciences physico-mathématiques]'] = ['Acoustique', 'Astrologie', 'Astronomie', 'Cosmographie-Cosmologie', 'Gnomonique', 'Hydraulique', 'Mécanique', 'Optique', 'Perspective', 'Physique', 'Science microscopique']\n",
+    "domaines_regroupes['Politique'] = ['Economie', 'Gouvernement', 'Police', 'Politique']\n",
+    "domaines_regroupes['Religion'] = ['Critique sacrée', 'Culte', 'Eglise', 'Histoire ecclésiastique', 'Idolâtrie', 'Religion', 'Théologie']\n",
+    "domaines_regroupes['Spectacle'] = ['Opéra','Spectacle', 'Théâtre']\n",
+    "domaines_regroupes['Superstition'] = ['Divination', 'Magie', 'Superstition']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "variable-instrument",
+   "metadata": {},
+   "source": [
+    "### Récupération correspondance EDdA / ENCCRE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "south-equation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_correspondances = pd.read_csv(\"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Classification domaines EDdA/correspondances_ARTFL-ENCCRE.csv\") \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "protecting-incentive",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>path</th>\n",
+       "      <th>entreeid</th>\n",
+       "      <th>tome</th>\n",
+       "      <th>article</th>\n",
+       "      <th>adresse</th>\n",
+       "      <th>entree</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>T1/article5</td>\n",
+       "      <td>v1-1-0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>T1/article6</td>\n",
+       "      <td>v1-1-1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>T1/article7</td>\n",
+       "      <td>v1-1-2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>T1/article8</td>\n",
+       "      <td>v1-1-3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>T1/article9</td>\n",
+       "      <td>v1-1-4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          path entreeid  tome  article  adresse  entree\n",
+       "0  T1/article5   v1-1-0     1        5        1       0\n",
+       "1  T1/article6   v1-1-1     1        6        1       1\n",
+       "2  T1/article7   v1-1-2     1        7        1       2\n",
+       "3  T1/article8   v1-1-3     1        8        1       3\n",
+       "4  T1/article9   v1-1-4     1        9        1       4"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_correspondances.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "continuous-feedback",
+   "metadata": {},
+   "source": [
+    "### Test récupération données ENCCRE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7820200b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib, json\n",
+    "from urllib.request import urlopen"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "spread-feature",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch one ENCCRE article and decode its JSON payload. The context manager\n",
+    "# closes the HTTP response once it has been read.\n",
+    "# NOTE: this rebinds `data`, which held the parsed TEI rows earlier in the notebook.\n",
+    "with urlopen(\"http://enccre.academie-sciences.fr/icefront/api/article/v1-544-0\") as json_url:\n",
+    "    data = json.load(json_url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "facial-syndicate",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'géographie'"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['annotations']['constit'][0]['domgen'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "removed-nickel",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_key(val):\n",
+    "    \"\"\"Return the key of `domaines_regroupes` under which `val` is listed.\n",
+    "\n",
+    "    Each listed value is compared after removing its spaces and lower-casing it;\n",
+    "    `val` is compared as given. The first matching key (in dict order) wins.\n",
+    "    Returns None when no value matches.\n",
+    "    \"\"\"\n",
+    "    for ensemble, domaines in domaines_regroupes.items():\n",
+    "        if any(val == domaine.replace(\" \", \"\").lower() for domaine in domaines):\n",
+    "            return ensemble\n",
+    "    return None\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "nuclear-murder",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Histoire naturelle\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(get_key(\"histoirenaturelle\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "placed-homework",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip installs into the environment of the running kernel; a bare `!pip` may target another Python.\n",
+    "%pip install -q git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git\n",
+    "%pip install -q spacy\n",
+    "import sys\n",
+    "# Download the French spaCy model with the kernel's own interpreter.\n",
+    "!\"{sys.executable}\" -m spacy download fr_core_news_sm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "extraordinary-settlement",
+   "metadata": {},
+   "source": [
+    "### Ajout des colonnes domaines, texte, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0c378939",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /Users/lmoncla/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /Users/lmoncla/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import string\n",
+    "import nltk\n",
+    "from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer\n",
+    "\n",
+    "import spacy\n",
+    "nlp = spacy.load(\"fr_core_news_sm\")\n",
+    "\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "96448195",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# NLTK French stop words, extended with extra words chosen for this corpus.\n",
+    "lst_stopwords = nltk.corpus.stopwords.words(\"french\")\n",
+    "lst_stopwords += ['plus',  'dun', 'deux', 'autre', 'cette', 'quelque', 'étoit', 'avoit', 'si', 'dont', 'quon', 'voyez', 'lautre', 'comme', 'fait', 'aussi', 'leurs', 'tous', 'toute', 'autres', 'dit', 'selon', 'tout']\n",
+    "lst_stopwords += ['étoient', 'faire', 'lon', 'celle', 'ainsi', 'quelle', 'être', 'faut', 'peut', 'entre', 'elles', 'ceux', 'donc', 'celui', 'nest', 'dautre', 'doit', 'cet', ]\n",
+    "lst_stopwords += [\"un\", \"deux\", \"trois\", \"quatre\", \"cinq\", \"six\", \"sept\", \"huit\", \"neuf\", \"dix\", \"très\", \"plus\", \"ni\", \"fit\", \"parce\", \"dire\"]\n",
+    "lst_stopwords += [\"douze\", \"toutes\", \"après\"]\n",
+    "lst_stopwords += [\"l\\'\", \"qu'\", \"s'\", \"c'\", \"d'\", \"n'\", \"j'\", \"m'\", \"t'\", \"jusqu'\", \"lorsqu'\", \"puisqu'\", \"quoiqu'\"]\n",
+    "\n",
+    "lem = FrenchLefffLemmatizer()\n",
+    "\n",
+    "def utils_preprocess_text(content_str):\n",
+    "    \"\"\"Tokenize, filter and lemmatize a text.\n",
+    "\n",
+    "    The text is tokenized with the spaCy pipeline `nlp`. Tokens whose lower-cased\n",
+    "    form is in `lst_stopwords`, and tokens made only of ASCII punctuation\n",
+    "    characters, are dropped; the remaining tokens are lower-cased and lemmatized\n",
+    "    with `lem`.\n",
+    "\n",
+    "    Returns a tuple (number of spaCy tokens in the input, before any filtering;\n",
+    "    space-joined string of the kept lemmas).\n",
+    "    \"\"\"\n",
+    "    doc = nlp(content_str)  # spaCy tokenization\n",
+    "    # Set built on each call: constant-time lookups, and later edits of lst_stopwords are honoured.\n",
+    "    stopwords = set(lst_stopwords)\n",
+    "\n",
+    "    lemmas = []\n",
+    "    for token in doc:\n",
+    "        lowered = token.text.lower()\n",
+    "        if lowered in stopwords:\n",
+    "            continue\n",
+    "        # Test every character: `text in string.punctuation` is a substring test\n",
+    "        # and lets multi-character tokens such as \"...\" through.\n",
+    "        if all(char in string.punctuation for char in token.text):\n",
+    "            continue\n",
+    "        lemmas.append(lem.lemmatize(lowered))\n",
+    "\n",
+    "    return len(doc), \" \".join(lemmas)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "de693cc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8 indication donner aujourd'hui ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "a, b = utils_preprocess_text(\"L'indication qu'il faut donner aujourd'hui ...\")\n",
+    "print(a, b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "cf7bb9ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"A, préposition vient du latin à, à dextris, à sinistris, à droite, à gauche. Plus souvent encore notre\\nà vient de la préposition latine ad, loqui ad, parler\\nà. On trouve aussi dicere ad. Cic. It lucrum ad me,\\n(Plaute) le profit en vient à moi. Sinite parvulos venire ad me, laissez venir ces enfans à moi.\\nObservez que a mot, n'est jamais que ou la troisieme \\npersonne du présent de l'indicatis du verbe\\navoir, ou une simple préposition. Ainsi à n'est jamais\\nadverbe, comme quelques Grammairiens l'ont cru,\\nquoiqu il entre dans plusieurs façons de parler adverbiales.\\nCar l'adverbe n'a pas besoin d'être suivi d'un\\nautre mot qui le détermine, ou, comme disent communément \\nles Grammairiens, l'adverbe n'a jamais\\nde régime ; parce que l'adverbe renferme en soi la\\npréposition & le nom : prudemment, avec prudence.\\n(V. Adverbe) au lieu que la préposition a toûjours\\nun régime, c'est-à-dire, qu'elle est toujours suivie\\nd'un autre mot, qui détermine la relation ou l'espece\\nde rapport que la préposition indique. Ainsi la préposition \\nà peut bien entrer, comme toutes les autres\\nprépositions, dans des façons de parler adverbiales:\\nmais comme elle est toûjours suivie de son complément,\\nou, comme on dit, de son régime, elle ne peut\\njamais être adverbe.\\nA n'est pas non plus une simple particule qui marque\\n\\n le datif ; parce qu'en françois nous n'avons ni\\ndéclinaison, ni cas, ni par conséquent de datif. V.\\nCas. Le rapport que les Latins marquoient par la\\nterminaison du datif, nous l'indiquons par la préposition \\nà. C'est ainsi que les Latins mêmes se sont servis\\nde la préposition ad, quod attinet ad me. Cic. Accedit\\nad, referre ad aliquem, & alicui. Ils disoient aussi également loqui ad aliquem, & loqui alicui, parler à quelqu'un, &c.\\nA l'égard des différens usages de la préposition à,\\nil faut observer 1. que toute préposition est entre\\ndeux termes, qu'elle lie & qu'elle met en rapport.\\n2. 
Que ce rapport est souvent marqué par la signification \\npropre de la préposition même, comme\\navec, dans, sur, &c.\\n3. Mais que souvent aussi les prépositions, surtout\\nà, de ou du, outre le rapport qu'elles indiquent quand\\nelles sont prises dans leur sens primitif & propre, ne\\nsont ensuite par figure & par extension, que de simples \\nprépositions unitives ou indicatives, qui ne font\\nque mettre deux mots en rapport ; ensorte qu'alors\\nc'est à l'esprit même à remarquer la sorte de rapport\\nqu'il y a entre les deux termes de la relation unis entre-eux par la préposition : par exemple, approchez-vous du feu : du, lie feu avec approchez-vous, & l'esprit \\nobserve ensuite un rapport d'approximation,\\nque du ne marque pas. Eloignez-vous du feu ; du, lie\\nfeu avec éloignez-vous, & l'esprit observe-là un rapport \\nd'éloignement. Vous voyez que la même préposition \\nsert à marquer des rapports opposés. On dit\\nde même donner à & ôter à. Ainsi ces sortes de rapports\\ndifferent autant que les mots different entre-eux.\\nJe crois donc que lorsque les prépositions ne sont,\\nou ne paroissent pas prises dans le sens propre de leur\\npremiere destination, & que par conséquent elles\\nn'indiquent pas par elles-mêmes la sorte de rapport\\nparticulier que celui qui parle veut faire entendre ;\\nalors c'est à celui qui écoute ou qui lit, à reconnoître\\nla sorte de rapport qui se trouve entre les mots liés\\npar la préposition simplement unitive & indicative.\\nCependant quelques Grammairiens ont mieux aimé \\népuiser la Métaphysique la plus recherchée, &\\nsi je l'ose dire, la plus inutile & la plus vaine, que\\nd'abandonner le Lecteur au discernement que lui donne \\nla connoissance & l'usage de sa propre Langue.\\nRapport de cause, rapport d'effet, d'instrument, de situation,\\nd'époque, table à pieds de biche, c'est-là un rapport \\nde forme, dit M. l'Abbé Girard, tom. II. p. 199.\\nBassin à barbe, rapport de service, (id. ib.) 
Pierre à feu,\\nrapport de propriété productive, (id. ib.) &c. La préposition \\nà n'est point destinée à marquer par elle-même un rapport de propriété productive, ou de service,\\nou de forme, &c. quoique ces rapports se trouvent\\nentre les mots liés par la préposition à. D'ailleurs,\\nles mêmes rapports sont souvent indiqués par des\\nprépositions différentes, & souvent des rapports opposés \\nsont indiqués par la même préposition.\\nIl me paroit donc que l'on doit d'abord observer la\\npremiere & principale destination d'une préposition.\\nPar exemple : la principale destination de la préposition \\nà, est de marquer la relation d'une chose à une\\nautre, comme, le terme où l'on va, ou à quoi ce\\nqu'on fait se termine, le but, la sin, l'attribution,\\nle pourquoi. Aller à Rome, préter de l'argent à usure,\\nà gros intérét. Donner quelque chose à quelqu'un, &c.\\nLes autres usages de cette préposition reviennent ensuite \\nà ceux-là par catachrese, abus, extension, ou\\nimitation : mais il est bon de remarquer quelques-uns\\nde ces usages, afin d'avoir des exemples qui puissent\\nservir de regle, & aider à décider les doutes par analogie \\n& par imitation. On dit donc:\\n\\nAprès un nom substantif.\\nAir à chanter. Billet à ordre, c'est-à-dire, payable\\n\\n\\nà ordre. Chaise à deux. Doute à éclaircir. Entreprise à\\nexécuter. Femme à la hotte? (au vocatif). Grenier à\\nsel. Habit à la mode. Instrument à vent. Lettre de change\\nà vûe, à dix jours de vûe. Matiere à procès. Nez à lunette.\\nOEufs à la coque. Plaine à perte de vûe. Question\\nà juger. Route à gauche. Vache à lait.\\n\\nA après un adjectif\\nAgréable à la vûe. Bon à prendre & à laisser. Contraire à la santé. Délicieux à manger. Facile à faire.\\nObservez qu'on dit : Il est facile de faire cela.\\n\\nQuand on le veut il est facile\\nDe s'assûrer un repos plein d'appas. 
Quinault.\\nLa raison de cette différence est que dans le dernier \\nexemple de n'a pas rapport à facile, mais à il ; il,\\nhoc, cela, à savoir de faire, &c. est facile, est une\\nchose facile. Ainsi, il, de s'assûrer un repos plein d'appas, est le sujet de la proposition, & est facile en est\\nl'attribut.\\n\\nQu'il est doux de trouver dans un amant qu'on aime\\nUn époux que l'on doit aimer! (Idem.)\\nIl, à savoir, de trouver un époux dans un amant,\\n&c. est doux, est une chose douce. (V. Proposition).\\nIl est gauche à tout ce qu'il fait. Heureux à la guerre.\\nHabile à dessiner, à écrire. Payable a ordre. Pareil à,\\n&c. Propre à, &c. Semblable à, &c. Utile à la santé.\\n\\nAprès un verbe.\\nS'abandonner à ses passions. S'amuser à des bagatelles.\\nApplaudir à quelqu'un. Aimer à boire, à faire du bien.\\nLes hommes n'aiment point à admirer les autres ; ils\\ncherchent eux-mêmes à être goûtés & à être applaudis.\\nLa Bruyere. Aller à cheval, à califourchon, c'est-à-dire, jambe deçà, jambe delà. S'appliquer à, &c. S'attacher à, &c. Blesser a, il a été blessé à la jambe. Crier\\nà l'aide, au feu, au secours. Conseiller quelque chose à\\nquelqu'un. Donner à boire à quelqu'un. Demander à\\nboire. Etre à. Il est à écrire, à jouer. Il est à jeun. Il\\nest à Rome. Il est à cent lieues. Il est long-tems à venir.\\nCela est à faire, à taire, à publier, à payer. C'est à vous\\nà mettre le prix à votre marchandise. J'ai fait cela à votre\\nconsidération, à votre intention. Il faut des livres à votre\\nfils. Joüer à Colin Maillard, joüer à l'ombre, aux échecs.\\nGarder à vûe. La dépense se monte à cent écus, & la recette \\nà, &c. Monter à cheval. Payer à quelqu'un. Payer\\nà vûe, à jour marqué. Persuader à. Préter à. Puiser à\\nla source. Prendre garde à soi. Prendre à gauche. Ils\\nvont un à un, deux à deux, trois à trois. 
Voyons à qui\\nl'aura, c'est-à-dire, voyons à ceci, (attendamus ad\\nhoc nempe) à savoir qui l'aura.\\n\\nA avant une autre Préposition.\\nA se trouve quelquefois avant la préposition de\\ncomme en ces exemples.\\n\\nPeut-on ne pas céder à de si puissans charmes?\\nEt peut-on refuser son coeur\\nA de beaux yeux qui le demandent?\\nJe crois qu'en ces occasions il y a une ellipse synthétique.\\nL'esprit est occupé des charmes particuliers\\nqui l'ont frappé; & il met ces charmes au rang des\\ncharmes puissans, dont on ne sauroit se garantir.\\nPeut-on ne pas céder à ces charmes qui sont du nombre \\ndes charmes si puissans, &c. Peut-on ne pas céder\\nà l'attrait, au pouvoir de si puissans charmes? Peut-on\\nrefuser son coeur à ces yeux, qui sont de la classe\\ndes beaux yeux. L'usage abrege ensuite l'expression,\\n& introduit des façons de parler particulieres auxquelles \\non doit se conformer, & qui ne détruisent\\npas les regles.\\nAinsi, je crois que de ou des sont toûjours des prépositions \\nextractives, & que quand on dit des Savans\\nsoûtiennent, des hommes m'ont dit, &c. des Savans, des\\nhommes, ne sont pas au nominatif. Et de même quand\\non dit, j'ai vû des hommes, j'ai vû des femmes, &c. des\\n\\nhommes, des femmes, ne sont pas à l'accusatif ; car,\\nsi l'on veut bien y prendre garde, on reconnoîtra\\nque ex hominibus, ex mulieribus, &c. ne peuvent\\nêtre ni le sujet de la proposition, ni le terme de l'action \\ndu verbe ; & que celui qui parle veut dire, que\\nquelques-uns des Savans soûtiennent, &c. quelques-uns des hommes, quelques-unes des femmes, disent, &c.\\n\\nA après des adverbes.\\nOn ne se sert de la préposition à après un adverbe,\\nque lorsque l'adverbe marque relation. Alors l'adverbe \\nexprime la sorte de relation, & la préposition\\nindique le corrélatif. Ainsi, on dit conformément à.\\nOn a jugé conformément à l'Ordonnance de 1667. 
On\\ndit aussi relativement à.\\nD'ailleurs l'adverbe ne marquant qu'une circonstance \\nabsolue & déterminée de l'action, n'est pas\\nsuivi de la préposition à.\\n\\nA en des façons de parler adverbiales, & en celles qui\\nsont équivalentes à des prépositions Latines, ou de\\nquelqu'autre Langue.\\nA jamais, à toûjours. A l'encontre. Tour à tour.\\nPas à pas. Vis-à-vis. A pleines mains. A fur & à mesure.\\nA la fin, tandem, aliquando, C'est-à-dire, nempe,\\nscilicet. Suivre à la piste. Faire le diable à quatre.\\nSe faire tenir à quatre. A cause, qu'on rend en latin par\\nla proposition propter. A raison de. Jusqu'à, ou jusques\\nà. Au-delà. Au-dessus. Au-dessous. A quoi bon, quorsùm.\\nA la vûe, à la présence, ou en présence, coram.\\nTelles sont les principales occasions où l'usage a\\nconsacré la préposition à. Les exemples que nous venons \\nde rapporter, serviront a décider par analogie\\nles difficultés que l'on pourroit avoir sur cette préposition.\\nAu reste la préposition au est la même que la préposition \\nà. La seule différence qu'il y a entre l'une\\n& l'autre, c'est que à est un mot simple, & que au\\nest un mot composé.\\nAinsi il faut considérer la préposition à en deux\\nétats différens.\\nI. Dans son état simple : 1°. Rendez à César ce\\nqui appartient à Céfar ; 2°. se prêter à l'exemple ;\\n3°. se rendre à la raison. Dans le premier exemple\\nà est devant un nom sans article. Dans le second\\nexemple à est suivi de l'article masculin, parce que\\nle mot commence par une voyelle : à l'exemple, à\\nl'esprit, à l'amour. Enfin dans le dernier, la préposition \\nà précede l'article féminin, à la raison, à l'autorité.\\nII. Hors de ces trois cas, la préposition à devient\\nun mot composé par sa jonction avec l'article le ou\\navec l'article pluriel les. L'article le à cause du son\\nsourd de l'e muet a amené au, de sorte qu'au lieu\\nde dire à le nous disons au, si le nom ne commence\\npas par une voyelle. 
S'adonner au bien ; & au pluriel\\nau lieu de dire à les, nous changeons l en u, ce qui\\narrive souvent dans notre Langue, & nous disons\\naux, soit que le nom commence par une voyelle ou\\npar une consonne : aux hommes, aux femmes, &c.\\nainsi au est autant que à le, & aux que à les.\\nA est aussi une préposition inséparable qui entre\\ndans la composition des mots ; donner, s'adonner,\\nporter, apporter, mener, amener, &c. ce qui sert ou à\\nl'énergie, ou à marquer d'autres points de vûe ajoûtés \\nà la premiere signification du mot.\\nIl faut encore observer qu'en Grec à marque\\n1. Privation, & alors on l'appelle alpha privatif,\\nce que les Latins ont quelquefois imité, comme dans\\namens qui est compose de mens, entendement, intelligence,\\n& de l'alpha privatif. Nous avons conservé\\nplusieurs mots où se trouve l'alpha privatif, comme\\namazone, asyle, abysme, &c. l'alpha privatif vient\\nde la préposition ἄτερ, sine, sans.\\n2. A en composition marque augmentation, & alors\\nil vient de ἄγαν, beaucoup.\\n3. A avec un accent circonflexe & un esprit doux\\nἆ marque admiration, desir, surprise, comme notre\\nah! ou ha! vox quiritantis, optantis, admirantis, dit\\nRobertson. Ces divers usages de l'a en Grec ont\\ndonné lieu à ce vers des Racines Greques\\n\\nA fait un, prive, augmente, admire.\\nEn terme de Grammaire, & sur-tout de Grammaire Greque, on appelle a pur un a qui seul fait\\nune syllabe comme en φιλία, amicitia. (F)\""
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "## test cell: display the raw text of a single article\n",
+    "volume = 1\n",
+    "numero = 7\n",
+    "\n",
+    "# Zero-pad the volume number to two digits so the path is also right for volumes >= 10.\n",
+    "txt_file = \"/Users/lmoncla/Documents/Data/Corpus/EDDA/all_txt/volume{:02d}-{}.txt\".format(volume, numero)\n",
+    "\n",
+    "# Close the file deterministically; assumes the corpus files are UTF-8 rather than\n",
+    "# relying on the platform's default encoding.\n",
+    "with open(txt_file, \"r\", encoding=\"utf-8\") as txt_handle:\n",
+    "    txtContent = txt_handle.read()\n",
+    "txtContent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "pursuant-camel",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getDomaineEnccre(volume, numero, classEDDA, txt_dir=\"/Users/lmoncla/Documents/Data/Corpus/EDDA/all_txt\"):\n",
+    "    \"\"\"Collect the ENCCRE domain and the preprocessed text of one article.\n",
+    "\n",
+    "    volume, numero: tome and article number. They select the row of\n",
+    "        `df_correspondances` (columns 'tome' and 'article') and the text file\n",
+    "        `volumeVV-N.txt` (two-digit volume) inside `txt_dir`.\n",
+    "    classEDDA: class label as it appears in the article text; it is removed from\n",
+    "        the text, first in parentheses and then bare. A missing (NaN/None) label\n",
+    "        leaves the text unchanged.\n",
+    "    txt_dir: directory holding the article text files.\n",
+    "\n",
+    "    Returns a pd.Series of 8 values: ENCCRE entry id, ENCCRE domain, domain group\n",
+    "    (result of `get_key`, possibly None), raw text, preprocessed text without the\n",
+    "    class label, preprocessed first paragraph, token count of the text, token\n",
+    "    count of the first paragraph. Values that cannot be obtained stay \"\" (0 for\n",
+    "    the counts).\n",
+    "    \"\"\"\n",
+    "    import re\n",
+    "    import warnings\n",
+    "\n",
+    "    domaine = \"\"\n",
+    "    ensemble_domaine = \"\"\n",
+    "    entreeid = \"\"\n",
+    "\n",
+    "    # ENCCRE entry id of this article, if the correspondence table knows it.\n",
+    "    matches = df_correspondances.loc[(df_correspondances['tome'] == volume) & (df_correspondances['article'] == numero), 'entreeid']\n",
+    "    if not matches.empty:\n",
+    "        entreeid = matches.iloc[0]\n",
+    "        try:\n",
+    "            # The timeout keeps one stalled request from blocking a whole DataFrame.apply;\n",
+    "            # the context manager closes the HTTP response.\n",
+    "            with urlopen(\"http://enccre.academie-sciences.fr/icefront/api/article/\" + entreeid, timeout=30) as response:\n",
+    "                data = json.loads(response.read())\n",
+    "            domaine = data['annotations']['constit'][0]['domgen'][0]\n",
+    "            ensemble_domaine = get_key(domaine)\n",
+    "        except (KeyError, IndexError, TypeError):\n",
+    "            # The record carries no usable domain annotation: keep the defaults.\n",
+    "            pass\n",
+    "        except (OSError, ValueError) as err:\n",
+    "            # Network failure or malformed JSON: report it and still return the text columns.\n",
+    "            warnings.warn(\"ENCCRE lookup failed for \" + str(entreeid) + \": \" + str(err))\n",
+    "\n",
+    "    txt_file = os.path.join(txt_dir, \"volume{:02d}-{}.txt\".format(int(volume), numero))\n",
+    "    try:\n",
+    "        # Assumes the corpus files are UTF-8 rather than relying on the platform default.\n",
+    "        with open(txt_file, \"r\", encoding=\"utf-8\") as txt_handle:\n",
+    "            txtContent = txt_handle.read()\n",
+    "    except FileNotFoundError:\n",
+    "        return pd.Series([entreeid, domaine, ensemble_domaine, \"\", \"\", \"\", 0, 0])\n",
+    "\n",
+    "    # Remove the class label from the text, in parentheses first and then bare.\n",
+    "    # A NaN label is skipped: str(nan) == 'nan' would otherwise be cut out of words.\n",
+    "    txtContentWithoutClass = txtContent\n",
+    "    if not pd.isna(classEDDA):\n",
+    "        classEDDA = str(classEDDA)\n",
+    "        txtContentWithoutClass = txtContent.replace('(' + classEDDA + ')', \"\").replace(classEDDA, \"\")\n",
+    "\n",
+    "    # First paragraph = everything before the first blank line. The previous\n",
+    "    # separator '\\n \\n' did not match texts whose blank lines are a plain '\\n\\n'.\n",
+    "    firstParagraph = re.split(r'\\n[ \\t]*\\n', txtContentWithoutClass.lstrip(), maxsplit=1)[0]\n",
+    "\n",
+    "    nbWords, txtContentWithoutClass = utils_preprocess_text(txtContentWithoutClass)\n",
+    "    nbWords1stPara, firstParagraph = utils_preprocess_text(firstParagraph)\n",
+    "\n",
+    "    return pd.Series([entreeid, domaine, ensemble_domaine, txtContent, txtContentWithoutClass, firstParagraph, nbWords, nbWords1stPara])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "36ae000f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>21735</th>\n",
+       "      <td>5</td>\n",
+       "      <td>454</td>\n",
+       "      <td>Doyen des Avocats</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Boucher d'Argis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28995</th>\n",
+       "      <td>7</td>\n",
+       "      <td>1532</td>\n",
+       "      <td>GARBIN</td>\n",
+       "      <td>Marine</td>\n",
+       "      <td>Marine.</td>\n",
+       "      <td>Bellin</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5415</th>\n",
+       "      <td>2</td>\n",
+       "      <td>155</td>\n",
+       "      <td>BAGUE</td>\n",
+       "      <td>Histoire ancienne | Histoire moderne</td>\n",
+       "      <td>Hist. anc. &amp; mod.</td>\n",
+       "      <td>Diderot</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>53231</th>\n",
+       "      <td>13</td>\n",
+       "      <td>1375</td>\n",
+       "      <td>Prevôt des Bandes ou des Bandes françoises</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Boucher d'Argis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27878</th>\n",
+       "      <td>7</td>\n",
+       "      <td>415</td>\n",
+       "      <td>FOSSET</td>\n",
+       "      <td>Tonnelier | Economie rustique</td>\n",
+       "      <td>Econom. rustiq. ou Tonnelier.</td>\n",
+       "      <td>Diderot</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       volume  numero                                        head  \\\n",
+       "21735       5     454                           Doyen des Avocats   \n",
+       "28995       7    1532                                      GARBIN   \n",
+       "5415        2     155                                       BAGUE   \n",
+       "53231      13    1375  Prevôt des Bandes ou des Bandes françoises   \n",
+       "27878       7     415                                      FOSSET   \n",
+       "\n",
+       "                                  normClass                      classEDdA  \\\n",
+       "21735                          unclassified                   unclassified   \n",
+       "28995                                Marine                        Marine.   \n",
+       "5415   Histoire ancienne | Histoire moderne              Hist. anc. & mod.   \n",
+       "53231                          unclassified                   unclassified   \n",
+       "27878         Tonnelier | Economie rustique  Econom. rustiq. ou Tonnelier.   \n",
+       "\n",
+       "                author  \n",
+       "21735  Boucher d'Argis  \n",
+       "28995           Bellin  \n",
+       "5415           Diderot  \n",
+       "53231  Boucher d'Argis  \n",
+       "27878          Diderot  "
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.sample(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "christian-advice",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# One row per output field of getDomaineEnccre, one column per article.\n",
+    "enccre_fields = df.apply(lambda row: getDomaineEnccre(row.volume, row.numero, row.classEDdA), axis=1).T.values\n",
+    "(\n",
+    "    df['id_enccre'],\n",
+    "    df['domaine_enccre'],\n",
+    "    df['ensemble_domaine_enccre'],\n",
+    "    df['content'],\n",
+    "    df['contentWithoutClass'],\n",
+    "    df['firstParagraph'],\n",
+    "    df['nb_words'],\n",
+    "    df['nb_words_1stPara'],\n",
+    ") = enccre_fields\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "9ea62866",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "      <th>id_enccre</th>\n",
+       "      <th>domaine_enccre</th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>content</th>\n",
+       "      <th>contentWithoutClass</th>\n",
+       "      <th>firstParagraph</th>\n",
+       "      <th>nb_words</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>34467</th>\n",
+       "      <td>8</td>\n",
+       "      <td>3619</td>\n",
+       "      <td>JOUBARBE</td>\n",
+       "      <td>Botanique</td>\n",
+       "      <td>Botan.</td>\n",
+       "      <td>Jaucourt</td>\n",
+       "      <td>v8-2770-0</td>\n",
+       "      <td>botanique</td>\n",
+       "      <td>Histoire naturelle</td>\n",
+       "      <td>JOUBARBE, s. f. (Botan.) Sedum, genre de plant...</td>\n",
+       "      <td>joubarbe s. f.   sedum genre plante \\n fleur r...</td>\n",
+       "      <td>joubarbe s. f.   sedum genre plante \\n fleur r...</td>\n",
+       "      <td>854</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>67530</th>\n",
+       "      <td>16</td>\n",
+       "      <td>1338</td>\n",
+       "      <td>THYONÉ</td>\n",
+       "      <td>Mythologie</td>\n",
+       "      <td>Mytholog.</td>\n",
+       "      <td>unsigned</td>\n",
+       "      <td>v16-815-0</td>\n",
+       "      <td>mythologie</td>\n",
+       "      <td>Antiquité</td>\n",
+       "      <td>THYONÉ, (Mytholog.) c'est, selon Ovide, le\\nno...</td>\n",
+       "      <td>thyoné   ovide \\n nom sou lequel sémélé mise j...</td>\n",
+       "      <td>thyoné   ovide \\n nom sou lequel sémélé mise j...</td>\n",
+       "      <td>47</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25346</th>\n",
+       "      <td>6</td>\n",
+       "      <td>408</td>\n",
+       "      <td>Evêque de la cour</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unsigned</td>\n",
+       "      <td>v6-181-7</td>\n",
+       "      <td>histoireecclésiastique</td>\n",
+       "      <td>Religion</td>\n",
+       "      <td>Evêque de la cour ; on donne quelquetois ce\\nt...</td>\n",
+       "      <td>evêque cour donne quelquetois \\n titre grand a...</td>\n",
+       "      <td>evêque cour donne quelquetois \\n titre grand a...</td>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22927</th>\n",
+       "      <td>5</td>\n",
+       "      <td>1646</td>\n",
+       "      <td>EGRATIGNÉE (Maniere)</td>\n",
+       "      <td>Peinture</td>\n",
+       "      <td>Peint.</td>\n",
+       "      <td>Jaucourt</td>\n",
+       "      <td>v5-775-0</td>\n",
+       "      <td>peinture</td>\n",
+       "      <td>Beaux-arts</td>\n",
+       "      <td>EGRATIGNÉE, (Maniere) Peint. espece de\\npeintu...</td>\n",
+       "      <td>egratignée maniere   espece \\n peinture fresqu...</td>\n",
+       "      <td>egratignée maniere   espece \\n peinture fresqu...</td>\n",
+       "      <td>256</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>66518</th>\n",
+       "      <td>16</td>\n",
+       "      <td>326</td>\n",
+       "      <td>Tenaille</td>\n",
+       "      <td>Docimastique</td>\n",
+       "      <td>Docimastique.</td>\n",
+       "      <td>Jaucourt</td>\n",
+       "      <td>v16-170-1</td>\n",
+       "      <td>docimasie</td>\n",
+       "      <td>Chimie</td>\n",
+       "      <td>Tenaille, s. f. (Docimastique.) entre les uste...</td>\n",
+       "      <td>tenaille s. f.   ustensile \\n art essai rend i...</td>\n",
+       "      <td>tenaille s. f.   ustensile \\n art essai rend i...</td>\n",
+       "      <td>753</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       volume  numero                  head     normClass      classEDdA  \\\n",
+       "34467       8    3619              JOUBARBE     Botanique         Botan.   \n",
+       "67530      16    1338                THYONÉ    Mythologie      Mytholog.   \n",
+       "25346       6     408     Evêque de la cour  unclassified   unclassified   \n",
+       "22927       5    1646  EGRATIGNÉE (Maniere)      Peinture         Peint.   \n",
+       "66518      16     326              Tenaille  Docimastique  Docimastique.   \n",
+       "\n",
+       "         author  id_enccre          domaine_enccre ensemble_domaine_enccre  \\\n",
+       "34467  Jaucourt  v8-2770-0               botanique      Histoire naturelle   \n",
+       "67530  unsigned  v16-815-0              mythologie               Antiquité   \n",
+       "25346  unsigned   v6-181-7  histoireecclésiastique                Religion   \n",
+       "22927  Jaucourt   v5-775-0                peinture              Beaux-arts   \n",
+       "66518  Jaucourt  v16-170-1               docimasie                  Chimie   \n",
+       "\n",
+       "                                                 content  \\\n",
+       "34467  JOUBARBE, s. f. (Botan.) Sedum, genre de plant...   \n",
+       "67530  THYONÉ, (Mytholog.) c'est, selon Ovide, le\\nno...   \n",
+       "25346  Evêque de la cour ; on donne quelquetois ce\\nt...   \n",
+       "22927  EGRATIGNÉE, (Maniere) Peint. espece de\\npeintu...   \n",
+       "66518  Tenaille, s. f. (Docimastique.) entre les uste...   \n",
+       "\n",
+       "                                     contentWithoutClass  \\\n",
+       "34467  joubarbe s. f.   sedum genre plante \\n fleur r...   \n",
+       "67530  thyoné   ovide \\n nom sou lequel sémélé mise j...   \n",
+       "25346  evêque cour donne quelquetois \\n titre grand a...   \n",
+       "22927  egratignée maniere   espece \\n peinture fresqu...   \n",
+       "66518  tenaille s. f.   ustensile \\n art essai rend i...   \n",
+       "\n",
+       "                                          firstParagraph nb_words  \n",
+       "34467  joubarbe s. f.   sedum genre plante \\n fleur r...      854  \n",
+       "67530  thyoné   ovide \\n nom sou lequel sémélé mise j...       47  \n",
+       "25346  evêque cour donne quelquetois \\n titre grand a...       20  \n",
+       "22927  egratignée maniere   espece \\n peinture fresqu...      256  \n",
+       "66518  tenaille s. f.   ustensile \\n art essai rend i...      753  "
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview 5 random rows; the fixed seed keeps the displayed sample reproducible across runs\n",
+    "df.sample(5, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "daily-office",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11640"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of articles not classified by ENCCRE (according to the automatic matching)\n",
+    "len(df[df['domaine_enccre'] == \"\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "suited-methodology",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "12685"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of articles not classified by ARTFL\n",
+    "len(df[df['normClass'] == \"unclassified\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "special-investigation",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "      <th>id_enccre</th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>content</th>\n",
+       "      <th>contentWithoutClass</th>\n",
+       "      <th>firstParagraph</th>\n",
+       "      <th>nb_words</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>domaine_enccre</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11636</td>\n",
+       "      <td>11637</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "      <td>11640</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>acoustique</th>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>agriculture</th>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "      <td>112</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>aiguilletierepinglier</th>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>0</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "      <td>114</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>alchimie</th>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                       volume  numero   head  normClass  classEDdA  author  \\\n",
+       "domaine_enccre                                                               \n",
+       "                        11640   11640  11640      11636      11637   11640   \n",
+       "acoustique                 12      12     12         12         12      12   \n",
+       "agriculture               112     112    112        112        112     112   \n",
+       "aiguilletierepinglier     114     114    114        114        114     114   \n",
+       "alchimie                   24      24     24         24         24      24   \n",
+       "\n",
+       "                       id_enccre  ensemble_domaine_enccre  content  \\\n",
+       "domaine_enccre                                                       \n",
+       "                           11640                    11640    11640   \n",
+       "acoustique                    12                       12       12   \n",
+       "agriculture                  112                      112      112   \n",
+       "aiguilletierepinglier        114                        0      114   \n",
+       "alchimie                      24                       24       24   \n",
+       "\n",
+       "                       contentWithoutClass  firstParagraph  nb_words  \n",
+       "domaine_enccre                                                        \n",
+       "                                     11640           11640     11640  \n",
+       "acoustique                              12              12        12  \n",
+       "agriculture                            112             112       112  \n",
+       "aiguilletierepinglier                  114             114       114  \n",
+       "alchimie                                24              24        24  "
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# ENCCRE classes: per-column non-null counts for each domaine_enccre value\n",
+    "# (the empty-string label of unmatched articles forms a group of its own)\n",
+    "classes_enccre = df.groupby('domaine_enccre').count()\n",
+    "classes_enccre.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "legendary-independence",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "312"
+      ]
+     },
+     "execution_count": 72,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(classes_enccre)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "fourth-involvement",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the dataframe as a tab-separated file (row index not written)\n",
+    "df.to_csv(\n",
+    "    '../../../Data/EDdA-Classification/EDdA_dataframe_withContent.tsv',\n",
+    "    index=False,\n",
+    "    sep='\\t',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "framed-sodium",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('../../../Data/EDdA-Classification/EDdA_dataframe_withContent.tsv', sep='\\t')  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "tutorial-savannah",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "74190"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "minus-waterproof",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop rows missing any of the text fields or the ENCCRE domain-group label.\n",
+    "# The result is rebound instead of using inplace=True, so the cell does not mutate the loaded frame.\n",
+    "df = df.dropna(subset=['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "scenic-sugar",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "61362"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "seasonal-suspect",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# enregistrement du dataframe dans un fichier tsv\n",
+    "#df.to_csv('../../../Data/EDdA-Classification/EDdA_dataframe_withContent.tsv',sep='\\t',index=False)  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "opposed-binding",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "######\n",
+    "#df = pd.read_csv('../../../Data/EDdA-Classification/EDdA_dataframe_withContent.tsv', sep='\\t')  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "id": "endless-cathedral",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(61362, 13)"
+      ]
+     },
+     "execution_count": 78,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "id": "4062b7f5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>volume</th>\n",
+       "      <th>numero</th>\n",
+       "      <th>head</th>\n",
+       "      <th>normClass</th>\n",
+       "      <th>classEDdA</th>\n",
+       "      <th>author</th>\n",
+       "      <th>id_enccre</th>\n",
+       "      <th>domaine_enccre</th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>content</th>\n",
+       "      <th>contentWithoutClass</th>\n",
+       "      <th>firstParagraph</th>\n",
+       "      <th>nb_words</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>A, a &amp; a</td>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n",
+       "      <td>Dumarsais5</td>\n",
+       "      <td>v1-1-0</td>\n",
+       "      <td>grammaire</td>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>A, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n",
+       "      <td>a a a s.m ordre encyclopéd \\n entend science h...</td>\n",
+       "      <td>a a a s.m ordre encyclopéd \\n entend science h...</td>\n",
+       "      <td>1092</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>A</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Dumarsais5</td>\n",
+       "      <td>v1-1-1</td>\n",
+       "      <td>grammaire</td>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>A, mot, est 1. la troisieme personne du présen...</td>\n",
+       "      <td>a mot 1 troisieme personne présent \\n indicati...</td>\n",
+       "      <td>a mot 1 troisieme personne présent \\n indicati...</td>\n",
+       "      <td>381</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "      <td>A</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Dumarsais</td>\n",
+       "      <td>v1-1-2</td>\n",
+       "      <td>grammaire</td>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>A, préposition vient du latin à, à dextris, à ...</td>\n",
+       "      <td>a préposition vient latin dextris sinistris dr...</td>\n",
+       "      <td>a préposition vient latin dextris sinistris dr...</td>\n",
+       "      <td>3067</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>1</td>\n",
+       "      <td>10</td>\n",
+       "      <td>A, numismatique ou monétaire</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Mallet</td>\n",
+       "      <td>v1-1-5</td>\n",
+       "      <td>numismatique</td>\n",
+       "      <td>Médailles</td>\n",
+       "      <td>A, numismatique ou monétaire, sur le revers de...</td>\n",
+       "      <td>a numismatique monétaire revers \\n ancien méda...</td>\n",
+       "      <td>a numismatique monétaire revers \\n ancien méda...</td>\n",
+       "      <td>156</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>1</td>\n",
+       "      <td>11</td>\n",
+       "      <td>A, lapidaire</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>unclassified</td>\n",
+       "      <td>Mallet</td>\n",
+       "      <td>v1-1-6</td>\n",
+       "      <td>inscriptions</td>\n",
+       "      <td>Histoire</td>\n",
+       "      <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n",
+       "      <td>a lapidaire ancien inscription \\n marbre c. si...</td>\n",
+       "      <td>a lapidaire ancien inscription \\n marbre c. si...</td>\n",
+       "      <td>122</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   volume  numero                          head     normClass  \\\n",
+       "3       1       5                      A, a & a     Grammaire   \n",
+       "4       1       6                             A  unclassified   \n",
+       "5       1       7                             A  unclassified   \n",
+       "8       1      10  A, numismatique ou monétaire  unclassified   \n",
+       "9       1      11                  A, lapidaire  unclassified   \n",
+       "\n",
+       "                                           classEDdA      author id_enccre  \\\n",
+       "3  ordre Encyclopéd. Entend. Science de l'homme, ...  Dumarsais5    v1-1-0   \n",
+       "4                                       unclassified  Dumarsais5    v1-1-1   \n",
+       "5                                       unclassified   Dumarsais    v1-1-2   \n",
+       "8                                       unclassified      Mallet    v1-1-5   \n",
+       "9                                       unclassified      Mallet    v1-1-6   \n",
+       "\n",
+       "  domaine_enccre ensemble_domaine_enccre  \\\n",
+       "3      grammaire               Grammaire   \n",
+       "4      grammaire               Grammaire   \n",
+       "5      grammaire               Grammaire   \n",
+       "8   numismatique               Médailles   \n",
+       "9   inscriptions                Histoire   \n",
+       "\n",
+       "                                             content  \\\n",
+       "3  A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...   \n",
+       "4  A, mot, est 1. la troisieme personne du présen...   \n",
+       "5  A, préposition vient du latin à, à dextris, à ...   \n",
+       "8  A, numismatique ou monétaire, sur le revers de...   \n",
+       "9  A, lapidaire, dans les anciennes inscriptions ...   \n",
+       "\n",
+       "                                 contentWithoutClass  \\\n",
+       "3  a a a s.m ordre encyclopéd \\n entend science h...   \n",
+       "4  a mot 1 troisieme personne présent \\n indicati...   \n",
+       "5  a préposition vient latin dextris sinistris dr...   \n",
+       "8  a numismatique monétaire revers \\n ancien méda...   \n",
+       "9  a lapidaire ancien inscription \\n marbre c. si...   \n",
+       "\n",
+       "                                      firstParagraph  nb_words  \n",
+       "3  a a a s.m ordre encyclopéd \\n entend science h...      1092  \n",
+       "4  a mot 1 troisieme personne présent \\n indicati...       381  \n",
+       "5  a préposition vient latin dextris sinistris dr...      3067  \n",
+       "8  a numismatique monétaire revers \\n ancien méda...       156  \n",
+       "9  a lapidaire ancien inscription \\n marbre c. si...       122  "
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "corrected-batman",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the rows whose nb_words value reaches the minimum length threshold\n",
+    "MIN_WORDS = 15\n",
+    "df = df.loc[df['nb_words'] >= MIN_WORDS]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "documentary-prince",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(58509, 13)"
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "opened-november",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# 80/20 split stratified on the ENCCRE domain group, with a fixed seed.\n",
+    "# The whole dataframe is passed as X, so both splits keep every column, label included.\n",
+    "train_x, validation_x, train_y, validation_y = train_test_split(\n",
+    "    df,\n",
+    "    df[\"ensemble_domaine_enccre\"],\n",
+    "    test_size=0.2,\n",
+    "    random_state=42,\n",
+    "    stratify=df[\"ensemble_domaine_enccre\"],\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "noticed-evanescence",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(46807, 13)"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_x.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "welcome-homework",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(11702, 13)"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "validation_x.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "id": "hearing-moses",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write both splits as tab-separated files without the row index.\n",
+    "# Note: the held-out split (validation_x) is stored under the name test_set.tsv.\n",
+    "train_x.to_csv('../../../Data/EDdA-Classification/training_set.tsv', index=False, sep='\\t')\n",
+    "validation_x.to_csv('../../../Data/EDdA-Classification/test_set.tsv', index=False, sep='\\t')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "exterior-praise",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>counts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Agriculture - Economie rustique</td>\n",
+       "      <td>1163</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Anatomie</td>\n",
+       "      <td>1073</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Antiquité</td>\n",
+       "      <td>1362</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Architecture</td>\n",
+       "      <td>1389</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Arts et métiers</td>\n",
+       "      <td>563</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Beaux-arts</td>\n",
+       "      <td>429</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Belles-lettres - Poésie</td>\n",
+       "      <td>1031</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Blason</td>\n",
+       "      <td>539</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Caractères</td>\n",
+       "      <td>114</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Chasse</td>\n",
+       "      <td>581</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Chimie</td>\n",
+       "      <td>520</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Commerce</td>\n",
+       "      <td>1879</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Droit - Jurisprudence</td>\n",
+       "      <td>6419</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Economie domestique</td>\n",
+       "      <td>135</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>2258</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Géographie</td>\n",
+       "      <td>13104</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Histoire</td>\n",
+       "      <td>3080</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Histoire naturelle</td>\n",
+       "      <td>4814</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Jeu</td>\n",
+       "      <td>282</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>Marine</td>\n",
+       "      <td>2076</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Maréchage - Manège</td>\n",
+       "      <td>524</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>Mathématiques</td>\n",
+       "      <td>698</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>Mesure</td>\n",
+       "      <td>184</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>Militaire (Art) - Guerre - Arme</td>\n",
+       "      <td>1289</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>Minéralogie</td>\n",
+       "      <td>111</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>Monnaie</td>\n",
+       "      <td>317</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Musique</td>\n",
+       "      <td>685</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>Médailles</td>\n",
+       "      <td>117</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Médecine - Chirurgie</td>\n",
+       "      <td>2275</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Métiers</td>\n",
+       "      <td>5254</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>Pharmacie</td>\n",
+       "      <td>326</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>Philosophie</td>\n",
+       "      <td>470</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>Physique - [Sciences physico-mathématiques]</td>\n",
+       "      <td>1324</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>Politique</td>\n",
+       "      <td>116</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Pêche</td>\n",
+       "      <td>210</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>Religion</td>\n",
+       "      <td>1641</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Spectacle</td>\n",
+       "      <td>48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>Superstition</td>\n",
+       "      <td>109</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        ensemble_domaine_enccre  counts\n",
+       "0               Agriculture - Economie rustique    1163\n",
+       "1                                      Anatomie    1073\n",
+       "2                                     Antiquité    1362\n",
+       "3                                  Architecture    1389\n",
+       "4                               Arts et métiers     563\n",
+       "5                                    Beaux-arts     429\n",
+       "6                       Belles-lettres - Poésie    1031\n",
+       "7                                        Blason     539\n",
+       "8                                    Caractères     114\n",
+       "9                                        Chasse     581\n",
+       "10                                       Chimie     520\n",
+       "11                                     Commerce    1879\n",
+       "12                        Droit - Jurisprudence    6419\n",
+       "13                          Economie domestique     135\n",
+       "14                                    Grammaire    2258\n",
+       "15                                   Géographie   13104\n",
+       "16                                     Histoire    3080\n",
+       "17                           Histoire naturelle    4814\n",
+       "18                                          Jeu     282\n",
+       "19                                       Marine    2076\n",
+       "20                           Maréchage - Manège     524\n",
+       "21                                Mathématiques     698\n",
+       "22                                       Mesure     184\n",
+       "23              Militaire (Art) - Guerre - Arme    1289\n",
+       "24                                  Minéralogie     111\n",
+       "25                                      Monnaie     317\n",
+       "26                                      Musique     685\n",
+       "27                                    Médailles     117\n",
+       "28                         Médecine - Chirurgie    2275\n",
+       "29                                      Métiers    5254\n",
+       "30                                    Pharmacie     326\n",
+       "31                                  Philosophie     470\n",
+       "32  Physique - [Sciences physico-mathématiques]    1324\n",
+       "33                                    Politique     116\n",
+       "34                                        Pêche     210\n",
+       "35                                     Religion    1641\n",
+       "36                                    Spectacle      48\n",
+       "37                                 Superstition     109"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows in each ENCCRE domain group of the dataframe\n",
+    "df.groupby('ensemble_domaine_enccre').size().reset_index(name='counts')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "unable-agenda",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>counts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Agriculture - Economie rustique</td>\n",
+       "      <td>930</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Anatomie</td>\n",
+       "      <td>858</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Antiquité</td>\n",
+       "      <td>1090</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Architecture</td>\n",
+       "      <td>1111</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Arts et métiers</td>\n",
+       "      <td>451</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Beaux-arts</td>\n",
+       "      <td>343</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Belles-lettres - Poésie</td>\n",
+       "      <td>825</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Blason</td>\n",
+       "      <td>431</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Caractères</td>\n",
+       "      <td>91</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Chasse</td>\n",
+       "      <td>465</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Chimie</td>\n",
+       "      <td>416</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Commerce</td>\n",
+       "      <td>1503</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Droit - Jurisprudence</td>\n",
+       "      <td>5135</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Economie domestique</td>\n",
+       "      <td>108</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>1806</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Géographie</td>\n",
+       "      <td>10483</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Histoire</td>\n",
+       "      <td>2464</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Histoire naturelle</td>\n",
+       "      <td>3851</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Jeu</td>\n",
+       "      <td>226</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>Marine</td>\n",
+       "      <td>1661</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Maréchage - Manège</td>\n",
+       "      <td>419</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>Mathématiques</td>\n",
+       "      <td>558</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>Mesure</td>\n",
+       "      <td>147</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>Militaire (Art) - Guerre - Arme</td>\n",
+       "      <td>1031</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>Minéralogie</td>\n",
+       "      <td>89</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>Monnaie</td>\n",
+       "      <td>254</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Musique</td>\n",
+       "      <td>548</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>Médailles</td>\n",
+       "      <td>94</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Médecine - Chirurgie</td>\n",
+       "      <td>1820</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Métiers</td>\n",
+       "      <td>4203</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>Pharmacie</td>\n",
+       "      <td>261</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>Philosophie</td>\n",
+       "      <td>376</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>Physique - [Sciences physico-mathématiques]</td>\n",
+       "      <td>1059</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>Politique</td>\n",
+       "      <td>93</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Pêche</td>\n",
+       "      <td>168</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>Religion</td>\n",
+       "      <td>1313</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Spectacle</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>Superstition</td>\n",
+       "      <td>87</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        ensemble_domaine_enccre  counts\n",
+       "0               Agriculture - Economie rustique     930\n",
+       "1                                      Anatomie     858\n",
+       "2                                     Antiquité    1090\n",
+       "3                                  Architecture    1111\n",
+       "4                               Arts et métiers     451\n",
+       "5                                    Beaux-arts     343\n",
+       "6                       Belles-lettres - Poésie     825\n",
+       "7                                        Blason     431\n",
+       "8                                    Caractères      91\n",
+       "9                                        Chasse     465\n",
+       "10                                       Chimie     416\n",
+       "11                                     Commerce    1503\n",
+       "12                        Droit - Jurisprudence    5135\n",
+       "13                          Economie domestique     108\n",
+       "14                                    Grammaire    1806\n",
+       "15                                   Géographie   10483\n",
+       "16                                     Histoire    2464\n",
+       "17                           Histoire naturelle    3851\n",
+       "18                                          Jeu     226\n",
+       "19                                       Marine    1661\n",
+       "20                           Maréchage - Manège     419\n",
+       "21                                Mathématiques     558\n",
+       "22                                       Mesure     147\n",
+       "23              Militaire (Art) - Guerre - Arme    1031\n",
+       "24                                  Minéralogie      89\n",
+       "25                                      Monnaie     254\n",
+       "26                                      Musique     548\n",
+       "27                                    Médailles      94\n",
+       "28                         Médecine - Chirurgie    1820\n",
+       "29                                      Métiers    4203\n",
+       "30                                    Pharmacie     261\n",
+       "31                                  Philosophie     376\n",
+       "32  Physique - [Sciences physico-mathématiques]    1059\n",
+       "33                                    Politique      93\n",
+       "34                                        Pêche     168\n",
+       "35                                     Religion    1313\n",
+       "36                                    Spectacle      39\n",
+       "37                                 Superstition      87"
+      ]
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_x.groupby('ensemble_domaine_enccre').size().rename('counts').reset_index()  # number of rows of train_x for each distinct ensemble_domaine_enccre value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "potential-friday",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ensemble_domaine_enccre</th>\n",
+       "      <th>counts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Agriculture - Economie rustique</td>\n",
+       "      <td>233</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Anatomie</td>\n",
+       "      <td>215</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Antiquité</td>\n",
+       "      <td>272</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Architecture</td>\n",
+       "      <td>278</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Arts et métiers</td>\n",
+       "      <td>112</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Beaux-arts</td>\n",
+       "      <td>86</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Belles-lettres - Poésie</td>\n",
+       "      <td>206</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Blason</td>\n",
+       "      <td>108</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Caractères</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Chasse</td>\n",
+       "      <td>116</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Chimie</td>\n",
+       "      <td>104</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Commerce</td>\n",
+       "      <td>376</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Droit - Jurisprudence</td>\n",
+       "      <td>1284</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Economie domestique</td>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Grammaire</td>\n",
+       "      <td>452</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Géographie</td>\n",
+       "      <td>2621</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Histoire</td>\n",
+       "      <td>616</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Histoire naturelle</td>\n",
+       "      <td>963</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Jeu</td>\n",
+       "      <td>56</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>Marine</td>\n",
+       "      <td>415</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Maréchage - Manège</td>\n",
+       "      <td>105</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>Mathématiques</td>\n",
+       "      <td>140</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>Mesure</td>\n",
+       "      <td>37</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>Militaire (Art) - Guerre - Arme</td>\n",
+       "      <td>258</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>Minéralogie</td>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>Monnaie</td>\n",
+       "      <td>63</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Musique</td>\n",
+       "      <td>137</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>Médailles</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Médecine - Chirurgie</td>\n",
+       "      <td>455</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Métiers</td>\n",
+       "      <td>1051</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>Pharmacie</td>\n",
+       "      <td>65</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>Philosophie</td>\n",
+       "      <td>94</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>Physique - [Sciences physico-mathématiques]</td>\n",
+       "      <td>265</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>Politique</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Pêche</td>\n",
+       "      <td>42</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>Religion</td>\n",
+       "      <td>328</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Spectacle</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>Superstition</td>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        ensemble_domaine_enccre  counts\n",
+       "0               Agriculture - Economie rustique     233\n",
+       "1                                      Anatomie     215\n",
+       "2                                     Antiquité     272\n",
+       "3                                  Architecture     278\n",
+       "4                               Arts et métiers     112\n",
+       "5                                    Beaux-arts      86\n",
+       "6                       Belles-lettres - Poésie     206\n",
+       "7                                        Blason     108\n",
+       "8                                    Caractères      23\n",
+       "9                                        Chasse     116\n",
+       "10                                       Chimie     104\n",
+       "11                                     Commerce     376\n",
+       "12                        Droit - Jurisprudence    1284\n",
+       "13                          Economie domestique      27\n",
+       "14                                    Grammaire     452\n",
+       "15                                   Géographie    2621\n",
+       "16                                     Histoire     616\n",
+       "17                           Histoire naturelle     963\n",
+       "18                                          Jeu      56\n",
+       "19                                       Marine     415\n",
+       "20                           Maréchage - Manège     105\n",
+       "21                                Mathématiques     140\n",
+       "22                                       Mesure      37\n",
+       "23              Militaire (Art) - Guerre - Arme     258\n",
+       "24                                  Minéralogie      22\n",
+       "25                                      Monnaie      63\n",
+       "26                                      Musique     137\n",
+       "27                                    Médailles      23\n",
+       "28                         Médecine - Chirurgie     455\n",
+       "29                                      Métiers    1051\n",
+       "30                                    Pharmacie      65\n",
+       "31                                  Philosophie      94\n",
+       "32  Physique - [Sciences physico-mathématiques]     265\n",
+       "33                                    Politique      23\n",
+       "34                                        Pêche      42\n",
+       "35                                     Religion     328\n",
+       "36                                    Spectacle       9\n",
+       "37                                 Superstition      22"
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "validation_x.groupby('ensemble_domaine_enccre').size().rename('counts').reset_index()  # number of rows of validation_x for each distinct ensemble_domaine_enccre value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "demanding-essay",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "vanilla-italy",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "verified-compression",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}