From c83c5d0dfaba790e7a0c026d4ce3c5e786bd0bbc Mon Sep 17 00:00:00 2001
From: Ludovic Moncla <moncla.ludovic@gmail.com>
Date: Tue, 14 Mar 2023 22:16:29 +0100
Subject: [PATCH] Update Predict.ipynb

---
 notebooks/Predict.ipynb | 1949 +++++++++++++++++++++++++++------------
 1 file changed, 1382 insertions(+), 567 deletions(-)

diff --git a/notebooks/Predict.ipynb b/notebooks/Predict.ipynb
index 57eced6..e9e6378 100644
--- a/notebooks/Predict.ipynb
+++ b/notebooks/Predict.ipynb
@@ -169,20 +169,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 13,
-      "metadata": {
-        "id": "M2awiee1r0zV"
-      },
-      "outputs": [],
-      "source": [
-        "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n",
-        "drive_path = \"../\"\n",
-        "path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 3,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -194,21 +181,29 @@
       "source": [
         "#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv\n",
         "#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv\n",
-        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv"
+        "#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": 4,
       "metadata": {
-        "id": "eea7F4vato1x"
+        "id": "M2awiee1r0zV"
       },
       "outputs": [],
       "source": [
-        "#filepath = \"data/LGE_withContent.tsv\"\n",
-        "#filepath = \"EDdA_dataset_articles_no_superdomain.tsv\"\n",
+        "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n",
+        "drive_path = \"../\"\n",
+        "#path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\"\n",
+        "path = \"/Users/lmoncla/git/gitlab.liris/GEODE/LGE/output/\"\n",
+        "\n",
+        "\n",
         "#filepath = \"Parallel_datatset_articles_230215.tsv\"\n",
-        "filepath = \"EDdA_dataset_articles.tsv\""
+        "#filepath = \"EDdA_dataset_articles.tsv\"\n",
+        "filepath = \"LGE_dataset_articles_230314.tsv\"\n",
+        "\n",
+        "corpus = 'lge'\n",
+        "#corpus = ''"
       ]
     },
     {
@@ -244,135 +239,90 @@
               "  <thead>\n",
               "    <tr style=\"text-align: right;\">\n",
               "      <th></th>\n",
-              "      <th>volume</th>\n",
-              "      <th>numero</th>\n",
-              "      <th>head</th>\n",
-              "      <th>author</th>\n",
-              "      <th>edda_class</th>\n",
-              "      <th>enccre_id</th>\n",
-              "      <th>enccre_class</th>\n",
-              "      <th>content</th>\n",
-              "      <th>content_without_designant</th>\n",
-              "      <th>first_paragraph</th>\n",
-              "      <th>nb_words</th>\n",
-              "      <th>super_domain</th>\n",
+              "      <th>uid</th>\n",
+              "      <th>lge-volume</th>\n",
+              "      <th>lge-numero</th>\n",
+              "      <th>lge-head</th>\n",
+              "      <th>lge-page</th>\n",
+              "      <th>lge-id</th>\n",
+              "      <th>lge-content</th>\n",
+              "      <th>lge-nbWords</th>\n",
               "    </tr>\n",
               "  </thead>\n",
               "  <tbody>\n",
               "    <tr>\n",
               "      <th>0</th>\n",
+              "      <td>lge_1_a-0</td>\n",
               "      <td>1</td>\n",
               "      <td>1</td>\n",
-              "      <td>Title Page</td>\n",
-              "      <td>unsigned</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>151</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>A</td>\n",
+              "      <td>0</td>\n",
+              "      <td>a-0</td>\n",
+              "      <td>A(Ling.). Son vocal et première lettre de notr...</td>\n",
+              "      <td>1761.0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>1</th>\n",
+              "      <td>lge_1_a-1</td>\n",
               "      <td>1</td>\n",
               "      <td>2</td>\n",
-              "      <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n",
-              "      <td>Diderot &amp; d'Alembert</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>208</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "      <td>a-1</td>\n",
+              "      <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n",
+              "      <td>839.0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>2</th>\n",
+              "      <td>lge_1_a-2</td>\n",
               "      <td>1</td>\n",
               "      <td>3</td>\n",
-              "      <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n",
-              "      <td>d'Alembert</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n",
-              "      <td>44669</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>A</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-2</td>\n",
+              "      <td>A(Log.). Cette voyelle désigne les proposition...</td>\n",
+              "      <td>56.0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>3</th>\n",
+              "      <td>lge_1_a-3</td>\n",
               "      <td>1</td>\n",
-              "      <td>5</td>\n",
-              "      <td>A, a &amp; a</td>\n",
-              "      <td>Dumarsais5</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>v1-1-0</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>711</td>\n",
-              "      <td>Philosophie</td>\n",
+              "      <td>4</td>\n",
+              "      <td>A</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-3</td>\n",
+              "      <td>A(Mus.). La lettre a est employée par les musi...</td>\n",
+              "      <td>267.0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>4</th>\n",
+              "      <td>lge_1_a-4</td>\n",
               "      <td>1</td>\n",
-              "      <td>6</td>\n",
+              "      <td>5</td>\n",
               "      <td>A</td>\n",
-              "      <td>Dumarsais5</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-1</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>238</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-4</td>\n",
+              "      <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n",
+              "      <td>67.0</td>\n",
               "    </tr>\n",
               "  </tbody>\n",
               "</table>\n",
               "</div>"
             ],
             "text/plain": [
-              "   volume  numero                                head                author  \\\n",
-              "0       1       1                          Title Page              unsigned   \n",
-              "1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  Diderot & d'Alembert   \n",
-              "2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS            d'Alembert   \n",
-              "3       1       5                            A, a & a            Dumarsais5   \n",
-              "4       1       6                                   A            Dumarsais5   \n",
-              "\n",
-              "     edda_class enccre_id enccre_class  \\\n",
-              "0  unclassified       NaN          NaN   \n",
-              "1  unclassified       NaN          NaN   \n",
-              "2  unclassified       NaN          NaN   \n",
-              "3     Grammaire    v1-1-0    Grammaire   \n",
-              "4  unclassified    v1-1-1    Grammaire   \n",
-              "\n",
-              "                                             content  \\\n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...   \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...   \n",
-              "2  \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...   \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...   \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...   \n",
+              "         uid  lge-volume  lge-numero lge-head  lge-page lge-id  \\\n",
+              "0  lge_1_a-0           1           1        A         0    a-0   \n",
+              "1  lge_1_a-1           1           2        A         1    a-1   \n",
+              "2  lge_1_a-2           1           3        A         4    a-2   \n",
+              "3  lge_1_a-3           1           4        A         4    a-3   \n",
+              "4  lge_1_a-4           1           5        A         4    a-4   \n",
               "\n",
-              "                           content_without_designant  \\\n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...   \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...   \n",
-              "2  \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...   \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...   \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...   \n",
-              "\n",
-              "                                     first_paragraph  nb_words  super_domain  \n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...       151  Unclassified  \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...       208  Unclassified  \n",
-              "2       \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n     44669  Unclassified  \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...       711   Philosophie  \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...       238  Unclassified  "
+              "                                         lge-content  lge-nbWords  \n",
+              "0  A(Ling.). Son vocal et première lettre de notr...       1761.0  \n",
+              "1  A(Paléogr.). C’est à l’alphabet phénicien, on ...        839.0  \n",
+              "2  A(Log.). Cette voyelle désigne les proposition...         56.0  \n",
+              "3  A(Mus.). La lettre a est employée par les musi...        267.0  \n",
+              "4  A(Numis.). Dans la numismatique grecque, la le...         67.0  "
             ]
           },
           "execution_count": 5,
@@ -393,9 +343,7 @@
       },
       "outputs": [],
       "source": [
-        "#corpus = 'LGE'\n",
-        "corpus = ''\n",
-        "data = df['content'+corpus].values\n"
+        "data = df[corpus+'-content'].values"
       ]
     },
     {
@@ -411,7 +359,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 14,
+      "execution_count": 7,
       "metadata": {
         "id": "0qDZ86qTr0zX"
       },
@@ -426,7 +374,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
+      "execution_count": 8,
       "metadata": {
         "id": "KEljGX0br0zX"
       },
@@ -532,7 +480,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
+      "execution_count": 9,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -607,7 +555,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 11,
+      "execution_count": 10,
       "metadata": {
         "id": "-O6NspVTr0zZ"
       },
@@ -616,7 +564,7 @@
           "name": "stderr",
           "output_type": "stream",
           "text": [
-            "Token indices sequence length is longer than the specified maximum sequence length for this model (75311 > 512). Running this sequence through the model will result in indexing errors\n"
+            "Token indices sequence length is longer than the specified maximum sequence length for this model (3408 > 512). Running this sequence through the model will result in indexing errors\n"
           ]
         }
       ],
@@ -638,7 +586,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 15,
+      "execution_count": 11,
       "metadata": {
         "id": "CN8EZst-r0zZ"
       },
@@ -653,7 +601,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 16,
+      "execution_count": 12,
       "metadata": {
         "id": "_fzgS5USJeAF"
       },
@@ -664,7 +612,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 13,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -672,14 +620,1025 @@
         "id": "ISkijyclr0za",
         "outputId": "8120e858-9950-4380-f887-70ca47360c76"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[13,\n",
+              " 6,\n",
+              " 13,\n",
+              " 10,\n",
+              " 7,\n",
+              " 4,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 11,\n",
+              " 7,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 4,\n",
+              " 8,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 10,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 11,\n",
+              " 3,\n",
+              " 9,\n",
+              " 7,\n",
+              " 4,\n",
+              " 6,\n",
+              " 7,\n",
+              " 14,\n",
+              " 1,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 7,\n",
+              " 5,\n",
+              " 7,\n",
+              " 14,\n",
+              " 6,\n",
+              " 3,\n",
+              " 16,\n",
+              " 9,\n",
+              " 2,\n",
+              " 1,\n",
+              " 1,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 6,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 8,\n",
+              " 0,\n",
+              " 9,\n",
+              " 14,\n",
+              " 6,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 9,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 5,\n",
+              " 8,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 4,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 4,\n",
+              " 0,\n",
+              " 4,\n",
+              " 4,\n",
+              " 0,\n",
+              " 8,\n",
+              " 9,\n",
+              " 1,\n",
+              " 1,\n",
+              " 6,\n",
+              " 7,\n",
+              " 1,\n",
+              " 9,\n",
+              " 5,\n",
+              " 7,\n",
+              " 5,\n",
+              " 8,\n",
+              " 2,\n",
+              " 6,\n",
+              " 7,\n",
+              " 8,\n",
+              " 5,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 3,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 2,\n",
+              " 7,\n",
+              " 2,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 1,\n",
+              " 11,\n",
+              " 1,\n",
+              " 1,\n",
+              " 7,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 10,\n",
+              " 6,\n",
+              " 16,\n",
+              " 12,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 3,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 1,\n",
+              " 2,\n",
+              " 2,\n",
+              " 16,\n",
+              " 2,\n",
+              " 9,\n",
+              " 11,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 1,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 13,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 8,\n",
+              " 9,\n",
+              " 11,\n",
+              " 8,\n",
+              " 7,\n",
+              " 11,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 13,\n",
+              " 8,\n",
+              " 7,\n",
+              " 12,\n",
+              " 6,\n",
+              " 7,\n",
+              " 5,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 14,\n",
+              " 2,\n",
+              " 11,\n",
+              " 1,\n",
+              " 7,\n",
+              " 10,\n",
+              " 11,\n",
+              " 8,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 1,\n",
+              " 8,\n",
+              " 10,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 1,\n",
+              " 1,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 9,\n",
+              " 13,\n",
+              " 8,\n",
+              " 16,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 16,\n",
+              " 13,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 6,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 11,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 11,\n",
+              " 1,\n",
+              " 6,\n",
+              " 11,\n",
+              " 14,\n",
+              " 6,\n",
+              " 10,\n",
+              " 6,\n",
+              " 6,\n",
+              " 8,\n",
+              " 5,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 13,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 9,\n",
+              " 13,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 5,\n",
+              " 7,\n",
+              " 8,\n",
+              " 3,\n",
+              " 14,\n",
+              " 8,\n",
+              " 14,\n",
+              " 8,\n",
+              " 7,\n",
+              " 5,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 3,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 6,\n",
+              " 6,\n",
+              " 5,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 9,\n",
+              " 6,\n",
+              " 16,\n",
+              " 6,\n",
+              " 7,\n",
+              " 5,\n",
+              " 6,\n",
+              " 8,\n",
+              " 11,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 5,\n",
+              " 2,\n",
+              " 7,\n",
+              " 8,\n",
+              " 6,\n",
+              " 13,\n",
+              " 11,\n",
+              " 14,\n",
+              " 7,\n",
+              " 8,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 9,\n",
+              " 0,\n",
+              " 2,\n",
+              " 6,\n",
+              " 8,\n",
+              " 3,\n",
+              " 6,\n",
+              " 1,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 16,\n",
+              " 7,\n",
+              " 3,\n",
+              " 16,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 13,\n",
+              " 5,\n",
+              " 7,\n",
+              " 9,\n",
+              " 7,\n",
+              " 2,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 13,\n",
+              " 6,\n",
+              " 14,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 5,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 9,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 7,\n",
+              " 11,\n",
+              " 7,\n",
+              " 4,\n",
+              " 6,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 3,\n",
+              " 6,\n",
+              " 12,\n",
+              " 9,\n",
+              " 7,\n",
+              " 1,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 13,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 13,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 1,\n",
+              " 2,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 5,\n",
+              " 9,\n",
+              " 7,\n",
+              " 2,\n",
+              " 6,\n",
+              " 3,\n",
+              " 4,\n",
+              " 6,\n",
+              " 16,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 13,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 1,\n",
+              " 8,\n",
+              " 7,\n",
+              " 2,\n",
+              " 14,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 11,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 12,\n",
+              " 8,\n",
+              " 11,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 10,\n",
+              " 14,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 14,\n",
+              " 7,\n",
+              " 5,\n",
+              " 7,\n",
+              " 0,\n",
+              " 5,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 1,\n",
+              " 0,\n",
+              " 8,\n",
+              " 8,\n",
+              " 9,\n",
+              " 9,\n",
+              " 3,\n",
+              " 6,\n",
+              " 13,\n",
+              " 6,\n",
+              " 5,\n",
+              " 4,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 1,\n",
+              " 8,\n",
+              " 7,\n",
+              " 8,\n",
+              " 3,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 0,\n",
+              " 6,\n",
+              " 9,\n",
+              " 6,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 14,\n",
+              " 5,\n",
+              " 5,\n",
+              " 1,\n",
+              " 1,\n",
+              " 12,\n",
+              " 8,\n",
+              " 11,\n",
+              " 11,\n",
+              " 7,\n",
+              " 13,\n",
+              " 16,\n",
+              " 13,\n",
+              " 14,\n",
+              " 14,\n",
+              " 11,\n",
+              " 14,\n",
+              " 11,\n",
+              " 14,\n",
+              " 16,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 5,\n",
+              " 13,\n",
+              " 11,\n",
+              " 16,\n",
+              " 7,\n",
+              " 13,\n",
+              " 14,\n",
+              " 14,\n",
+              " 13,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 10,\n",
+              " 9,\n",
+              " 4,\n",
+              " 8,\n",
+              " 2,\n",
+              " 9,\n",
+              " 8,\n",
+              " 8,\n",
+              " 3,\n",
+              " 5,\n",
+              " 13,\n",
+              " 5,\n",
+              " 5,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 6,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 13,\n",
+              " 6,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 13,\n",
+              " 7,\n",
+              " 13,\n",
+              " 7,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 5,\n",
+              " 1,\n",
+              " 7,\n",
+              " 1,\n",
+              " 7,\n",
+              " 6,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 7,\n",
+              " 6,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 7,\n",
+              " 8,\n",
+              " 7,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 3,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 7,\n",
+              " 8,\n",
+              " 5,\n",
+              " 6,\n",
+              " 6,\n",
+              " 11,\n",
+              " 8,\n",
+              " 8,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 13,\n",
+              " 7,\n",
+              " 7,\n",
+              " 13,\n",
+              " 7,\n",
+              " 7,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 7,\n",
+              " 9,\n",
+              " 7,\n",
+              " 7,\n",
+              " 7,\n",
+              " 10,\n",
+              " 9,\n",
+              " 10,\n",
+              " 14,\n",
+              " 3,\n",
+              " 14,\n",
+              " 14,\n",
+              " 9,\n",
+              " 16,\n",
+              " 5,\n",
+              " 7,\n",
+              " 13,\n",
+              " 8,\n",
+              " 13,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 13,\n",
+              " 16,\n",
+              " 5,\n",
+              " 13,\n",
+              " 2,\n",
+              " 11,\n",
+              " 8,\n",
+              " 10,\n",
+              " 7,\n",
+              " 1,\n",
+              " 14,\n",
+              " 14,\n",
+              " 10,\n",
+              " 9,\n",
+              " 5,\n",
+              " 8,\n",
+              " 8,\n",
+              " 4,\n",
+              " 2,\n",
+              " 7,\n",
+              " 13,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 1,\n",
+              " 8,\n",
+              " 7,\n",
+              " 0,\n",
+              " 6,\n",
+              " 9,\n",
+              " 2,\n",
+              " 1,\n",
+              " 8,\n",
+              " 11,\n",
+              " 12,\n",
+              " 9,\n",
+              " 10,\n",
+              " 7,\n",
+              " 13,\n",
+              " 11,\n",
+              " 13,\n",
+              " 1,\n",
+              " 5,\n",
+              " 10,\n",
+              " 10,\n",
+              " 10,\n",
+              " 10,\n",
+              " 2,\n",
+              " 9,\n",
+              " 3,\n",
+              " 9,\n",
+              " 6,\n",
+              " 1,\n",
+              " 13,\n",
+              " 11,\n",
+              " 11,\n",
+              " 11,\n",
+              " 1,\n",
+              " 1,\n",
+              " 13,\n",
+              " 3,\n",
+              " 1,\n",
+              " 9,\n",
+              " 6,\n",
+              " 12,\n",
+              " 7,\n",
+              " 3,\n",
+              " 8,\n",
+              " 12,\n",
+              " 12,\n",
+              " 12,\n",
+              " 12,\n",
+              " 8,\n",
+              " 0,\n",
+              " 3,\n",
+              " 7,\n",
+              " 7,\n",
+              " 3,\n",
+              " 9,\n",
+              " 9,\n",
+              " 9,\n",
+              " 14,\n",
+              " 14,\n",
+              " 8,\n",
+              " 5,\n",
+              " 6,\n",
+              " 7,\n",
+              " 5,\n",
+              " 5,\n",
+              " 13,\n",
+              " 5,\n",
+              " 5,\n",
+              " 5,\n",
+              " 16,\n",
+              " 14,\n",
+              " 11,\n",
+              " 8,\n",
+              " 9,\n",
+              " 11,\n",
+              " 11,\n",
+              " 11,\n",
+              " 8,\n",
+              " 11,\n",
+              " 11,\n",
+              " 11,\n",
+              " 11,\n",
+              " 11,\n",
+              " 8,\n",
+              " 8,\n",
+              " 12,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 11,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 8,\n",
+              " 8,\n",
+              " 8,\n",
+              " 6,\n",
+              " 7,\n",
+              " 13,\n",
+              " ...]"
+            ]
+          },
+          "execution_count": 13,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
       "source": [
         "pred"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 17,
+      "execution_count": 14,
       "metadata": {
         "id": "fo6k4li1r0za"
       },
@@ -704,7 +1663,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 18,
+      "execution_count": 15,
       "metadata": {
         "id": "UU7qg7zVr0zb"
       },
@@ -715,18 +1674,18 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 19,
+      "execution_count": 16,
       "metadata": {
         "id": "w4eHpBztr0zb"
       },
       "outputs": [],
       "source": [
-        "df['superdomainBert'+corpus] = p2"
+        "df[corpus+'-superdomainBert'] = p2"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 17,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -734,14 +1693,27 @@
         "id": "KsJQMhCBxpSF",
         "outputId": "2ffa7475-e6de-4c42-a413-22c0d4b2d45f"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "ename": "AttributeError",
+          "evalue": "'DataFrame' object has no attribute 'numero'",
+          "output_type": "error",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+            "\u001b[0;32m/var/folders/qm/v_b1md29221_cnpcxf5qc43c0000gn/T/ipykernel_3552/1621721301.py\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumero\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2835\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'-content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   5900\u001b[0m         ):\n\u001b[1;32m   5901\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5902\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   5903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5904\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'numero'"
+          ]
+        }
+      ],
       "source": [
-        "df[df.numero == 2835]['content'+corpus].values"
+        "df[df[corpus+'-numero'] == 2835][corpus+'-content'].values"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 20,
+      "execution_count": 18,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -772,261 +1744,181 @@
               "  <thead>\n",
               "    <tr style=\"text-align: right;\">\n",
               "      <th></th>\n",
-              "      <th>volume</th>\n",
-              "      <th>numero</th>\n",
-              "      <th>head</th>\n",
-              "      <th>author</th>\n",
-              "      <th>edda_class</th>\n",
-              "      <th>enccre_id</th>\n",
-              "      <th>enccre_class</th>\n",
-              "      <th>content</th>\n",
-              "      <th>content_without_designant</th>\n",
-              "      <th>first_paragraph</th>\n",
-              "      <th>nb_words</th>\n",
-              "      <th>super_domain</th>\n",
-              "      <th>superdomainBert</th>\n",
+              "      <th>uid</th>\n",
+              "      <th>lge-volume</th>\n",
+              "      <th>lge-numero</th>\n",
+              "      <th>lge-head</th>\n",
+              "      <th>lge-page</th>\n",
+              "      <th>lge-id</th>\n",
+              "      <th>lge-content</th>\n",
+              "      <th>lge-nbWords</th>\n",
+              "      <th>lge-superdomainBert</th>\n",
               "    </tr>\n",
               "  </thead>\n",
               "  <tbody>\n",
               "    <tr>\n",
               "      <th>0</th>\n",
+              "      <td>lge_1_a-0</td>\n",
               "      <td>1</td>\n",
               "      <td>1</td>\n",
-              "      <td>Title Page</td>\n",
-              "      <td>unsigned</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n",
-              "      <td>151</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>A</td>\n",
+              "      <td>0</td>\n",
+              "      <td>a-0</td>\n",
+              "      <td>A(Ling.). Son vocal et première lettre de notr...</td>\n",
+              "      <td>1761.0</td>\n",
               "      <td>Philosophie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>1</th>\n",
+              "      <td>lge_1_a-1</td>\n",
               "      <td>1</td>\n",
               "      <td>2</td>\n",
-              "      <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n",
-              "      <td>Diderot &amp; d'Alembert</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n",
-              "      <td>208</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Philosophie</td>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "      <td>a-1</td>\n",
+              "      <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n",
+              "      <td>839.0</td>\n",
+              "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>2</th>\n",
+              "      <td>lge_1_a-2</td>\n",
               "      <td>1</td>\n",
               "      <td>3</td>\n",
-              "      <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n",
-              "      <td>d'Alembert</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n",
-              "      <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n",
-              "      <td>44669</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Belles-lettres</td>\n",
+              "      <td>A</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-2</td>\n",
+              "      <td>A(Log.). Cette voyelle désigne les proposition...</td>\n",
+              "      <td>56.0</td>\n",
+              "      <td>Philosophie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>3</th>\n",
+              "      <td>lge_1_a-3</td>\n",
               "      <td>1</td>\n",
-              "      <td>5</td>\n",
-              "      <td>A, a &amp; a</td>\n",
-              "      <td>Dumarsais5</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>v1-1-0</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>\\nA, a &amp; a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n",
-              "      <td>711</td>\n",
-              "      <td>Philosophie</td>\n",
-              "      <td>Philosophie</td>\n",
+              "      <td>4</td>\n",
+              "      <td>A</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-3</td>\n",
+              "      <td>A(Mus.). La lettre a est employée par les musi...</td>\n",
+              "      <td>267.0</td>\n",
+              "      <td>Musique</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>4</th>\n",
+              "      <td>lge_1_a-4</td>\n",
               "      <td>1</td>\n",
-              "      <td>6</td>\n",
+              "      <td>5</td>\n",
               "      <td>A</td>\n",
-              "      <td>Dumarsais5</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-1</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n",
-              "      <td>238</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Philosophie</td>\n",
+              "      <td>4</td>\n",
+              "      <td>a-4</td>\n",
+              "      <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n",
+              "      <td>67.0</td>\n",
+              "      <td>Histoire</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>5</th>\n",
+              "      <td>lge_1_aa-0</td>\n",
               "      <td>1</td>\n",
-              "      <td>7</td>\n",
-              "      <td>A</td>\n",
-              "      <td>Dumarsais</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-2</td>\n",
-              "      <td>Grammaire</td>\n",
-              "      <td>\\nA, préposition vient du latin à, à dextris, ...</td>\n",
-              "      <td>\\nA, préposition vient du latin à, à dextris, ...</td>\n",
-              "      <td>\\nA, préposition vient du latin à, à dextris, ...</td>\n",
-              "      <td>1980</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Philosophie</td>\n",
+              "      <td>6</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>4</td>\n",
+              "      <td>aa-0</td>\n",
+              "      <td>AA. Ces deux lettres désignent l’atelier monét...</td>\n",
+              "      <td>14.0</td>\n",
+              "      <td>Commerce</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>6</th>\n",
+              "      <td>lge_1_aa-1</td>\n",
               "      <td>1</td>\n",
-              "      <td>8</td>\n",
-              "      <td>A</td>\n",
-              "      <td>Mallet</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-3</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n",
-              "      <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n",
-              "      <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n",
-              "      <td>200</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Histoire</td>\n",
+              "      <td>7</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>4</td>\n",
+              "      <td>aa-1</td>\n",
+              "      <td>AA. Nom de plusieurs cours d’eau de l’Europe o...</td>\n",
+              "      <td>75.0</td>\n",
+              "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>7</th>\n",
+              "      <td>lge_1_aa-2</td>\n",
               "      <td>1</td>\n",
-              "      <td>9</td>\n",
-              "      <td>A, lettre symbolique</td>\n",
-              "      <td>Mallet</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-4</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n",
-              "      <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n",
-              "      <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n",
-              "      <td>82</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Histoire</td>\n",
+              "      <td>8</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-2</td>\n",
+              "      <td>AA. Rivière de France, prend sa source aux Tro...</td>\n",
+              "      <td>165.0</td>\n",
+              "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>8</th>\n",
+              "      <td>lge_1_aa-3</td>\n",
               "      <td>1</td>\n",
-              "      <td>10</td>\n",
-              "      <td>A, numismatique ou monétaire</td>\n",
-              "      <td>Mallet</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-5</td>\n",
-              "      <td>Médailles</td>\n",
-              "      <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n",
-              "      <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n",
-              "      <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n",
-              "      <td>112</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Histoire</td>\n",
+              "      <td>9</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-3</td>\n",
+              "      <td>AA. Rivière de Hollande, affluent de la Dommel...</td>\n",
+              "      <td>17.0</td>\n",
+              "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>9</th>\n",
+              "      <td>lge_1_aa-4</td>\n",
               "      <td>1</td>\n",
-              "      <td>11</td>\n",
-              "      <td>A, lapidaire</td>\n",
-              "      <td>Mallet</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-1-6</td>\n",
-              "      <td>Histoire</td>\n",
-              "      <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n",
-              "      <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n",
-              "      <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n",
-              "      <td>80</td>\n",
-              "      <td>Unclassified</td>\n",
-              "      <td>Histoire</td>\n",
+              "      <td>10</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-4</td>\n",
+              "      <td>AA. Nom de deux fleuves de la Russie. Le premi...</td>\n",
+              "      <td>71.0</td>\n",
+              "      <td>Géographie</td>\n",
               "    </tr>\n",
               "  </tbody>\n",
               "</table>\n",
               "</div>"
             ],
             "text/plain": [
-              "   volume  numero                                head                author  \\\n",
-              "0       1       1                          Title Page              unsigned   \n",
-              "1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  Diderot & d'Alembert   \n",
-              "2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS            d'Alembert   \n",
-              "3       1       5                            A, a & a            Dumarsais5   \n",
-              "4       1       6                                   A            Dumarsais5   \n",
-              "5       1       7                                   A             Dumarsais   \n",
-              "6       1       8                                   A                Mallet   \n",
-              "7       1       9                A, lettre symbolique                Mallet   \n",
-              "8       1      10        A, numismatique ou monétaire                Mallet   \n",
-              "9       1      11                        A, lapidaire                Mallet   \n",
-              "\n",
-              "     edda_class enccre_id enccre_class  \\\n",
-              "0  unclassified       NaN          NaN   \n",
-              "1  unclassified       NaN          NaN   \n",
-              "2  unclassified       NaN          NaN   \n",
-              "3     Grammaire    v1-1-0    Grammaire   \n",
-              "4  unclassified    v1-1-1    Grammaire   \n",
-              "5  unclassified    v1-1-2    Grammaire   \n",
-              "6  unclassified    v1-1-3          NaN   \n",
-              "7  unclassified    v1-1-4          NaN   \n",
-              "8  unclassified    v1-1-5    Médailles   \n",
-              "9  unclassified    v1-1-6     Histoire   \n",
-              "\n",
-              "                                             content  \\\n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...   \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...   \n",
-              "2  \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...   \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...   \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...   \n",
-              "5  \\nA, préposition vient du latin à, à dextris, ...   \n",
-              "6  \\nA, étoit une lettre numérale parmi les Ancie...   \n",
-              "7  \\nA, lettre symbolique, étoit un hiéroglyphe c...   \n",
-              "8  \\nA, numismatique ou monétaire, sur le revers ...   \n",
-              "9  \\nA, lapidaire, dans les anciennes inscription...   \n",
+              "          uid  lge-volume  lge-numero lge-head  lge-page lge-id  \\\n",
+              "0   lge_1_a-0           1           1        A         0    a-0   \n",
+              "1   lge_1_a-1           1           2        A         1    a-1   \n",
+              "2   lge_1_a-2           1           3        A         4    a-2   \n",
+              "3   lge_1_a-3           1           4        A         4    a-3   \n",
+              "4   lge_1_a-4           1           5        A         4    a-4   \n",
+              "5  lge_1_aa-0           1           6       AA         4   aa-0   \n",
+              "6  lge_1_aa-1           1           7       AA         4   aa-1   \n",
+              "7  lge_1_aa-2           1           8       AA         5   aa-2   \n",
+              "8  lge_1_aa-3           1           9       AA         5   aa-3   \n",
+              "9  lge_1_aa-4           1          10       AA         5   aa-4   \n",
               "\n",
-              "                           content_without_designant  \\\n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...   \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...   \n",
-              "2  \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...   \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...   \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...   \n",
-              "5  \\nA, préposition vient du latin à, à dextris, ...   \n",
-              "6  \\nA, étoit une lettre numérale parmi les Ancie...   \n",
-              "7  \\nA, lettre symbolique, étoit un hiéroglyphe c...   \n",
-              "8  \\nA, numismatique ou monétaire, sur le revers ...   \n",
-              "9  \\nA, lapidaire, dans les anciennes inscription...   \n",
+              "                                         lge-content  lge-nbWords  \\\n",
+              "0  A(Ling.). Son vocal et première lettre de notr...       1761.0   \n",
+              "1  A(Paléogr.). C’est à l’alphabet phénicien, on ...        839.0   \n",
+              "2  A(Log.). Cette voyelle désigne les proposition...         56.0   \n",
+              "3  A(Mus.). La lettre a est employée par les musi...        267.0   \n",
+              "4  A(Numis.). Dans la numismatique grecque, la le...         67.0   \n",
+              "5  AA. Ces deux lettres désignent l’atelier monét...         14.0   \n",
+              "6  AA. Nom de plusieurs cours d’eau de l’Europe o...         75.0   \n",
+              "7  AA. Rivière de France, prend sa source aux Tro...        165.0   \n",
+              "8  AA. Rivière de Hollande, affluent de la Dommel...         17.0   \n",
+              "9  AA. Nom de deux fleuves de la Russie. Le premi...         71.0   \n",
               "\n",
-              "                                     first_paragraph  nb_words  super_domain  \\\n",
-              "0  \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...       151  Unclassified   \n",
-              "1  \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...       208  Unclassified   \n",
-              "2       \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n     44669  Unclassified   \n",
-              "3  \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...       711   Philosophie   \n",
-              "4  \\nA, mot, est 1. la troisieme personne du prés...       238  Unclassified   \n",
-              "5  \\nA, préposition vient du latin à, à dextris, ...      1980  Unclassified   \n",
-              "6  \\nA, étoit une lettre numérale parmi les Ancie...       200  Unclassified   \n",
-              "7  \\nA, lettre symbolique, étoit un hiéroglyphe c...        82  Unclassified   \n",
-              "8  \\nA, numismatique ou monétaire, sur le revers ...       112  Unclassified   \n",
-              "9  \\nA, lapidaire, dans les anciennes inscription...        80  Unclassified   \n",
-              "\n",
-              "  superdomainBert  \n",
-              "0     Philosophie  \n",
-              "1     Philosophie  \n",
-              "2  Belles-lettres  \n",
-              "3     Philosophie  \n",
-              "4     Philosophie  \n",
-              "5     Philosophie  \n",
-              "6        Histoire  \n",
-              "7        Histoire  \n",
-              "8        Histoire  \n",
-              "9        Histoire  "
+              "  lge-superdomainBert  \n",
+              "0         Philosophie  \n",
+              "1          Géographie  \n",
+              "2         Philosophie  \n",
+              "3             Musique  \n",
+              "4            Histoire  \n",
+              "5            Commerce  \n",
+              "6          Géographie  \n",
+              "7          Géographie  \n",
+              "8          Géographie  \n",
+              "9          Géographie  "
             ]
           },
-          "execution_count": 20,
+          "execution_count": 18,
           "metadata": {},
           "output_type": "execute_result"
         }
@@ -1037,40 +1929,30 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 21,
+      "execution_count": 19,
       "metadata": {
         "id": "J9rObbvVr0zc"
       },
       "outputs": [],
       "source": [
-        "df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")"
+        "#df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")\n",
+        "df.to_csv(drive_path + \"predictions/LGE_dataset_articles_superdomainBERT_230314.tsv\", sep=\"\\t\", index=False)"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 20,
       "metadata": {
         "id": "8cX6XBq8_F5T"
       },
       "outputs": [],
       "source": [
-        "df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "7fx6BPpg0iNc"
-      },
-      "outputs": [],
-      "source": [
-        "df.to_csv(drive_path + \"predictions/metadata_parallel_predictions_superdomain.csv\", sep=\",\", index=False)"
+        "#df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 22,
+      "execution_count": 21,
       "metadata": {
         "id": "7TD1mbKj_fXH"
       },
@@ -1096,100 +1978,76 @@
               "  <thead>\n",
               "    <tr style=\"text-align: right;\">\n",
               "      <th></th>\n",
-              "      <th>volume</th>\n",
-              "      <th>numero</th>\n",
-              "      <th>head</th>\n",
-              "      <th>author</th>\n",
-              "      <th>edda_class</th>\n",
-              "      <th>enccre_id</th>\n",
-              "      <th>enccre_class</th>\n",
-              "      <th>content</th>\n",
-              "      <th>content_without_designant</th>\n",
-              "      <th>first_paragraph</th>\n",
-              "      <th>nb_words</th>\n",
-              "      <th>super_domain</th>\n",
-              "      <th>superdomainBert</th>\n",
+              "      <th>uid</th>\n",
+              "      <th>lge-volume</th>\n",
+              "      <th>lge-numero</th>\n",
+              "      <th>lge-head</th>\n",
+              "      <th>lge-page</th>\n",
+              "      <th>lge-id</th>\n",
+              "      <th>lge-content</th>\n",
+              "      <th>lge-nbWords</th>\n",
+              "      <th>lge-superdomainBert</th>\n",
               "    </tr>\n",
               "  </thead>\n",
               "  <tbody>\n",
               "    <tr>\n",
-              "      <th>24</th>\n",
+              "      <th>1</th>\n",
+              "      <td>lge_1_a-1</td>\n",
               "      <td>1</td>\n",
-              "      <td>26</td>\n",
+              "      <td>2</td>\n",
               "      <td>A</td>\n",
-              "      <td>Diderot</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-9-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
-              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
-              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
-              "      <td>15</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>1</td>\n",
+              "      <td>a-1</td>\n",
+              "      <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n",
+              "      <td>839.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>25</th>\n",
+              "      <th>6</th>\n",
+              "      <td>lge_1_aa-1</td>\n",
               "      <td>1</td>\n",
-              "      <td>27</td>\n",
+              "      <td>7</td>\n",
               "      <td>AA</td>\n",
-              "      <td>Diderot</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-10-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
-              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
-              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
-              "      <td>46</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>4</td>\n",
+              "      <td>aa-1</td>\n",
+              "      <td>AA. Nom de plusieurs cours d’eau de l’Europe o...</td>\n",
+              "      <td>75.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>27</th>\n",
+              "      <th>7</th>\n",
+              "      <td>lge_1_aa-2</td>\n",
               "      <td>1</td>\n",
-              "      <td>29</td>\n",
-              "      <td>AACH ou ACH</td>\n",
-              "      <td>Diderot</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-12-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
-              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
-              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
-              "      <td>24</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>8</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-2</td>\n",
+              "      <td>AA. Rivière de France, prend sa source aux Tro...</td>\n",
+              "      <td>165.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>28</th>\n",
+              "      <th>8</th>\n",
+              "      <td>lge_1_aa-3</td>\n",
               "      <td>1</td>\n",
-              "      <td>30</td>\n",
-              "      <td>AAHUS</td>\n",
-              "      <td>Diderot</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-13-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
-              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
-              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
-              "      <td>21</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>9</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-3</td>\n",
+              "      <td>AA. Rivière de Hollande, affluent de la Dommel...</td>\n",
+              "      <td>17.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>30</th>\n",
+              "      <th>9</th>\n",
+              "      <td>lge_1_aa-4</td>\n",
               "      <td>1</td>\n",
-              "      <td>32</td>\n",
-              "      <td>AAR</td>\n",
-              "      <td>Diderot</td>\n",
-              "      <td>unclassified</td>\n",
-              "      <td>v1-15-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
-              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
-              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
-              "      <td>30</td>\n",
-              "      <td>Unclassified</td>\n",
+              "      <td>10</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>5</td>\n",
+              "      <td>aa-4</td>\n",
+              "      <td>AA. Nom de deux fleuves de la Russie. Le premi...</td>\n",
+              "      <td>71.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
@@ -1203,176 +2061,133 @@
               "      <td>...</td>\n",
               "      <td>...</td>\n",
               "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>74051</th>\n",
-              "      <td>17</td>\n",
-              "      <td>3070</td>\n",
-              "      <td>ZYGRIS</td>\n",
-              "      <td>Jaucourt</td>\n",
-              "      <td>Géographie ancienne</td>\n",
-              "      <td>v17-2068-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\...</td>\n",
-              "      <td>\\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...</td>\n",
-              "      <td>\\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...</td>\n",
-              "      <td>38</td>\n",
-              "      <td>Géographie</td>\n",
+              "      <th>134800</th>\n",
+              "      <td>lge_31_zvornix-0</td>\n",
+              "      <td>31</td>\n",
+              "      <td>7757</td>\n",
+              "      <td>ZVORNIX</td>\n",
+              "      <td>1370</td>\n",
+              "      <td>zvornix-0</td>\n",
+              "      <td>ZVORNIX. Ville de Bosnie, sur la r. g. de la D...</td>\n",
+              "      <td>27.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>74054</th>\n",
-              "      <td>17</td>\n",
-              "      <td>3073</td>\n",
-              "      <td>ZYRAS</td>\n",
-              "      <td>Jaucourt</td>\n",
-              "      <td>Géographie ancienne</td>\n",
-              "      <td>v17-2071-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...</td>\n",
-              "      <td>\\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n",
-              "      <td>\\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n",
-              "      <td>28</td>\n",
-              "      <td>Géographie</td>\n",
+              "      <th>134801</th>\n",
+              "      <td>lge_31_zweibrücken-0</td>\n",
+              "      <td>31</td>\n",
+              "      <td>7758</td>\n",
+              "      <td>ZWEIBRÜCKEN</td>\n",
+              "      <td>1370</td>\n",
+              "      <td>zweibrücken-0</td>\n",
+              "      <td>ZWEIBRÜCKEN. Ville de Bavière (V. Deux-Ponts).\\n</td>\n",
+              "      <td>6.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>74055</th>\n",
-              "      <td>17</td>\n",
-              "      <td>3074</td>\n",
-              "      <td>ZZUÉNÉ ou ZZEUENE</td>\n",
-              "      <td>Jaucourt</td>\n",
-              "      <td>Géographie ancienne</td>\n",
-              "      <td>v17-2072-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...</td>\n",
-              "      <td>\\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...</td>\n",
-              "      <td>\\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...</td>\n",
-              "      <td>149</td>\n",
-              "      <td>Géographie</td>\n",
+              "      <th>134803</th>\n",
+              "      <td>lge_31_zwickau-0</td>\n",
+              "      <td>31</td>\n",
+              "      <td>7760</td>\n",
+              "      <td>ZWICKAU</td>\n",
+              "      <td>1370</td>\n",
+              "      <td>zwickau-0</td>\n",
+              "      <td>ZWICKAU. Ville de Saxe, ch.-l. d’un cercle, su...</td>\n",
+              "      <td>92.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>74080</th>\n",
-              "      <td>17</td>\n",
-              "      <td>3099</td>\n",
-              "      <td>CABOTAGE</td>\n",
-              "      <td>Jaucourt</td>\n",
-              "      <td>Navigation</td>\n",
-              "      <td>v17-2097-0</td>\n",
-              "      <td>Marine</td>\n",
-              "      <td>\\nCABOTAGE, s. m. (Navigation.) le cabotage es...</td>\n",
-              "      <td>\\nCABOTAGE, s. m.  le cabotage est\\nune naviga...</td>\n",
-              "      <td>\\nCABOTAGE, s. m.  le cabotage est\\nune naviga...</td>\n",
-              "      <td>192</td>\n",
-              "      <td>Géographie</td>\n",
+              "      <th>134806</th>\n",
+              "      <td>lge_31_zwolle-0</td>\n",
+              "      <td>31</td>\n",
+              "      <td>7763</td>\n",
+              "      <td>ZWOLLE</td>\n",
+              "      <td>1371</td>\n",
+              "      <td>zwolle-0</td>\n",
+              "      <td>ZWOLLE. Ville des Pays-Bas, ch.-l. de la prov....</td>\n",
+              "      <td>115.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "    <tr>\n",
-              "      <th>74165</th>\n",
-              "      <td>17</td>\n",
-              "      <td>3184</td>\n",
-              "      <td>GUAYAQUIL</td>\n",
-              "      <td>La Condamine</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>v17-2177-0</td>\n",
-              "      <td>Géographie</td>\n",
-              "      <td>\\nGUAYAQUIL, (Géograph.) nom d'une ville &amp;\\nd'...</td>\n",
-              "      <td>\\nGUAYAQUIL,  nom d'une ville &amp;\\nd'une grande ...</td>\n",
-              "      <td>\\nGUAYAQUIL,  nom d'une ville &amp;\\nd'une grande ...</td>\n",
-              "      <td>446</td>\n",
-              "      <td>Géographie</td>\n",
+              "      <th>134819</th>\n",
+              "      <td>lge_31_zyrmi-0</td>\n",
+              "      <td>31</td>\n",
+              "      <td>7776</td>\n",
+              "      <td>ZYRMI</td>\n",
+              "      <td>1372</td>\n",
+              "      <td>zyrmi-0</td>\n",
+              "      <td>ZYRMI. Ville du Soudan. Ancienne capitale du p...</td>\n",
+              "      <td>16.0</td>\n",
               "      <td>Géographie</td>\n",
               "    </tr>\n",
               "  </tbody>\n",
               "</table>\n",
-              "<p>15383 rows × 13 columns</p>\n",
+              "<p>50917 rows × 9 columns</p>\n",
               "</div>"
             ],
             "text/plain": [
-              "       volume  numero               head        author           edda_class  \\\n",
-              "24          1      26                  A       Diderot         unclassified   \n",
-              "25          1      27                 AA       Diderot         unclassified   \n",
-              "27          1      29        AACH ou ACH       Diderot         unclassified   \n",
-              "28          1      30              AAHUS       Diderot         unclassified   \n",
-              "30          1      32                AAR       Diderot         unclassified   \n",
-              "...       ...     ...                ...           ...                  ...   \n",
-              "74051      17    3070             ZYGRIS      Jaucourt  Géographie ancienne   \n",
-              "74054      17    3073              ZYRAS      Jaucourt  Géographie ancienne   \n",
-              "74055      17    3074  ZZUÉNÉ ou ZZEUENE      Jaucourt  Géographie ancienne   \n",
-              "74080      17    3099           CABOTAGE      Jaucourt           Navigation   \n",
-              "74165      17    3184          GUAYAQUIL  La Condamine           Géographie   \n",
-              "\n",
-              "        enccre_id enccre_class  \\\n",
-              "24         v1-9-0   Géographie   \n",
-              "25        v1-10-0   Géographie   \n",
-              "27        v1-12-0   Géographie   \n",
-              "28        v1-13-0   Géographie   \n",
-              "30        v1-15-0   Géographie   \n",
-              "...           ...          ...   \n",
-              "74051  v17-2068-0   Géographie   \n",
-              "74054  v17-2071-0   Géographie   \n",
-              "74055  v17-2072-0   Géographie   \n",
-              "74080  v17-2097-0       Marine   \n",
-              "74165  v17-2177-0   Géographie   \n",
-              "\n",
-              "                                                 content  \\\n",
-              "24     \\n* A, s. petite riviere de France, qui a sa s...   \n",
-              "25     \\n* AA, s. f. riviere de France, qui prend sa ...   \n",
-              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...   \n",
-              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...   \n",
-              "30     \\n* AAR, s. grande riviere qui a sa source pro...   \n",
-              "...                                                  ...   \n",
-              "74051  \\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\...   \n",
-              "74054  \\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...   \n",
-              "74055  \\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...   \n",
-              "74080  \\nCABOTAGE, s. m. (Navigation.) le cabotage es...   \n",
-              "74165  \\nGUAYAQUIL, (Géograph.) nom d'une ville &\\nd'...   \n",
-              "\n",
-              "                               content_without_designant  \\\n",
-              "24     \\n* A, s. petite riviere de France, qui a sa s...   \n",
-              "25     \\n* AA, s. f. riviere de France, qui prend sa ...   \n",
-              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...   \n",
-              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...   \n",
-              "30     \\n* AAR, s. grande riviere qui a sa source pro...   \n",
-              "...                                                  ...   \n",
-              "74051  \\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...   \n",
-              "74054  \\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...   \n",
-              "74055  \\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...   \n",
-              "74080  \\nCABOTAGE, s. m.  le cabotage est\\nune naviga...   \n",
-              "74165  \\nGUAYAQUIL,  nom d'une ville &\\nd'une grande ...   \n",
+              "                         uid  lge-volume  lge-numero     lge-head  lge-page  \\\n",
+              "1                  lge_1_a-1           1           2            A         1   \n",
+              "6                 lge_1_aa-1           1           7           AA         4   \n",
+              "7                 lge_1_aa-2           1           8           AA         5   \n",
+              "8                 lge_1_aa-3           1           9           AA         5   \n",
+              "9                 lge_1_aa-4           1          10           AA         5   \n",
+              "...                      ...         ...         ...          ...       ...   \n",
+              "134800      lge_31_zvornix-0          31        7757      ZVORNIX      1370   \n",
+              "134801  lge_31_zweibrücken-0          31        7758  ZWEIBRÜCKEN      1370   \n",
+              "134803      lge_31_zwickau-0          31        7760      ZWICKAU      1370   \n",
+              "134806       lge_31_zwolle-0          31        7763       ZWOLLE      1371   \n",
+              "134819        lge_31_zyrmi-0          31        7776        ZYRMI      1372   \n",
               "\n",
-              "                                         first_paragraph  nb_words  \\\n",
-              "24     \\n* A, s. petite riviere de France, qui a sa s...        15   \n",
-              "25     \\n* AA, s. f. riviere de France, qui prend sa ...        46   \n",
-              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...        24   \n",
-              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...        21   \n",
-              "30     \\n* AAR, s. grande riviere qui a sa source pro...        30   \n",
-              "...                                                  ...       ...   \n",
-              "74051  \\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...        38   \n",
-              "74054  \\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...        28   \n",
-              "74055  \\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...       149   \n",
-              "74080  \\nCABOTAGE, s. m.  le cabotage est\\nune naviga...       192   \n",
-              "74165  \\nGUAYAQUIL,  nom d'une ville &\\nd'une grande ...       446   \n",
+              "               lge-id                                        lge-content  \\\n",
+              "1                 a-1  A(Paléogr.). C’est à l’alphabet phénicien, on ...   \n",
+              "6                aa-1  AA. Nom de plusieurs cours d’eau de l’Europe o...   \n",
+              "7                aa-2  AA. Rivière de France, prend sa source aux Tro...   \n",
+              "8                aa-3  AA. Rivière de Hollande, affluent de la Dommel...   \n",
+              "9                aa-4  AA. Nom de deux fleuves de la Russie. Le premi...   \n",
+              "...               ...                                                ...   \n",
+              "134800      zvornix-0  ZVORNIX. Ville de Bosnie, sur la r. g. de la D...   \n",
+              "134801  zweibrücken-0   ZWEIBRÜCKEN. Ville de Bavière (V. Deux-Ponts).\\n   \n",
+              "134803      zwickau-0  ZWICKAU. Ville de Saxe, ch.-l. d’un cercle, su...   \n",
+              "134806       zwolle-0  ZWOLLE. Ville des Pays-Bas, ch.-l. de la prov....   \n",
+              "134819        zyrmi-0  ZYRMI. Ville du Soudan. Ancienne capitale du p...   \n",
               "\n",
-              "       super_domain superdomainBert  \n",
-              "24     Unclassified      Géographie  \n",
-              "25     Unclassified      Géographie  \n",
-              "27     Unclassified      Géographie  \n",
-              "28     Unclassified      Géographie  \n",
-              "30     Unclassified      Géographie  \n",
-              "...             ...             ...  \n",
-              "74051    Géographie      Géographie  \n",
-              "74054    Géographie      Géographie  \n",
-              "74055    Géographie      Géographie  \n",
-              "74080    Géographie      Géographie  \n",
-              "74165    Géographie      Géographie  \n",
+              "        lge-nbWords lge-superdomainBert  \n",
+              "1             839.0          Géographie  \n",
+              "6              75.0          Géographie  \n",
+              "7             165.0          Géographie  \n",
+              "8              17.0          Géographie  \n",
+              "9              71.0          Géographie  \n",
+              "...             ...                 ...  \n",
+              "134800         27.0          Géographie  \n",
+              "134801          6.0          Géographie  \n",
+              "134803         92.0          Géographie  \n",
+              "134806        115.0          Géographie  \n",
+              "134819         16.0          Géographie  \n",
               "\n",
-              "[15383 rows x 13 columns]"
+              "[50917 rows x 9 columns]"
+            ]
+          },
+          "execution_count": 21,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "df.loc[(df[corpus+'-superdomainBert'] == 'Géographie')]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 22,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "(134820, 9)"
             ]
           },
           "execution_count": 22,
@@ -1381,7 +2196,7 @@
         }
       ],
       "source": [
-        "df.loc[(df['superdomainBert'] == 'Géographie')]"
+        "df.shape"
       ]
     },
     {
-- 
GitLab