Commit f13251b8 authored by Ludovic Moncla

Update Predict_LGE.ipynb

parent 926002bb
%% Cell type:markdown id: tags:
# BERT Predict classification
## 1. Setup the environment
### 1.1 Setup colab environment
#### 1.1.1 Install packages
%% Cell type:code id: tags:
``` python
!pip install transformers==4.10.3
!pip install sentencepiece
```
%% Cell type:markdown id: tags:
#### 1.1.2 Use more RAM
%% Cell type:code id: tags:
``` python
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20:
    print('Not using a high-RAM runtime')
else:
    print('You are using a high-RAM runtime!')
```
%% Cell type:markdown id: tags:
#### 1.1.3 Mount Google Drive
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### 1.2 Import libraries
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
import numpy as np
import pickle
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
```
%% Cell type:markdown id: tags:
### 1.3 Setup GPU
%% Cell type:code id: tags:
``` python
# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    gpu_name = "cuda"
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# for macOS (Apple Silicon)
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    gpu_name = "mps"
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    gpu_name = "cpu"
    print('No GPU available, using the CPU instead.')
```
%% Output
We will use the GPU
%% Cell type:markdown id: tags:
## 2. Utils
%% Cell type:code id: tags:
``` python
def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
            sent,                      # Sentence to encode.
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
            # `encode` also supports truncation and conversion to PyTorch
            # tensors, but padding is handled manually below, so these
            # options are not used here.
            #max_length = max_len,     # Truncate all sentences.
            #return_tensors = 'pt',    # Return PyTorch tensors.
        )
        input_ids_test.append(encoded_sent)

    # Pad (or truncate) our input tokens to max_len
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.append(i[:max_len])
        else:
            padded_test.append(i + [0] * (max_len - len(i)))
    input_ids_test = np.array(padded_test)

    # Create attention masks: 1 for each real token, 0 for padding
    attention_masks = []
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader.
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)
    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)


def predict(model, dataloader, device):
    # Put model in evaluation mode
    model.eval()
    # Tracking variables
    predictions_test = []
    pred_labels_ = []
    # Predict
    for batch in dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits to CPU
        logits = logits.detach().cpu().numpy()
        # Store the predictions
        predictions_test.append(logits)

    pred_labels = []
    for i in range(len(predictions_test)):
        # The predictions for this batch are an (n_samples, n_classes)
        # ndarray of logits. Pick the label with the highest value.
        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
        pred_labels.append(pred_labels_i)
    pred_labels_ += [item for sublist in pred_labels for item in sublist]
    return pred_labels_


def text_folder_to_dataframe(path):
    data = []
    # columns: id, tome, filename, content, nb_words
    for tome in sorted(os.listdir(path)):
        try:
            for article in tqdm(sorted(os.listdir(path + "/" + tome))):
                filename = article[:-4]
                id = tome + filename
                if article[-4:] == ".txt":
                    with open(path + "/" + tome + "/" + article) as f:
                        content = f.read()
                    data.append([id, tome, filename, content, len(content.split(' '))])
        except NotADirectoryError:
            pass
    return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
```
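%% Cell type:markdown id: tags:
A minimal usage sketch of the two helpers above, on a toy list of sentences (an illustration, not part of the original pipeline; it assumes `tokenizer`, `model`, and `device` have already been created as in the sections below):
%% Cell type:code id: tags:
``` python
# Hypothetical example: build a DataLoader from two short sentences
# and run the loaded classifier on them.
sample_sentences = ["Premier article de test.", "Second article de test."]
sample_loader = generate_dataloader(tokenizer, sample_sentences, batch_size=2, max_len=32)
sample_preds = predict(model, sample_loader, device)  # list of predicted class indices
```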
%% Cell type:markdown id: tags:
## 3. Load Data
%% Cell type:markdown id: tags:
### 3.1 LGE (Nakala)
%% Cell type:code id: tags:
``` python
!wget https://api.nakala.fr/data/10.34847/nkl.74eb1xfd/e522413b58b04ab7c283f8fa68642e9cb69ab5c5
```
%% Cell type:code id: tags:
``` python
!unzip e522413b58b04ab7c283f8fa68642e9cb69ab5c5
```
%% Cell type:code id: tags:
``` python
# Local path (when running outside Colab):
#input_path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text"
input_path = "./Text"
```
%% Cell type:code id: tags:
``` python
df_LGE = text_folder_to_dataframe(input_path)
#df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Output
id tome rank domain remark \
0 abrabeses-0 1 623 geography NaN
1 accius-0 1 1076 biography NaN
2 achenbach-2 1 1357 biography NaN
3 acireale-0 1 1513 geography NaN
4 actée-0 1 1731 botany NaN
content
0 ABRABESES. Village d’Espagne de la prov. de Za...
1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...
2 ACHENBACH(Henri), administrateur prussien, né ...
3 ACIREALE. Yille de Sicile, de la province et d...
4 ACTÉE(Actœa L.). Genre de plantes de la famill...
%% Cell type:code id: tags:
``` python
df_LGE.shape
```
%% Output
(310, 6)
%% Cell type:markdown id: tags:
## 4. Load model and predict
### 4.1 BERT / CamemBERT
%% Cell type:code id: tags:
``` python
#path = "drive/MyDrive/Classification-EDdA/"
path = "../"
model_name = "bert-base-multilingual-cased"
#model_name = "camembert-base"
model_path = path + "models/model_" + model_name + "_s10000.pt"
```
%% Cell type:code id: tags:
``` python
if model_name == 'bert-base-multilingual-cased':
    print('Loading Bert Tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base':
    print('Loading Camembert Tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
```
%% Output
Loading Bert Tokenizer...
%% Cell type:code id: tags:
``` python
data_loader = generate_dataloader(tokenizer, data_LGE)
```
%% Output
Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors
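%% Cell type:markdown id: tags:
This warning is expected: some LGE articles exceed the model's 512-token limit, and `generate_dataloader` truncates the token-id lists itself after encoding, so no indexing error actually occurs. A minimal alternative sketch (an assumption, not what this notebook does) would let the tokenizer truncate directly and silence the warning:
%% Cell type:code id: tags:
``` python
# Hypothetical variant: truncate at tokenization time instead of afterwards.
encoded = tokenizer.encode(
    data_LGE[0],              # first LGE article
    add_special_tokens=True,  # add '[CLS]' and '[SEP]'
    max_length=512,           # hard limit of the BERT model
    truncation=True,          # cut longer articles instead of warning
)
```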
%% Cell type:code id: tags:
``` python
model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name) #.to("cuda")
```
%% Cell type:code id: tags:
``` python
pred = predict(model, data_loader, device)
```
%% Cell type:code id: tags:
``` python
encoder_filename = "models/label_encoder.pkl"
with open(path + encoder_filename, 'rb') as file:
encoder = pickle.load(file)
```
%% Cell type:code id: tags:
``` python
p2 = list(encoder.inverse_transform(pred))
```
%% Cell type:code id: tags:
``` python
df_LGE['domain'] = p2
```
%% Cell type:code id: tags:
``` python
df_LGE.head(50)
```
%% Output
id tome rank domain remark \
0 abrabeses-0 1 623 geography NaN
1 accius-0 1 1076 biography NaN
2 achenbach-2 1 1357 biography NaN
3 acireale-0 1 1513 geography NaN
4 actée-0 1 1731 botany NaN
5 adulteration-0 1 2197 NaN cross reference
6 aérides-0 1 2334 botany NaN
7 ager-0 1 2710 biography NaN
8 aigu-1 1 3160 NaN cross reference
9 alavika-0 1 3664 theology NaN
10 allassac-0 2 755 geography NaN
11 allegretto-0 2 786 NaN cross reference
12 alleuze-0 2 908 geography NaN
13 alliat-0 2 933 geography NaN
14 amanty-0 2 1651 geography NaN
15 âmasserah-0 2 1701 geography explicit domain
16 a-118 2 2971 history NaN
17 androclès-0 2 3261 mythology explicit domain
18 anfouson-0 2 3394 zoology NaN
19 anicet-bourgeois-0 2 3717 biography NaN
20 anomalistique-0 3 238 astronomy explicit domain
21 anostostome-0 3 298 zoology NaN
22 anthoxanthème-0 3 571 chemistry NaN
23 aod-0 3 1024 theology NaN
24 aphellan-0 3 1177 astronomy NaN
25 appelle-0 3 1494 geography NaN
26 aragona-1 3 1841 biography NaN
27 araujuzon-0 3 1940 geography NaN
28 ardant-0 3 2421 biography NaN
29 ariano-0 3 2839 geography NaN
30 athabaska-0 4 1118 anthropology NaN
31 aslonnes-0 4 446 geography NaN
32 astr0rh1za-0 4 992 zoology explicit domain
33 atthidographes-0 4 1397 NaN cross reference
34 aubery-2 4 1577 biography NaN
35 aula-0 4 1992 history NaN
36 au-113 4 2112 botany explicit domain
37 auriol-4 4 2224 NaN cross reference
38 ave-lalleniant-0 4 2739 biography NaN
39 badin-2 4 3857 biography NaN
40 baizieux-0 5 133 geography NaN
41 balsam1te-0 5 677 botany explicit domain
42 balze-0 5 757 navy explicit domain
43 bande-2 5 880 history NaN
44 barbosa-5 5 1580 biography NaN
45 bati-0 5 2955 architecture NaN
46 baveuse-0 5 3457 zoology explicit domain
47 beard-2 5 3728 biography NaN
48 beaufort-4 5 3838 geography NaN
49 beaumont-26 5 4018 biography NaN
content \
0 ABRABESES. Village d’Espagne de la prov. de Za...
1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...
2 ACHENBACH(Henri), administrateur prussien, né ...
3 ACIREALE. Yille de Sicile, de la province et d...
4 ACTÉE(Actœa L.). Genre de plantes de la famill...
5 ADULTERATION. Altération d’un médicament, d’un...
6 AÉRIDES{Aérides Lour.). Genres de plantes de l...
7 AGERouAGERIUS (Nicolaus), médecin alsacien, né...
8 AIGU1 LH E (V. Raimond d’).\n
9 ALAVIKA« qui est d'Alava »(V. ce mot) : Bhikch...
10 ALLASSAC. Com. du dép. de la Corrèze, arr. de ...
11 ALLEGRETTO(V. Allegro).\n
12 ALLEUZE. Com. du dép. du Cantal, arr. et cant....
13 ALLIAT. Com. du dép. de l’Ariège, arr. de Foix...
14 AMANTY. Corn, du dép. de la Meuse, arr. de Com...
15 ÂMASSERAH, AMASR1 ou AMASRAH (Géogr.). Ville d...
16 AN Cl LIA. Boucliers sacrés des Romains, au no...
17 ANDROCLÈS(Myth.), un fils d’Eole qui régna sur...
18 ANFOUSON. Nom donné à Nice au Néron brun\n(V. ...
19 ANICET-BOURGEOIS(Auguste Anicet, connu sous le...
20 ANOMALISTIQUE(Astron.). On appelle révolution\...
21 ANOSTOSTOME(Anostostoma Gray). Genre d’insecte...
22 ANTHOXANTHÈME. L’un des deux principes coloran...
23 AOD, plus exactement Ehoud. personnage des com...
24 APHELLAN(Astron.). Un des noms de l’étoile a2 ...
25 APPELLE. Com. du dép. du Tarn, arr. de Lavaux,...
26 ARAGONA, cardinal d’origine sicilienne, né en ...
27 ARAUJUZON. Com. du dép. des Basses-Pyrénées, a...
28 ARDANT(Paul-Joseph), général français, né en 1...
29 ARIANOdi Puglia. Ville de la prov. de principa...
30 ATHABASKA. Col, rivière, lac, territoire et fa...
31 ASLONNES, corn, du dép. de la Vienne, arr. de ...
32 ASTR0RH1ZA(Zool.).Genre deForaminifèresimperfo...
33 ATTHIDOGRAPHES(V. Atthide).\n
34 AUBERY(Antoine;, historien français, né le .18...
35 AULA. Mot latin signifiant cour, lieu découver...
36 AUNÉE (bot.). L'Aunée, Grande Année, Année off...
37 AURIOL. Nom donné à Marseille au Maquereau (V....
38 AVE-LALLENIANT(Robert-Christian-Barthold), méd...
39 BADIN(Pierre-Adolphe), peintre français, né à ...
40 BAIZIEUX(Bacium, Basium). Com. du dép. de la\n...
41 BALSAM1TE(Bot.) (Balsamita Desf.). Genre de Co...
42 BALZE(Mar.). Radeau delà côte occidentale de l...
43 BANDE(Ordre delà) ou de l’ECHARPE.Ordre milita...
44 BARBOSA(Antonio), jésuite et orientaliste port...
45 BATIÈRE. Toit en forme de bât se terminant à c...
46 BAVEUSE(Zool.). Nom vulgaire par lequel les\np...
47 BEARD(James-Henry), peintre américain contempo...
48 BEAUFORT. Com. du dép. de la Meuse, arr. de Mo...
49 BEAUMONT(J.-G. Leprevôt de), secrétaire du cle...
class_bert
0 Géographie
1 Belles-lettres - Poésie
2 Histoire
3 Géographie
4 Histoire naturelle
5 Chimie
6 Histoire naturelle
7 Histoire
8 Marine
9 Religion
10 Géographie
11 Musique
12 Géographie
13 Géographie
14 Géographie
15 Géographie
16 Antiquité
17 Antiquité
18 Histoire naturelle
19 Belles-lettres - Poésie
20 Physique - [Sciences physico-mathématiques]
21 Histoire naturelle
22 Pharmacie
23 Histoire
24 Physique - [Sciences physico-mathématiques]
25 Géographie
26 Religion
27 Géographie
28 Militaire (Art) - Guerre - Arme
29 Géographie
30 Géographie
31 Géographie
32 Histoire naturelle
33 Géographie
34 Histoire
35 Architecture
36 Histoire naturelle
37 Histoire naturelle
38 Histoire
39 Arts et métiers
40 Géographie
41 Histoire naturelle
42 Marine
43 Histoire
44 Religion
45 Architecture
46 Histoire naturelle
47 Beaux-arts
48 Géographie
49 Histoire
%% Cell type:code id: tags:
``` python
filepath = path + "results_LGE/LGE-metadata-withContent.csv"
df_LGE.to_csv(filepath, sep="\,")
```
%% Cell type:code id: tags:
``` python
df_LGE.drop(columns=['content'], inplace=True)
filepath = path + "results_LGE/LGE-metadata.csv"
df_LGE.to_csv(filepath, sep="\,")
```
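%% Cell type:markdown id: tags:
Optional sanity check (a sketch, assuming the two files above were written): reload the exported metadata and confirm the predicted `domain` column is present.
%% Cell type:code id: tags:
``` python
# Hypothetical check: read the exported metadata back and inspect it.
df_check = pd.read_csv(path + "results_LGE/LGE-metadata.csv", sep=",")
print(df_check.shape)
print(df_check['domain'].value_counts().head())
```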