Update Predict.ipynb

e6ada53c · Ludovic Moncla · 3bba3205 · e6ada53c
Commit e6ada53c authored 2 years ago by Ludovic Moncla
--- a/notebooks/Predict.ipynb
+++ b/notebooks/Predict.ipynb
@@ -1070,10 +1070,324 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 22,
      "metadata": {
        "id": "7TD1mbKj_fXH"
      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>volume</th>\n",
+              "      <th>numero</th>\n",
+              "      <th>head</th>\n",
+              "      <th>author</th>\n",
+              "      <th>edda_class</th>\n",
+              "      <th>enccre_id</th>\n",
+              "      <th>enccre_class</th>\n",
+              "      <th>content</th>\n",
+              "      <th>content_without_designant</th>\n",
+              "      <th>first_paragraph</th>\n",
+              "      <th>nb_words</th>\n",
+              "      <th>super_domain</th>\n",
+              "      <th>superdomainBert</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>24</th>\n",
+              "      <td>1</td>\n",
+              "      <td>26</td>\n",
+              "      <td>A</td>\n",
+              "      <td>Diderot</td>\n",
+              "      <td>unclassified</td>\n",
+              "      <td>v1-9-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
+              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
+              "      <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n",
+              "      <td>15</td>\n",
+              "      <td>Unclassified</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>25</th>\n",
+              "      <td>1</td>\n",
+              "      <td>27</td>\n",
+              "      <td>AA</td>\n",
+              "      <td>Diderot</td>\n",
+              "      <td>unclassified</td>\n",
+              "      <td>v1-10-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
+              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
+              "      <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n",
+              "      <td>46</td>\n",
+              "      <td>Unclassified</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>27</th>\n",
+              "      <td>1</td>\n",
+              "      <td>29</td>\n",
+              "      <td>AACH ou ACH</td>\n",
+              "      <td>Diderot</td>\n",
+              "      <td>unclassified</td>\n",
+              "      <td>v1-12-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
+              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
+              "      <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n",
+              "      <td>24</td>\n",
+              "      <td>Unclassified</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>28</th>\n",
+              "      <td>1</td>\n",
+              "      <td>30</td>\n",
+              "      <td>AAHUS</td>\n",
+              "      <td>Diderot</td>\n",
+              "      <td>unclassified</td>\n",
+              "      <td>v1-13-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
+              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
+              "      <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n",
+              "      <td>21</td>\n",
+              "      <td>Unclassified</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>30</th>\n",
+              "      <td>1</td>\n",
+              "      <td>32</td>\n",
+              "      <td>AAR</td>\n",
+              "      <td>Diderot</td>\n",
+              "      <td>unclassified</td>\n",
+              "      <td>v1-15-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
+              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
+              "      <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n",
+              "      <td>30</td>\n",
+              "      <td>Unclassified</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>74051</th>\n",
+              "      <td>17</td>\n",
+              "      <td>3070</td>\n",
+              "      <td>ZYGRIS</td>\n",
+              "      <td>Jaucourt</td>\n",
+              "      <td>Géographie ancienne</td>\n",
+              "      <td>v17-2068-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\...</td>\n",
+              "      <td>\\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...</td>\n",
+              "      <td>\\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...</td>\n",
+              "      <td>38</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>74054</th>\n",
+              "      <td>17</td>\n",
+              "      <td>3073</td>\n",
+              "      <td>ZYRAS</td>\n",
+              "      <td>Jaucourt</td>\n",
+              "      <td>Géographie ancienne</td>\n",
+              "      <td>v17-2071-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...</td>\n",
+              "      <td>\\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n",
+              "      <td>\\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n",
+              "      <td>28</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>74055</th>\n",
+              "      <td>17</td>\n",
+              "      <td>3074</td>\n",
+              "      <td>ZZUÉNÉ ou ZZEUENE</td>\n",
+              "      <td>Jaucourt</td>\n",
+              "      <td>Géographie ancienne</td>\n",
+              "      <td>v17-2072-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...</td>\n",
+              "      <td>\\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...</td>\n",
+              "      <td>\\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...</td>\n",
+              "      <td>149</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>74080</th>\n",
+              "      <td>17</td>\n",
+              "      <td>3099</td>\n",
+              "      <td>CABOTAGE</td>\n",
+              "      <td>Jaucourt</td>\n",
+              "      <td>Navigation</td>\n",
+              "      <td>v17-2097-0</td>\n",
+              "      <td>Marine</td>\n",
+              "      <td>\\nCABOTAGE, s. m. (Navigation.) le cabotage es...</td>\n",
+              "      <td>\\nCABOTAGE, s. m.  le cabotage est\\nune naviga...</td>\n",
+              "      <td>\\nCABOTAGE, s. m.  le cabotage est\\nune naviga...</td>\n",
+              "      <td>192</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>74165</th>\n",
+              "      <td>17</td>\n",
+              "      <td>3184</td>\n",
+              "      <td>GUAYAQUIL</td>\n",
+              "      <td>La Condamine</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>v17-2177-0</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>\\nGUAYAQUIL, (Géograph.) nom d'une ville &amp;\\nd'...</td>\n",
+              "      <td>\\nGUAYAQUIL,  nom d'une ville &amp;\\nd'une grande ...</td>\n",
+              "      <td>\\nGUAYAQUIL,  nom d'une ville &amp;\\nd'une grande ...</td>\n",
+              "      <td>446</td>\n",
+              "      <td>Géographie</td>\n",
+              "      <td>Géographie</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>15383 rows × 13 columns</p>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "       volume  numero               head        author           edda_class  \\\n",
+              "24          1      26                  A       Diderot         unclassified   \n",
+              "25          1      27                 AA       Diderot         unclassified   \n",
+              "27          1      29        AACH ou ACH       Diderot         unclassified   \n",
+              "28          1      30              AAHUS       Diderot         unclassified   \n",
+              "30          1      32                AAR       Diderot         unclassified   \n",
+              "...       ...     ...                ...           ...                  ...   \n",
+              "74051      17    3070             ZYGRIS      Jaucourt  Géographie ancienne   \n",
+              "74054      17    3073              ZYRAS      Jaucourt  Géographie ancienne   \n",
+              "74055      17    3074  ZZUÉNÉ ou ZZEUENE      Jaucourt  Géographie ancienne   \n",
+              "74080      17    3099           CABOTAGE      Jaucourt           Navigation   \n",
+              "74165      17    3184          GUAYAQUIL  La Condamine           Géographie   \n",
+              "\n",
+              "        enccre_id enccre_class  \\\n",
+              "24         v1-9-0   Géographie   \n",
+              "25        v1-10-0   Géographie   \n",
+              "27        v1-12-0   Géographie   \n",
+              "28        v1-13-0   Géographie   \n",
+              "30        v1-15-0   Géographie   \n",
+              "...           ...          ...   \n",
+              "74051  v17-2068-0   Géographie   \n",
+              "74054  v17-2071-0   Géographie   \n",
+              "74055  v17-2072-0   Géographie   \n",
+              "74080  v17-2097-0       Marine   \n",
+              "74165  v17-2177-0   Géographie   \n",
+              "\n",
+              "                                                 content  \\\n",
+              "24     \\n* A, s. petite riviere de France, qui a sa s...   \n",
+              "25     \\n* AA, s. f. riviere de France, qui prend sa ...   \n",
+              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...   \n",
+              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...   \n",
+              "30     \\n* AAR, s. grande riviere qui a sa source pro...   \n",
+              "...                                                  ...   \n",
+              "74051  \\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\...   \n",
+              "74054  \\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...   \n",
+              "74055  \\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...   \n",
+              "74080  \\nCABOTAGE, s. m. (Navigation.) le cabotage es...   \n",
+              "74165  \\nGUAYAQUIL, (Géograph.) nom d'une ville &\\nd'...   \n",
+              "\n",
+              "                               content_without_designant  \\\n",
+              "24     \\n* A, s. petite riviere de France, qui a sa s...   \n",
+              "25     \\n* AA, s. f. riviere de France, qui prend sa ...   \n",
+              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...   \n",
+              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...   \n",
+              "30     \\n* AAR, s. grande riviere qui a sa source pro...   \n",
+              "...                                                  ...   \n",
+              "74051  \\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...   \n",
+              "74054  \\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...   \n",
+              "74055  \\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...   \n",
+              "74080  \\nCABOTAGE, s. m.  le cabotage est\\nune naviga...   \n",
+              "74165  \\nGUAYAQUIL,  nom d'une ville &\\nd'une grande ...   \n",
+              "\n",
+              "                                         first_paragraph  nb_words  \\\n",
+              "24     \\n* A, s. petite riviere de France, qui a sa s...        15   \n",
+              "25     \\n* AA, s. f. riviere de France, qui prend sa ...        46   \n",
+              "27     \\n* AACH ou ACH, s. f. petite ville d'Allemagn...        24   \n",
+              "28     \\n* AAHUS, s. petite ville d'Allemagne dans le...        21   \n",
+              "30     \\n* AAR, s. grande riviere qui a sa source pro...        30   \n",
+              "...                                                  ...       ...   \n",
+              "74051  \\nZYGRIS,  ville du nôme de Lybie\\nsur la côte...        38   \n",
+              "74054  \\nZYRAS,  fleuve de Thrace. Pline,\\nliv. IV. c...        28   \n",
+              "74055  \\nZZUÉNÉ ou ZZEUENE,  ville située\\nsur la riv...       149   \n",
+              "74080  \\nCABOTAGE, s. m.  le cabotage est\\nune naviga...       192   \n",
+              "74165  \\nGUAYAQUIL,  nom d'une ville &\\nd'une grande ...       446   \n",
+              "\n",
+              "       super_domain superdomainBert  \n",
+              "24     Unclassified      Géographie  \n",
+              "25     Unclassified      Géographie  \n",
+              "27     Unclassified      Géographie  \n",
+              "28     Unclassified      Géographie  \n",
+              "30     Unclassified      Géographie  \n",
+              "...             ...             ...  \n",
+              "74051    Géographie      Géographie  \n",
+              "74054    Géographie      Géographie  \n",
+              "74055    Géographie      Géographie  \n",
+              "74080    Géographie      Géographie  \n",
+              "74165    Géographie      Géographie  \n",
+              "\n",
+              "[15383 rows x 13 columns]"
+            ]
+          },
+          "execution_count": 22,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "df.loc[(df['superdomainBert'] == 'Géographie')]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
      "outputs": [],
      "source": []
    }

 %% Cell type:markdown id: tags:
 # BERT Predict classification
 ## 1. Setup the environment
 ### 1.1 Setup colab environment
 #### 1.1.1 Install packages
 %% Cell type:code id: tags:
 ``` python
 !pip install transformers==4.10.3
 !pip install sentencepiece
 ```
 %% Cell type:markdown id: tags:
 #### 1.1.2 Use more RAM
 %% Cell type:code id: tags:
 ``` python
 from psutil import virtual_memory
 ram_gb = virtual_memory().total / 1e9
 print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
 if ram_gb < 20:
  print('Not using a high-RAM runtime')
 else:
  print('You are using a high-RAM runtime!')
 ```
 %% Cell type:markdown id: tags:
 #### 1.1.3 Mount GoogleDrive
 %% Cell type:code id: tags:
 ``` python
 from google.colab import drive
 drive.mount('/content/drive')
 ```
 %% Cell type:markdown id: tags:
 ### 1.2 Setup GPU
 %% Cell type:code id: tags:
 ``` python
 import torch
 # If there's a GPU available...
 if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
 # for MacOS
 elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
 else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
 ```
 %% Output
    We will use the GPU
 %% Cell type:markdown id: tags:
 ### 1.3 Import libraries
 %% Cell type:code id: tags:
 ``` python
 import pandas as pd
 import numpy as np
 from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 ```
 %% Cell type:markdown id: tags:
 ## 2. Load Data
 %% Cell type:code id: tags:
 ``` python
 #drive_path = "drive/MyDrive/Classification-EDdA/"
 drive_path = "../"
 path = "/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/"
 ```
 %% Cell type:code id: tags:
 ``` python
 #!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv
 #!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv
 !wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv
 ```
 %% Cell type:code id: tags:
 ``` python
 #filepath = "data/LGE_withContent.tsv"
 #filepath = "EDdA_dataset_articles_no_superdomain.tsv"
 #filepath = "Parallel_datatset_articles_230215.tsv"
 filepath = "EDdA_dataset_articles.tsv"
 ```
 %% Cell type:code id: tags:
 ``` python
 df = pd.read_csv(path + filepath, sep="\t")
 df.head()
 ```
 %% Output
       volume  numero                                head                author  \
    0       1       1                          Title Page              unsigned
    1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  Diderot & d'Alembert
    2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS            d'Alembert
    3       1       5                            A, a & a            Dumarsais5
    4       1       6                                   A            Dumarsais5
         edda_class enccre_id enccre_class  \
    0  unclassified       NaN          NaN
    1  unclassified       NaN          NaN
    2  unclassified       NaN          NaN
    3     Grammaire    v1-1-0    Grammaire
    4  unclassified    v1-1-1    Grammaire
                                                 content  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
                               content_without_designant  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
                                         first_paragraph  nb_words  super_domain
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...       151  Unclassified
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...       208  Unclassified
    2       \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n     44669  Unclassified
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...       711   Philosophie
    4  \nA, mot, est 1. la troisieme personne du prés...       238  Unclassified
 %% Cell type:code id: tags:
 ``` python
 #corpus = 'LGE'
 corpus = ''
 data = df['content'+corpus].values
 ```
 %% Cell type:markdown id: tags:
 ## 3. Load model and predict
 ### 3.1 BERT / CamemBERT
 %% Cell type:code id: tags:
 ``` python
 model_name = "bert-base-multilingual-cased"
 #model_name = "camembert-base"
 #model_path = path + "models/model_" + model_name + "_s10000.pt"
 model_path = drive_path + "models/model_" + model_name + "_s10000_superdomains.pt"
 ```
 %% Cell type:code id: tags:
 ``` python
 def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):
    """Turn raw texts into a sequential DataLoader of (input_ids, attention_mask).

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of integer token ids.
        sentences: iterable of texts to encode.
        batch_size: number of rows per batch produced by the DataLoader.
        max_len: fixed length every id sequence is clipped or padded to.

    Returns:
        A DataLoader over a TensorDataset of two tensors (token ids and the
        matching mask), iterated in input order.
    """
    # Encode each text in full. The tokenizer adds the special tokens itself;
    # its own truncation / tensor conversion is not used because the padding
    # is done by hand just below.
    token_rows = [
        tokenizer.encode(text, add_special_tokens = True)
        for text in sentences
    ]

    # Bring every row to exactly max_len ids: long rows are cut at max_len
    # (which also cuts whatever the tokenizer appended at the end), short rows
    # are right-padded with 0.
    fixed_rows = [
        row[:max_len] if len(row) > max_len else row + [0] * (max_len - len(row))
        for row in token_rows
    ]
    id_matrix = np.array(fixed_rows)

    # Attention mask: 1.0 wherever the id is positive, 0.0 on the 0-padding.
    mask_matrix = [[float(token_id > 0) for token_id in row] for row in id_matrix]

    # Wrap both tensors in a dataset that is read front to back.
    dataset = TensorDataset(torch.tensor(id_matrix), torch.tensor(mask_matrix))
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
 def predict(model, dataloader, device):
    """Run the classifier over every batch and return the predicted class ids.

    Args:
        model: object exposing ``eval()`` and callable as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)``;
            the first element of what it returns is taken as the logits.
        dataloader: iterable yielding ``(input_ids, attention_mask)`` tensor
            pairs.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A flat list with one predicted class index per input row, in the
        order the dataloader yields them. An empty dataloader yields ``[]``.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    for batch in dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

        # No gradients are needed for inference; skipping them saves memory
        # and time.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a numpy array.
        logits = outputs[0].detach().cpu().numpy()

        # Pick the highest-scoring class of each row of *this* batch only;
        # earlier batches are already in the result list and must not be
        # recomputed.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())
    return pred_labels_
 ```
 %% Cell type:code id: tags:
 ``` python
 if model_name == 'bert-base-multilingual-cased' :
    print('Loading Bert Tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(model_name)
 elif model_name == 'camembert-base':
    print('Loading Camembert Tokenizer...')
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
 ```
 %% Output
    Loading Bert Tokenizer...
 %% Cell type:code id: tags:
 ``` python
 data_loader = generate_dataloader(tokenizer, data)
 ```
 %% Output
    Token indices sequence length is longer than the specified maximum sequence length for this model (75311 > 512). Running this sequence through the model will result in indexing errors
 %% Cell type:markdown id: tags:
 https://discuss.huggingface.co/t/an-efficient-way-of-loading-a-model-that-was-saved-with-torch-save/9814
 https://github.com/huggingface/transformers/issues/2094
 %% Cell type:code id: tags:
 ``` python
 #model = torch.load(model_path, map_location=torch.device('mps'))
 #model.load_state_dict(torch.load(model_path, map_location=torch.device('mps')))
 # Pick the model class that matches `model_name` (same switch as the tokenizer
 # cell) and move it to the `device` chosen in "1.2 Setup GPU" instead of a
 # hard-coded backend: predict() sends every batch to `device`, so the model
 # has to live on that same device.
 model_class = CamembertForSequenceClassification if model_name == 'camembert-base' else BertForSequenceClassification
 model = model_class.from_pretrained(model_path).to(device)
 ```
 %% Cell type:code id: tags:
 ``` python
 pred = predict(model, data_loader, device)
 ```
 %% Cell type:code id: tags:
 ``` python
 pred
 ```
 %% Cell type:code id: tags:
 ``` python
 import pickle
 # Load the pickled label encoder; a later cell calls its inverse_transform()
 # on the predicted class ids to get the label names back.
 #encoder_filename = "models/label_encoder.pkl"
 encoder_filename = "models/label_encoder_superdomains.pkl"
 # NOTE(review): pickle.load can execute arbitrary code while unpickling, so
 # only point this at files you produced yourself. The captured output of this
 # cell also shows a scikit-learn version-mismatch warning for this file —
 # re-exporting the encoder with the installed version would remove it.
 with open(drive_path + encoder_filename, 'rb') as file:
      encoder = pickle.load(file)
 ```
 %% Output
    /opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator LabelEncoder from version 1.0.2 when using version 1.1.3. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
    https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
      warnings.warn(
 %% Cell type:code id: tags:
 ``` python
 p2 = list(encoder.inverse_transform(pred))
 ```
 %% Cell type:code id: tags:
 ``` python
 df['superdomainBert'+corpus] = p2
 ```
 %% Cell type:code id: tags:
 ``` python
 df[df.numero == 2835]['content'+corpus].values
 ```
 %% Cell type:code id: tags:
 ``` python
 df.head(10)
 ```
 %% Output
       volume  numero                                head                author  \
    0       1       1                          Title Page              unsigned
    1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  Diderot & d'Alembert
    2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS            d'Alembert
    3       1       5                            A, a & a            Dumarsais5
    4       1       6                                   A            Dumarsais5
    5       1       7                                   A             Dumarsais
    6       1       8                                   A                Mallet
    7       1       9                A, lettre symbolique                Mallet
    8       1      10        A, numismatique ou monétaire                Mallet
    9       1      11                        A, lapidaire                Mallet
         edda_class enccre_id enccre_class  \
    0  unclassified       NaN          NaN
    1  unclassified       NaN          NaN
    2  unclassified       NaN          NaN
    3     Grammaire    v1-1-0    Grammaire
    4  unclassified    v1-1-1    Grammaire
    5  unclassified    v1-1-2    Grammaire
    6  unclassified    v1-1-3          NaN
    7  unclassified    v1-1-4          NaN
    8  unclassified    v1-1-5    Médailles
    9  unclassified    v1-1-6     Histoire
                                                 content  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
    5  \nA, préposition vient du latin à, à dextris, ...
    6  \nA, étoit une lettre numérale parmi les Ancie...
    7  \nA, lettre symbolique, étoit un hiéroglyphe c...
    8  \nA, numismatique ou monétaire, sur le revers ...
    9  \nA, lapidaire, dans les anciennes inscription...
                               content_without_designant  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
    5  \nA, préposition vient du latin à, à dextris, ...
    6  \nA, étoit une lettre numérale parmi les Ancie...
    7  \nA, lettre symbolique, étoit un hiéroglyphe c...
    8  \nA, numismatique ou monétaire, sur le revers ...
    9  \nA, lapidaire, dans les anciennes inscription...
                                         first_paragraph  nb_words  super_domain  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...       151  Unclassified
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...       208  Unclassified
    2       \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n     44669  Unclassified
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...       711   Philosophie
    4  \nA, mot, est 1. la troisieme personne du prés...       238  Unclassified
    5  \nA, préposition vient du latin à, à dextris, ...      1980  Unclassified
    6  \nA, étoit une lettre numérale parmi les Ancie...       200  Unclassified
    7  \nA, lettre symbolique, étoit un hiéroglyphe c...        82  Unclassified
    8  \nA, numismatique ou monétaire, sur le revers ...       112  Unclassified
    9  \nA, lapidaire, dans les anciennes inscription...        80  Unclassified
      superdomainBert
    0     Philosophie
    1     Philosophie
    2  Belles-lettres
    3     Philosophie
    4     Philosophie
    5     Philosophie
    6        Histoire
    7        Histoire
    8        Histoire
    9        Histoire
 %% Cell type:code id: tags:
 ``` python
 df.to_csv(drive_path + "predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv", sep="\t")
 ```
 %% Cell type:code id: tags:
 ``` python
 # Remove the bulky text columns of the parallel (LGE/EDdA) dataset before
 # exporting metadata. errors='ignore' keeps this cell runnable when another
 # dataset without these columns is loaded (a plain drop raises KeyError then).
 df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True, errors='ignore')
 ```
 %% Cell type:code id: tags:
 ``` python
 df.to_csv(drive_path + "predictions/metadata_parallel_predictions_superdomain.csv", sep=",", index=False)
 ```
 %% Cell type:code id: tags:
 ``` python
+df.loc[(df['superdomainBert'] == 'Géographie')]
+```
+%% Output
+           volume  numero               head        author           edda_class  \
+    24          1      26                  A       Diderot         unclassified
+    25          1      27                 AA       Diderot         unclassified
+    27          1      29        AACH ou ACH       Diderot         unclassified
+    28          1      30              AAHUS       Diderot         unclassified
+    30          1      32                AAR       Diderot         unclassified
+    ...       ...     ...                ...           ...                  ...
+    74051      17    3070             ZYGRIS      Jaucourt  Géographie ancienne
+    74054      17    3073              ZYRAS      Jaucourt  Géographie ancienne
+    74055      17    3074  ZZUÉNÉ ou ZZEUENE      Jaucourt  Géographie ancienne
+    74080      17    3099           CABOTAGE      Jaucourt           Navigation
+    74165      17    3184          GUAYAQUIL  La Condamine           Géographie
+            enccre_id enccre_class  \
+    24         v1-9-0   Géographie
+    25        v1-10-0   Géographie
+    27        v1-12-0   Géographie
+    28        v1-13-0   Géographie
+    30        v1-15-0   Géographie
+    ...           ...          ...
+    74051  v17-2068-0   Géographie
+    74054  v17-2071-0   Géographie
+    74055  v17-2072-0   Géographie
+    74080  v17-2097-0       Marine
+    74165  v17-2177-0   Géographie
+                                                     content  \
+    24     \n* A, s. petite riviere de France, qui a sa s...
+    25     \n* AA, s. f. riviere de France, qui prend sa ...
+    27     \n* AACH ou ACH, s. f. petite ville d'Allemagn...
+    28     \n* AAHUS, s. petite ville d'Allemagne dans le...
+    30     \n* AAR, s. grande riviere qui a sa source pro...
+    ...                                                  ...
+    74051  \nZYGRIS, (Géog. anc.) ville du nôme de Lybie\...
+    74054  \nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...
+    74055  \nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...
+    74080  \nCABOTAGE, s. m. (Navigation.) le cabotage es...
+    74165  \nGUAYAQUIL, (Géograph.) nom d'une ville &\nd'...
+                                   content_without_designant  \
+    24     \n* A, s. petite riviere de France, qui a sa s...
+    25     \n* AA, s. f. riviere de France, qui prend sa ...
+    27     \n* AACH ou ACH, s. f. petite ville d'Allemagn...
+    28     \n* AAHUS, s. petite ville d'Allemagne dans le...
+    30     \n* AAR, s. grande riviere qui a sa source pro...
+    ...                                                  ...
+    74051  \nZYGRIS,  ville du nôme de Lybie\nsur la côte...
+    74054  \nZYRAS,  fleuve de Thrace. Pline,\nliv. IV. c...
+    74055  \nZZUÉNÉ ou ZZEUENE,  ville située\nsur la riv...
+    74080  \nCABOTAGE, s. m.  le cabotage est\nune naviga...
+    74165  \nGUAYAQUIL,  nom d'une ville &\nd'une grande ...
+                                             first_paragraph  nb_words  \
+    24     \n* A, s. petite riviere de France, qui a sa s...        15
+    25     \n* AA, s. f. riviere de France, qui prend sa ...        46
+    27     \n* AACH ou ACH, s. f. petite ville d'Allemagn...        24
+    28     \n* AAHUS, s. petite ville d'Allemagne dans le...        21
+    30     \n* AAR, s. grande riviere qui a sa source pro...        30
+    ...                                                  ...       ...
+    74051  \nZYGRIS,  ville du nôme de Lybie\nsur la côte...        38
+    74054  \nZYRAS,  fleuve de Thrace. Pline,\nliv. IV. c...        28
+    74055  \nZZUÉNÉ ou ZZEUENE,  ville située\nsur la riv...       149
+    74080  \nCABOTAGE, s. m.  le cabotage est\nune naviga...       192
+    74165  \nGUAYAQUIL,  nom d'une ville &\nd'une grande ...       446
+           super_domain superdomainBert
+    24     Unclassified      Géographie
+    25     Unclassified      Géographie
+    27     Unclassified      Géographie
+    28     Unclassified      Géographie
+    30     Unclassified      Géographie
+    ...             ...             ...
+    74051    Géographie      Géographie
+    74054    Géographie      Géographie
+    74055    Géographie      Géographie
+    74080    Géographie      Géographie
+    74165    Géographie      Géographie
+    [15383 rows x 13 columns]
+%% Cell type:code id: tags:
+``` python
 ```