Update Predict_XAI.ipynb

a36157a7 · Ludovic Moncla · 18a8d3f2 · a36157a7
Commit a36157a7 authored 2 years ago by Ludovic Moncla
--- a/notebooks/Predict_XAI.ipynb
+++ b/notebooks/Predict_XAI.ipynb
@@ -740,14 +740,19 @@
      "execution_count": null,
      "metadata": {},
      "outputs": [],
-      "source": []
+      "source": [
+        "edda_par_path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv\"\n",
+        "df_EDdA_par = pd.read_csv(edda_par_path, sep=\"\\t\")"
+      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
-      "source": []
+      "source": [
+        "df_EDdA_par.head()"
+      ]
    },
    {
      "cell_type": "markdown",
@@ -844,6 +849,16 @@
        "data_loader_EDdA = generate_dataloader(tokenizer, df_EDdA.content.values)"
      ]
    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# EDdA parallel\n",
+        "data_loader_EDdA_par = generate_dataloader(tokenizer, df_EDdA_par.content.values)"
+      ]
+    },
    {
      "cell_type": "markdown",
      "metadata": {},
@@ -887,6 +902,16 @@
        "df_EDdA['class_pred'] = list(encoder.inverse_transform(pred_EDdA))"
      ]
    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "pred_EDdA_par = predict(model, data_loader_EDdA_par, device)\n",
+        "df_EDdA_par['class_pred'] = list(encoder.inverse_transform(pred_EDdA_par))"
+      ]
+    },
    {
      "cell_type": "code",
      "execution_count": 20,
@@ -994,7 +1019,7 @@
      "cell_type": "markdown",
      "metadata": {},
      "source": [
-        "### 4.3 Save"
+        "### 4.4 Save"
      ]
    },
    {
@@ -1029,7 +1054,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 18,
+      "execution_count": 54,
      "metadata": {},
      "outputs": [],
      "source": [
@@ -1042,7 +1067,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 53,
+      "execution_count": 55,
      "metadata": {},
      "outputs": [
        {
@@ -1051,7 +1076,7 @@
              "\"\\nLYON, (Géogr.) grande, riche, belle, ancienne\\n& celebre ville de France, la plus considérable du\\nroyaume après Paris, & la capitale du Lyonnois.\\nElle se nomme en latin Lugdunum, Lugudunum, Lugdumum Segusianorum, Lugdumum Celtarum, &c.\\nVoyez Lugdunum.\\n\\nLyon fut fondée l'an de Rome 712, quarante-un\\nans avant l'ere chrétienne, par Lucius Munatius\\nPlancus, qui étoit consul avec AEmilius Lepidus. Il\\nla bâtit sur la Sône, au lieu où cette riviere se jette\\ndans le Rhône, & il la peupla des citoyens romains \\nqui a\""
            ]
          },
-          "execution_count": 53,
+          "execution_count": 55,
          "metadata": {},
          "output_type": "execute_result"
        }
@@ -1063,7 +1088,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 50,
+      "execution_count": 56,
      "metadata": {},
      "outputs": [
        {
@@ -1231,7 +1256,7 @@
              " ('[SEP]', 0.0)]"
            ]
          },
-          "execution_count": 50,
+          "execution_count": 56,
          "metadata": {},
          "output_type": "execute_result"
        }
@@ -1241,6 +1266,186 @@
        "word_attributions"
      ]
    },
+    {
+      "cell_type": "code",
+      "execution_count": 59,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[('ville', 0.478071716663547),\n",
+              " ('capitale', 0.2983988672217172),\n",
+              " ('royaume', 0.24665610131446675),\n",
+              " ('G', 0.24402357535335403),\n",
+              " ('##éo', 0.23393328870446992),\n",
+              " ('grande', 0.21832893139528123),\n",
+              " (',', 0.19909154256915337),\n",
+              " ('##gr', 0.1695800465119405),\n",
+              " ('##Y', 0.1456759996705617),\n",
+              " ('##ON', 0.14307146561933012),\n",
+              " ('.', 0.14162802579543046),\n",
+              " ('France', 0.13688799086603975),\n",
+              " ('latin', 0.13303588704102381),\n",
+              " ('chrétienne', 0.1311835388990743),\n",
+              " ('lieu', 0.13020947076813982),\n",
+              " ('ancienne', 0.12958979621300132),\n",
+              " ('celebre', 0.12947489123965564),\n",
+              " (',', 0.11257940886969105),\n",
+              " ('rivier', 0.11169096058453537),\n",
+              " ('nomme', 0.10000471924693329),\n",
+              " ('(', 0.09932002907423143),\n",
+              " ('la', 0.08253583803987206),\n",
+              " ('riche', 0.07913704700022943),\n",
+              " ('.', 0.0760972913677917),\n",
+              " ('du', 0.07376998774114908),\n",
+              " ('Rhône', 0.07226400802922804),\n",
+              " ('Elle', 0.0693630173969722),\n",
+              " (',', 0.06873738399629244),\n",
+              " (\"'\", 0.06774875716439344),\n",
+              " ('S', 0.06575101714951456),\n",
+              " ('.', 0.06146632041097513),\n",
+              " ('plus', 0.05840061099213507),\n",
+              " ('qui', 0.05804189959646576),\n",
+              " (\"'\", 0.05723696778164145),\n",
+              " (',', 0.05662853362544685),\n",
+              " ('712', 0.054693452829347115),\n",
+              " ('##érable', 0.05297839086419718),\n",
+              " ('fondée', 0.04704211890403151),\n",
+              " ('l', 0.04620483463390136),\n",
+              " ('.', 0.04544699824023643),\n",
+              " ('l', 0.0451974674122074),\n",
+              " ('la', 0.04410484491168233),\n",
+              " ('fut', 0.0438798787047486),\n",
+              " ('##e', 0.04378867745175019),\n",
+              " ('b', 0.042870227388534604),\n",
+              " ('sur', 0.04260004363332922),\n",
+              " ('la', 0.04198219592000479),\n",
+              " ('se', 0.04164162356829115),\n",
+              " ('peu', 0.040981027718879084),\n",
+              " ('c', 0.0403850871592572),\n",
+              " ('Lyon', 0.04007542253467923),\n",
+              " ('##um', 0.039922520378568),\n",
+              " ('##ye', 0.03940461731845493),\n",
+              " ('##rum', 0.03820084664850618),\n",
+              " ('qui', 0.03778469886529954),\n",
+              " ('##m', 0.03770363967219936),\n",
+              " ('Se', 0.037503453809376),\n",
+              " ('cette', 0.03737564027887762),\n",
+              " ('la', 0.03714972247323993),\n",
+              " ('Rome', 0.03688032185991681),\n",
+              " ('##umu', 0.03658546160187376),\n",
+              " ('##um', 0.03484266276127894),\n",
+              " ('##umu', 0.03456580806237662),\n",
+              " ('de', 0.03403811335226887),\n",
+              " ('Paris', 0.03310146903416289),\n",
+              " ('##gus', 0.03294045015997047),\n",
+              " ('##sid', 0.03255043778254519),\n",
+              " ('jet', 0.031575857152632385),\n",
+              " ('un', 0.03117251985912735),\n",
+              " ('le', 0.03105610182850656),\n",
+              " ('Lu', 0.0309274199183622),\n",
+              " ('Plan', 0.030820184404097863),\n",
+              " ('en', 0.03010674205624715),\n",
+              " ('##z', 0.029871874749211054),\n",
+              " ('##mil', 0.02963200210194755),\n",
+              " ('##unum', 0.029477331874186236),\n",
+              " ('Vo', 0.02935262786796574),\n",
+              " (',', 0.02919256859997905),\n",
+              " ('.', 0.029180628239546275),\n",
+              " ('##noi', 0.02909189419875202),\n",
+              " ('des', 0.028996120278423045),\n",
+              " ('quarante', 0.028231791558966633),\n",
+              " ('Lu', 0.02694302543021504),\n",
+              " ('##m', 0.026521004509341334),\n",
+              " ('Lu', 0.026337930390794705),\n",
+              " ('##s', 0.02625525527522554),\n",
+              " ('con', 0.026165582559808873),\n",
+              " ('##unum', 0.025898349689579492),\n",
+              " ('##dun', 0.025699022336446258),\n",
+              " ('##tar', 0.025188870477124894),\n",
+              " ('é', 0.024089543382319098),\n",
+              " ('Lu', 0.02366442497712222),\n",
+              " ('an', 0.023585319400848195),\n",
+              " ('##gu', 0.023084632572130535),\n",
+              " ('Lyon', 0.022692171217471906),\n",
+              " (',', 0.022664305461904344),\n",
+              " ('Il', 0.021736540370470812),\n",
+              " ('de', 0.021013220187771894),\n",
+              " ('##us', 0.020667475964218647),\n",
+              " ('##te', 0.020236291895152022),\n",
+              " ('avec', 0.019039309232488966),\n",
+              " ('du', 0.018572791985543135),\n",
+              " ('dans', 0.01852231748257226),\n",
+              " ('##cus', 0.01828726599412002),\n",
+              " ('##pid', 0.01787476167297771),\n",
+              " ('après', 0.01785470962170739),\n",
+              " ('avant', 0.017626577836139475),\n",
+              " ('##nati', 0.01752347206998558),\n",
+              " ('##iano', 0.017089445343453365),\n",
+              " ('Le', 0.016166723086828174),\n",
+              " ('##us', 0.015200983089939281),\n",
+              " ('##s', 0.01484737615013025),\n",
+              " ('##toi', 0.01483008688193065),\n",
+              " ('ans', 0.014472180695321534),\n",
+              " ('Cel', 0.014027086848242715),\n",
+              " (',', 0.013178253982938232),\n",
+              " ('où', 0.013157964330803138),\n",
+              " ('##ât', 0.011332787999157318),\n",
+              " ('##ôn', 0.011241165099203603),\n",
+              " ('##e', 0.010537012868472688),\n",
+              " ('##it', 0.009763016011555254),\n",
+              " ('Lu', 0.008645628419735481),\n",
+              " (',', 0.007825484996566502),\n",
+              " ('ere', 0.007590037219544403),\n",
+              " ('L', 0.007399733805079844),\n",
+              " ('par', 0.007011176299182855),\n",
+              " ('##gd', 0.006578965732858923),\n",
+              " ('Mu', 0.006340399133187405),\n",
+              " ('consul', 0.006312700914285012),\n",
+              " ('il', 0.00609352197030786),\n",
+              " ('se', 0.0058271154715995995),\n",
+              " ('##gd', 0.005721331572683938),\n",
+              " ('##pla', 0.005560350755837545),\n",
+              " ('##gd', 0.00518317960511743),\n",
+              " ('Lucius', 0.0044462351021057325),\n",
+              " ('##t', 0.0036622619849812073),\n",
+              " (',', 0.0030423079199119554),\n",
+              " ('[CLS]', 0.0),\n",
+              " ('[SEP]', 0.0),\n",
+              " ('-', -0.0003846539976056082),\n",
+              " ('##ius', -0.00048531039895657175),\n",
+              " (',', -0.0010714894154601323),\n",
+              " ('au', -0.0018304190363696647),\n",
+              " ('##gd', -0.00442376201350928),\n",
+              " (',', -0.0062768408260973066),\n",
+              " ('&', -0.006321268573570221),\n",
+              " (',', -0.006856821180122214),\n",
+              " ('la', -0.006879341345145134),\n",
+              " (',', -0.007011581545450849),\n",
+              " ('romain', -0.008083189911088765),\n",
+              " (',', -0.008296981653008715),\n",
+              " ('AE', -0.010990138859793724),\n",
+              " (',', -0.011642202072501788),\n",
+              " ('a', -0.022083265525204197),\n",
+              " ('belle', -0.029909244412604778),\n",
+              " ('citoyens', -0.03278504989463669),\n",
+              " ('&', -0.038176803729996794),\n",
+              " ('&', -0.0494132018474461),\n",
+              " ('&', -0.08418116246612357),\n",
+              " (')', -0.13544847084394057)]"
+            ]
+          },
+          "execution_count": 59,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "word_attributions.sort(key=lambda a: a[1], reverse = True)\n",
+        "word_attributions"
+      ]
+    },
    {
      "cell_type": "code",
      "execution_count": 51,

 %% Cell type:markdown id: tags:

 # BERT Predict classification

 ## 1. Setup the environment

 ### 1.1 Setup colab environment

 #### 1.1.1 Install packages

 %% Cell type:code id: tags:

 ``` python
 !pip install transformers==4.10.3
 !pip install sentencepiece
 !pip install transformers_interpret
 ```

 %% Cell type:markdown id: tags:

 #### 1.1.2 Use more RAM

 %% Cell type:code id: tags:

 ``` python
 from psutil import virtual_memory

 # Total memory reported by psutil, converted from bytes to decimal gigabytes.
 ram_gb = virtual_memory().total / 1e9
 print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

 # 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
 print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')
 ```

 %% Cell type:markdown id: tags:

 #### 1.1.3 Mount GoogleDrive

 %% Cell type:code id: tags:

 ``` python
 # google.colab is only importable inside a Colab runtime. When the notebook is
 # executed elsewhere, skip the mount instead of failing on the import so that
 # "Run All" keeps working.
 try:
    from google.colab import drive
 except ImportError:
    print('Not running on Colab, skipping the Google Drive mount.')
 else:
    drive.mount('/content/drive')
 ```

 %% Cell type:markdown id: tags:

 ### 1.2 Import libraries

 %% Cell type:code id: tags:

 ``` python
 import pickle
 import torch

 from transformers import BertTokenizer, BertForSequenceClassification
 from transformers_interpret import SequenceClassificationExplainer

 import numpy as np
 import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 import pandas as pd
 ```

 %% Cell type:markdown id: tags:

 ### 1.3 Setup GPU

 %% Cell type:code id: tags:

 ``` python
  # If there's a GPU available...
 # torch.backends.mps is missing from PyTorch builds without the Apple Metal
 # backend, so look it up defensively instead of assuming the attribute exists.
 _mps_backend = getattr(torch.backends, "mps", None)

 if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    gpu_name = "cuda"
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
 # for MacOS
 elif _mps_backend is not None and _mps_backend.is_available() and _mps_backend.is_built():
    device = torch.device("mps")
    gpu_name = "mps"
    print('We will use the GPU')
 else:
    # Neither CUDA nor MPS can be used: fall back to the CPU.
    device = torch.device("cpu")
    gpu_name = "cpu"
    print('No GPU available, using the CPU instead.')
 ```

 %% Output

    We will use the GPU

 %% Cell type:code id: tags:

 ``` python
 # NOTE(review): this unconditionally replaces the device selected by the
 # detection cell above, so the model and the predictions below run on the CPU.
 # Presumably a deliberate workaround; confirm, or remove this cell to use the GPU.
 device = torch.device("cpu")
 gpu_name = "cpu"
 ```

 %% Cell type:markdown id: tags:

 ## 2. Utils

 %% Cell type:code id: tags:

 ``` python

 def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):
    """Tokenize `sentences` and wrap them in a sequential `DataLoader`.

    Every sentence is encoded with `tokenizer.encode` (special tokens added),
    then cut or right-padded with 0 so that it holds exactly `max_len` ids.
    The attention mask is 1.0 where the id is greater than 0 and 0.0 elsewhere.

    Args:
        tokenizer: object exposing `encode(text, add_special_tokens=True)` and
            returning a list of integer token ids.
        sentences: iterable of texts to encode.
        batch_size: number of sequences per batch.
        max_len: fixed length of every encoded sequence.

    Returns:
        A `DataLoader` yielding `(input_ids, attention_mask)` batches in the
        same order as `sentences`.
    """
    padded_ids = []
    for sent in sentences:
        # `encode` tokenizes the sentence, adds the special tokens and maps the
        # tokens to their ids. Truncation and padding are done by hand below.
        token_ids = tokenizer.encode(sent, add_special_tokens = True)

        # Plain slice: a sequence longer than max_len simply loses its tail
        # (including the special token appended at the end).
        token_ids = token_ids[:max_len]
        padded_ids.append(token_ids + [0] * (max_len - len(token_ids)))

    # Convert to tensors.
    inputs = torch.tensor(np.array(padded_ids))

    # Mask of 1s for real tokens and 0s for padding, computed in one vectorised
    # comparison instead of a Python loop over every single id.
    masks = (inputs > 0).float()

    # Create the DataLoader; the sequential sampler keeps the input order.
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)

    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)


 def predict(model, dataloader, device):
    """Run `model` over `dataloader` and return one predicted class index per sample.

    Args:
        model: callable model exposing `eval()`; it is called with the input
            ids, `token_type_ids=None` and `attention_mask=...`, and the first
            element of its output is used as the logits.
        dataloader: iterable of `(input_ids, attention_mask)` tensor batches.
        device: torch device the batches are moved to before the forward pass.

    Returns:
        A flat list with the argmax class index of every sample, in dataloader
        order. The list is empty when the dataloader yields no batch.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    for batch in dataloader:
        # Move the batch tensors to the target device and unpack them
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a numpy array
        logits = outputs[0].detach().cpu().numpy()

        # Pick the label with the highest logit for each sample of this batch
        # only, rather than re-scanning every previous batch at each step.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_

 ```

 %% Cell type:markdown id: tags:

 ## 3. Load Data


 !! À modifier : charger le corpus parallèle : EDdA et LGE

 %% Cell type:markdown id: tags:

 ### 3.1 LGE (Nakala)

 %% Cell type:code id: tags:

 ``` python
 # Absolute, machine-specific path to the LGE articles file: adjust it to the
 # location of your own copy before running.
 lge_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_dataset_articles.tsv"
 # The file is tab-separated.
 df_LGE = pd.read_csv(lge_path, sep="\t")
 ```

 %% Cell type:code id: tags:

 ``` python
 df_LGE.head()
 ```

 %% Output

                   id tome      filename  \
    0     T1article_1   T1     article_1
    1    T1article_10   T1    article_10
    2   T1article_100   T1   article_100
    3  T1article_1000   T1  article_1000
    4  T1article_1001   T1  article_1001
    
                                                 content  nb_words
    0          F.-Camille DREYFUS, député de la Seine.\n         6
    1  quimarque un mouvement en avant de l’esprit hu...       212
    2  ABACUS. L’abacus ou abaque était un instrument...      1345
    3  H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793...       218
    4                                     1803Le Brun.\n         2

 %% Cell type:code id: tags:

 ``` python
 df_LGE.shape
 ```

 %% Output

    (229475, 5)

 %% Cell type:markdown id: tags:

 ### 3.2 LGE Parallel

 %% Cell type:code id: tags:

 ``` python
 # Absolute, machine-specific path to the LGE parallel articles file: adjust it
 # to the location of your own copy before running.
 lge_par_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_parallel_dataset_articles.tsv"
 # The file is tab-separated.
 df_LGE_par = pd.read_csv(lge_par_path, sep="\t")
 ```

 %% Cell type:code id: tags:

 ``` python
 df_LGE_par.head()
 ```

 %% Output

                      id tome         filename  \
    0            T1aam-0   T1            aam-0
    1          T1abaco-0   T1          abaco-0
    2         T1abacot-0   T1         abacot-0
    3        T1abaddon-0   T1        abaddon-0
    4  T1abandonnement-0   T1  abandonnement-0
    
                                                 content  nb_words
    0  AAM. Mesure de capacité pour les liquides en u...        38
    1  ABACO, architecte italien du xvi siècle (V. La...         8
    2  ABACOT. Double couronne que portaient autrefoi...        33
    3  ABADDONou APOLYON le Destructeur. « Elles\nava...       109
    4  ABANDONNEMENT. I. Droit civil. — Ce mot est un...        76

 %% Cell type:markdown id: tags:

 ### 3.3 EDdA (ARTFL)

 %% Cell type:code id: tags:

 ``` python
 # Absolute, machine-specific path to the EDdA articles file: adjust it to the
 # location of your own copy before running.
 edda_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv"
 # The file is tab-separated.
 df_EDdA = pd.read_csv(edda_path, sep="\t")
 ```

 %% Cell type:code id: tags:

 ``` python
 df_EDdA.head()
 ```

 %% Output

       volume  numero                                head                author  \
    0       1       1                          Title Page              unsigned
    1       1       2   A MONSEIGNEUR LE COMTE D'ARGENSON  Diderot & d'Alembert
    2       1       3  DISCOURS PRÉLIMINAIRE DES EDITEURS            d'Alembert
    3       1       5                            A, a & a            Dumarsais5
    4       1       6                                   A            Dumarsais5
    
         edda_class enccre_id enccre_class  \
    0  unclassified       NaN          NaN
    1  unclassified       NaN          NaN
    2  unclassified       NaN          NaN
    3     Grammaire    v1-1-0    Grammaire
    4  unclassified    v1-1-1    Grammaire
    
                                                 content  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
    
                               content_without_designant  \
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
    2  \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
    4  \nA, mot, est 1. la troisieme personne du prés...
    
                                         first_paragraph  nb_words
    0  \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...       151
    1  \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...       208
    2       \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n     44669
    3  \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...       711
    4  \nA, mot, est 1. la troisieme personne du prés...       238

 %% Cell type:markdown id: tags:

 ### 3.4 EDdA Parallel

 %% Cell type:code id: tags:

 ``` python
+edda_par_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv"  # NOTE(review): identical to edda_path in section 3.3, so this reloads the full EDdA file; confirm whether a dedicated parallel EDdA file was intended
+df_EDdA_par = pd.read_csv(edda_par_path, sep="\t")
 ```

 %% Cell type:code id: tags:

 ``` python
+df_EDdA_par.head()
 ```

 %% Cell type:markdown id: tags:

 ## 4. Load model and predict

 ### 4.1 Load BERT model

 %% Cell type:code id: tags:

 ``` python
 #path = "drive/MyDrive/Classification-EDdA/"
 path = "../"
 model_name = "bert-base-multilingual-cased"
 model_path = path + "models/model_" + model_name + "_s10000.pt"
 ```

 %% Cell type:code id: tags:

 ``` python
 encoder_filename = "models/label_encoder.pkl"
 # Load the pickled label encoder; it is used further down to turn predicted
 # class indices back into class names.
 # NOTE: pickle.load can execute arbitrary code, so only load files you trust.
 with open(path + encoder_filename, 'rb') as file:
      encoder = pickle.load(file)
 ```

 %% Cell type:code id: tags:

 ``` python
 tokenizer = BertTokenizer.from_pretrained(model_name)
 ```

 %% Cell type:code id: tags:

 ``` python
 model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name) #.to("cuda")
 ```

 %% Cell type:markdown id: tags:

 ### 4.2 Prepare datasets

 %% Cell type:code id: tags:

 ``` python
 # LGE
 data_loader_LGE = generate_dataloader(tokenizer, df_LGE.content.values)
 ```

 %% Output

    Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors

 %% Cell type:code id: tags:

 ``` python
 # LGE parallel
 data_loader_LGE_par = generate_dataloader(tokenizer, df_LGE_par.content.values)
 ```

 %% Cell type:code id: tags:

 ``` python
 # EDdA
 data_loader_EDdA = generate_dataloader(tokenizer, df_EDdA.content.values)
 ```

+%% Cell type:code id: tags:
+
+``` python
+# EDdA parallel
+data_loader_EDdA_par = generate_dataloader(tokenizer, df_EDdA_par.content.values)
+```
+
 %% Cell type:markdown id: tags:

 ### 4.3 Predict

 %% Cell type:code id: tags:

 ``` python
 pred_LGE = predict(model, data_loader_LGE, device)
 df_LGE['class_pred'] = list(encoder.inverse_transform(pred_LGE))
 ```

 %% Cell type:code id: tags:

 ``` python
 pred_LGE_par = predict(model, data_loader_LGE_par, device)
 df_LGE_par['class_pred'] = list(encoder.inverse_transform(pred_LGE_par))
 ```

 %% Cell type:code id: tags:

 ``` python
 pred_EDdA = predict(model, data_loader_EDdA, device)
 df_EDdA['class_pred'] = list(encoder.inverse_transform(pred_EDdA))
 ```

 %% Cell type:code id: tags:

 ``` python
+pred_EDdA_par = predict(model, data_loader_EDdA_par, device)
+df_EDdA_par['class_pred'] = list(encoder.inverse_transform(pred_EDdA_par))
+```
+
+%% Cell type:code id: tags:
+
+``` python
 df_LGE.head()
 ```

 %% Output

                   id tome      filename  \
    0     T1article_1   T1     article_1
    1    T1article_10   T1    article_10
    2   T1article_100   T1   article_100
    3  T1article_1000   T1  article_1000
    4  T1article_1001   T1  article_1001
    
                                                 content  nb_words
    0          F.-Camille DREYFUS, député de la Seine.\n         6
    1  quimarque un mouvement en avant de l’esprit hu...       212
    2  ABACUS. L’abacus ou abaque était un instrument...      1345
    3  H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793...       218
    4                                     1803Le Brun.\n         2

 %% Cell type:markdown id: tags:

-### 4.3 Save
+### 4.4 Save

 %% Cell type:code id: tags:

 ``` python
 # Export the LGE dataframe (article content included) as a comma-separated file.
 filepath = path + "results_LGE/LGE-metadata-withContent.csv"
 # The delimiter must be a single character: "\," is the two-character string
 # backslash + comma, which the CSV writer rejects, so use a plain comma.
 df_LGE.to_csv(filepath, sep=",")
 ```

 %% Cell type:code id: tags:

 ``` python
 # Remove the article text in place so that only the remaining columns are
 # exported. errors="ignore" keeps the cell re-runnable once 'content' is gone.
 df_LGE.drop(columns=['content'], inplace=True, errors="ignore")
 filepath = path + "results_LGE/LGE-metadata.csv"
 # The delimiter must be a single character: "\," is the two-character string
 # backslash + comma, which the CSV writer rejects, so use a plain comma.
 df_LGE.to_csv(filepath, sep=",")
 ```

 %% Cell type:markdown id: tags:

 ## 5. BERT XAI

 https://www.kaggle.com/code/rizwanhaidar/deep-learning-xai-models-loading-and-predictions

 %% Cell type:code id: tags:

 ``` python
 cls_explainer = SequenceClassificationExplainer(
    model,
    tokenizer,
    custom_labels=encoder.classes_.tolist()
 )
 ```

 %% Cell type:code id: tags:

 ``` python
 content = df_EDdA.loc[df_EDdA['head']=="LYON"].reset_index().content[0][:512]
 content
 ```

 %% Output

    "\nLYON, (Géogr.) grande, riche, belle, ancienne\n& celebre ville de France, la plus considérable du\nroyaume après Paris, & la capitale du Lyonnois.\nElle se nomme en latin Lugdunum, Lugudunum, Lugdumum Segusianorum, Lugdumum Celtarum, &c.\nVoyez Lugdunum.\n\nLyon fut fondée l'an de Rome 712, quarante-un\nans avant l'ere chrétienne, par Lucius Munatius\nPlancus, qui étoit consul avec AEmilius Lepidus. Il\nla bâtit sur la Sône, au lieu où cette riviere se jette\ndans le Rhône, & il la peupla des citoyens romains \nqui a"

 %% Cell type:code id: tags:

 ``` python
 # Explain at most the first 512 characters of the text. Slicing a shorter
 # string returns it unchanged, so no length check is needed.
 # NOTE: 512 is a number of characters here, not a number of tokens.
 word_attributions = cls_explainer(content[:512])
 word_attributions
 ```

 %% Output

    [('[CLS]', 0.0),
     ('L', 0.007399733805079844),
     ('##Y', 0.1456759996705617),
     ('##ON', 0.14307146561933012),
     (',', 0.19909154256915337),
     ('(', 0.09932002907423143),
     ('G', 0.24402357535335403),
     ('##éo', 0.23393328870446992),
     ('##gr', 0.1695800465119405),
     ('.', 0.14162802579543046),
     (')', -0.13544847084394057),
     ('grande', 0.21832893139528123),
     (',', 0.11257940886969105),
     ('riche', 0.07913704700022943),
     (',', 0.05662853362544685),
     ('belle', -0.029909244412604778),
     (',', 0.06873738399629244),
     ('ancienne', 0.12958979621300132),
     ('&', -0.08418116246612357),
     ('celebre', 0.12947489123965564),
     ('ville', 0.478071716663547),
     ('de', 0.03403811335226887),
     ('France', 0.13688799086603975),
     (',', -0.0010714894154601323),
     ('la', -0.006879341345145134),
     ('plus', 0.05840061099213507),
     ('con', 0.026165582559808873),
     ('##sid', 0.03255043778254519),
     ('##érable', 0.05297839086419718),
     ('du', 0.018572791985543135),
     ('royaume', 0.24665610131446675),
     ('après', 0.01785470962170739),
     ('Paris', 0.03310146903416289),
     (',', -0.006856821180122214),
     ('&', -0.006321268573570221),
     ('la', 0.08253583803987206),
     ('capitale', 0.2983988672217172),
     ('du', 0.07376998774114908),
     ('Lyon', 0.04007542253467923),
     ('##noi', 0.02909189419875202),
     ('##s', 0.02625525527522554),
     ('.', 0.0760972913677917),
     ('Elle', 0.0693630173969722),
     ('se', 0.04164162356829115),
     ('nomme', 0.10000471924693329),
     ('en', 0.03010674205624715),
     ('latin', 0.13303588704102381),
     ('Lu', 0.0309274199183622),
     ('##gd', 0.00518317960511743),
     ('##unum', 0.029477331874186236),
     (',', -0.007011581545450849),
     ('Lu', 0.008645628419735481),
     ('##gu', 0.023084632572130535),
     ('##dun', 0.025699022336446258),
     ('##um', 0.03484266276127894),
     (',', 0.0030423079199119554),
     ('Lu', 0.02366442497712222),
     ('##gd', 0.006578965732858923),
     ('##umu', 0.03456580806237662),
     ('##m', 0.026521004509341334),
     ('Se', 0.037503453809376),
     ('##gus', 0.03294045015997047),
     ('##iano', 0.017089445343453365),
     ('##rum', 0.03820084664850618),
     (',', -0.011642202072501788),
     ('Lu', 0.02694302543021504),
     ('##gd', 0.005721331572683938),
     ('##umu', 0.03658546160187376),
     ('##m', 0.03770363967219936),
     ('Cel', 0.014027086848242715),
     ('##tar', 0.025188870477124894),
     ('##um', 0.039922520378568),
     (',', 0.022664305461904344),
     ('&', -0.0494132018474461),
     ('c', 0.0403850871592572),
     ('.', 0.04544699824023643),
     ('Vo', 0.02935262786796574),
     ('##ye', 0.03940461731845493),
     ('##z', 0.029871874749211054),
     ('Lu', 0.026337930390794705),
     ('##gd', -0.00442376201350928),
     ('##unum', 0.025898349689579492),
     ('.', 0.06146632041097513),
     ('Lyon', 0.022692171217471906),
     ('fut', 0.0438798787047486),
     ('fondée', 0.04704211890403151),
     ('l', 0.0451974674122074),
     ("'", 0.06774875716439344),
     ('an', 0.023585319400848195),
     ('de', 0.021013220187771894),
     ('Rome', 0.03688032185991681),
     ('712', 0.054693452829347115),
     (',', 0.007825484996566502),
     ('quarante', 0.028231791558966633),
     ('-', -0.0003846539976056082),
     ('un', 0.03117251985912735),
     ('ans', 0.014472180695321534),
     ('avant', 0.017626577836139475),
     ('l', 0.04620483463390136),
     ("'", 0.05723696778164145),
     ('ere', 0.007590037219544403),
     ('chrétienne', 0.1311835388990743),
     (',', 0.013178253982938232),
     ('par', 0.007011176299182855),
     ('Lucius', 0.0044462351021057325),
     ('Mu', 0.006340399133187405),
     ('##nati', 0.01752347206998558),
     ('##us', 0.015200983089939281),
     ('Plan', 0.030820184404097863),
     ('##cus', 0.01828726599412002),
     (',', -0.008296981653008715),
     ('qui', 0.03778469886529954),
     ('é', 0.024089543382319098),
     ('##toi', 0.01483008688193065),
     ('##t', 0.0036622619849812073),
     ('consul', 0.006312700914285012),
     ('avec', 0.019039309232488966),
     ('AE', -0.010990138859793724),
     ('##mil', 0.02963200210194755),
     ('##ius', -0.00048531039895657175),
     ('Le', 0.016166723086828174),
     ('##pid', 0.01787476167297771),
     ('##us', 0.020667475964218647),
     ('.', 0.029180628239546275),
     ('Il', 0.021736540370470812),
     ('la', 0.03714972247323993),
     ('b', 0.042870227388534604),
     ('##ât', 0.011332787999157318),
     ('##it', 0.009763016011555254),
     ('sur', 0.04260004363332922),
     ('la', 0.04410484491168233),
     ('S', 0.06575101714951456),
     ('##ôn', 0.011241165099203603),
     ('##e', 0.010537012868472688),
     (',', -0.0062768408260973066),
     ('au', -0.0018304190363696647),
     ('lieu', 0.13020947076813982),
     ('où', 0.013157964330803138),
     ('cette', 0.03737564027887762),
     ('rivier', 0.11169096058453537),
     ('##e', 0.04378867745175019),
     ('se', 0.0058271154715995995),
     ('jet', 0.031575857152632385),
     ('##te', 0.020236291895152022),
     ('dans', 0.01852231748257226),
     ('le', 0.03105610182850656),
     ('Rhône', 0.07226400802922804),
     (',', 0.02919256859997905),
     ('&', -0.038176803729996794),
     ('il', 0.00609352197030786),
     ('la', 0.04198219592000479),
     ('peu', 0.040981027718879084),
     ('##pla', 0.005560350755837545),
     ('des', 0.028996120278423045),
     ('citoyens', -0.03278504989463669),
     ('romain', -0.008083189911088765),
     ('##s', 0.01484737615013025),
     ('qui', 0.05804189959646576),
     ('a', -0.022083265525204197),
     ('[SEP]', 0.0)]

 %% Cell type:code id: tags:

 ``` python
+word_attributions.sort(key=lambda a: a[1], reverse = True)  # in-place sort of the (token, score) pairs by score, highest first. NOTE(review): this reorders `word_attributions` itself, so the token order shown by the previous cell is no longer available from this variable -- confirm no later cell relies on positional order
+word_attributions  # bare last expression: displays the sorted list as the cell output
+```
+
+%% Output
+
+    [('ville', 0.478071716663547),
+     ('capitale', 0.2983988672217172),
+     ('royaume', 0.24665610131446675),
+     ('G', 0.24402357535335403),
+     ('##éo', 0.23393328870446992),
+     ('grande', 0.21832893139528123),
+     (',', 0.19909154256915337),
+     ('##gr', 0.1695800465119405),
+     ('##Y', 0.1456759996705617),
+     ('##ON', 0.14307146561933012),
+     ('.', 0.14162802579543046),
+     ('France', 0.13688799086603975),
+     ('latin', 0.13303588704102381),
+     ('chrétienne', 0.1311835388990743),
+     ('lieu', 0.13020947076813982),
+     ('ancienne', 0.12958979621300132),
+     ('celebre', 0.12947489123965564),
+     (',', 0.11257940886969105),
+     ('rivier', 0.11169096058453537),
+     ('nomme', 0.10000471924693329),
+     ('(', 0.09932002907423143),
+     ('la', 0.08253583803987206),
+     ('riche', 0.07913704700022943),
+     ('.', 0.0760972913677917),
+     ('du', 0.07376998774114908),
+     ('Rhône', 0.07226400802922804),
+     ('Elle', 0.0693630173969722),
+     (',', 0.06873738399629244),
+     ("'", 0.06774875716439344),
+     ('S', 0.06575101714951456),
+     ('.', 0.06146632041097513),
+     ('plus', 0.05840061099213507),
+     ('qui', 0.05804189959646576),
+     ("'", 0.05723696778164145),
+     (',', 0.05662853362544685),
+     ('712', 0.054693452829347115),
+     ('##érable', 0.05297839086419718),
+     ('fondée', 0.04704211890403151),
+     ('l', 0.04620483463390136),
+     ('.', 0.04544699824023643),
+     ('l', 0.0451974674122074),
+     ('la', 0.04410484491168233),
+     ('fut', 0.0438798787047486),
+     ('##e', 0.04378867745175019),
+     ('b', 0.042870227388534604),
+     ('sur', 0.04260004363332922),
+     ('la', 0.04198219592000479),
+     ('se', 0.04164162356829115),
+     ('peu', 0.040981027718879084),
+     ('c', 0.0403850871592572),
+     ('Lyon', 0.04007542253467923),
+     ('##um', 0.039922520378568),
+     ('##ye', 0.03940461731845493),
+     ('##rum', 0.03820084664850618),
+     ('qui', 0.03778469886529954),
+     ('##m', 0.03770363967219936),
+     ('Se', 0.037503453809376),
+     ('cette', 0.03737564027887762),
+     ('la', 0.03714972247323993),
+     ('Rome', 0.03688032185991681),
+     ('##umu', 0.03658546160187376),
+     ('##um', 0.03484266276127894),
+     ('##umu', 0.03456580806237662),
+     ('de', 0.03403811335226887),
+     ('Paris', 0.03310146903416289),
+     ('##gus', 0.03294045015997047),
+     ('##sid', 0.03255043778254519),
+     ('jet', 0.031575857152632385),
+     ('un', 0.03117251985912735),
+     ('le', 0.03105610182850656),
+     ('Lu', 0.0309274199183622),
+     ('Plan', 0.030820184404097863),
+     ('en', 0.03010674205624715),
+     ('##z', 0.029871874749211054),
+     ('##mil', 0.02963200210194755),
+     ('##unum', 0.029477331874186236),
+     ('Vo', 0.02935262786796574),
+     (',', 0.02919256859997905),
+     ('.', 0.029180628239546275),
+     ('##noi', 0.02909189419875202),
+     ('des', 0.028996120278423045),
+     ('quarante', 0.028231791558966633),
+     ('Lu', 0.02694302543021504),
+     ('##m', 0.026521004509341334),
+     ('Lu', 0.026337930390794705),
+     ('##s', 0.02625525527522554),
+     ('con', 0.026165582559808873),
+     ('##unum', 0.025898349689579492),
+     ('##dun', 0.025699022336446258),
+     ('##tar', 0.025188870477124894),
+     ('é', 0.024089543382319098),
+     ('Lu', 0.02366442497712222),
+     ('an', 0.023585319400848195),
+     ('##gu', 0.023084632572130535),
+     ('Lyon', 0.022692171217471906),
+     (',', 0.022664305461904344),
+     ('Il', 0.021736540370470812),
+     ('de', 0.021013220187771894),
+     ('##us', 0.020667475964218647),
+     ('##te', 0.020236291895152022),
+     ('avec', 0.019039309232488966),
+     ('du', 0.018572791985543135),
+     ('dans', 0.01852231748257226),
+     ('##cus', 0.01828726599412002),
+     ('##pid', 0.01787476167297771),
+     ('après', 0.01785470962170739),
+     ('avant', 0.017626577836139475),
+     ('##nati', 0.01752347206998558),
+     ('##iano', 0.017089445343453365),
+     ('Le', 0.016166723086828174),
+     ('##us', 0.015200983089939281),
+     ('##s', 0.01484737615013025),
+     ('##toi', 0.01483008688193065),
+     ('ans', 0.014472180695321534),
+     ('Cel', 0.014027086848242715),
+     (',', 0.013178253982938232),
+     ('où', 0.013157964330803138),
+     ('##ât', 0.011332787999157318),
+     ('##ôn', 0.011241165099203603),
+     ('##e', 0.010537012868472688),
+     ('##it', 0.009763016011555254),
+     ('Lu', 0.008645628419735481),
+     (',', 0.007825484996566502),
+     ('ere', 0.007590037219544403),
+     ('L', 0.007399733805079844),
+     ('par', 0.007011176299182855),
+     ('##gd', 0.006578965732858923),
+     ('Mu', 0.006340399133187405),
+     ('consul', 0.006312700914285012),
+     ('il', 0.00609352197030786),
+     ('se', 0.0058271154715995995),
+     ('##gd', 0.005721331572683938),
+     ('##pla', 0.005560350755837545),
+     ('##gd', 0.00518317960511743),
+     ('Lucius', 0.0044462351021057325),
+     ('##t', 0.0036622619849812073),
+     (',', 0.0030423079199119554),
+     ('[CLS]', 0.0),
+     ('[SEP]', 0.0),
+     ('-', -0.0003846539976056082),
+     ('##ius', -0.00048531039895657175),
+     (',', -0.0010714894154601323),
+     ('au', -0.0018304190363696647),
+     ('##gd', -0.00442376201350928),
+     (',', -0.0062768408260973066),
+     ('&', -0.006321268573570221),
+     (',', -0.006856821180122214),
+     ('la', -0.006879341345145134),
+     (',', -0.007011581545450849),
+     ('romain', -0.008083189911088765),
+     (',', -0.008296981653008715),
+     ('AE', -0.010990138859793724),
+     (',', -0.011642202072501788),
+     ('a', -0.022083265525204197),
+     ('belle', -0.029909244412604778),
+     ('citoyens', -0.03278504989463669),
+     ('&', -0.038176803729996794),
+     ('&', -0.0494132018474461),
+     ('&', -0.08418116246612357),
+     (')', -0.13544847084394057)]
+
+%% Cell type:code id: tags:
+
+``` python
 cls_explainer.predicted_class_name  # display the class name reported by the explainer (recorded output: 'Géographie')
 ```

 %% Output

    'Géographie'

 %% Cell type:code id: tags:

 ``` python
 cls_explainer.visualize()  # recorded output is an IPython HTML object; presumably the per-token attribution view -- TODO confirm against the explainer's documentation
 ```

 %% Output


    <IPython.core.display.HTML object>

 %% Cell type:code id: tags:

 ``` python
 ```

 %% Cell type:markdown id: tags:

 * récupérer les mots positifs par domaine (EDdA et LGE)
 * faire des nuages de mots et comparer les plus fréquents entre EDdA et LGE (corpus parallèle)

 %% Cell type:markdown id: tags: