From 6db02a3d32f8f3335862e0eb88902f86362eff39 Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Mon, 28 Nov 2022 15:31:09 +0100 Subject: [PATCH] Update Predict_XAI.ipynb --- notebooks/Predict_XAI.ipynb | 480 ++++++++++++++++++++++++++---------- 1 file changed, 355 insertions(+), 125 deletions(-) diff --git a/notebooks/Predict_XAI.ipynb b/notebooks/Predict_XAI.ipynb index 1fdf013..517c7a3 100644 --- a/notebooks/Predict_XAI.ipynb +++ b/notebooks/Predict_XAI.ipynb @@ -108,8 +108,6 @@ "import numpy as np\n", "import torch\n", "from torch.utils.data import TensorDataset, DataLoader, SequentialSampler\n", - "from tqdm import tqdm\n", - "import os\n", "import pandas as pd \n" ] }, @@ -278,27 +276,7 @@ "\n", " pred_labels_ += [item for sublist in pred_labels for item in sublist]\n", " return pred_labels_\n", - "\n", - "\n", - "def text_folder_to_dataframe(path):\n", - "\n", - " data = []\n", - " # id,tome,filename,nb_words,content,domain\n", - "\n", - " for tome in sorted(os.listdir(path)):\n", - " try:\n", - " for article in tqdm(sorted(os.listdir(path + \"/\" + tome))):\n", - " filename = article[:-4]\n", - " id = tome + filename\n", - "\n", - " if article[-4:] == \".txt\":\n", - " with open(path + \"/\" + tome + \"/\" + article) as f:\n", - " content = f.read()\n", - "\n", - " data.append([id, tome, filename, content, len(content.split(' '))])\n", - " except NotADirectoryError:\n", - " pass\n", - " return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])\n" + "\n" ] }, { @@ -307,92 +285,172 @@ "id": "c5QKcXulhNJ-" }, "source": [ - "## 2. Load Data\n", + "## 3. Load Data\n", "\n", "\n", "!! A modifier: charger le corpus parallele : EDdA et LGE" ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!wget https://api.nakala.fr/data/10.34847/nkl.74eb1xfd/e522413b58b04ab7c283f8fa68642e9cb69ab5c5" + "### 3.1 LGE (Nakala)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "!unzip e522413b58b04ab7c283f8fa68642e9cb69ab5c5" + "lge_path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_dataset_articles.tsv\"\n", + "df_LGE = pd.read_csv(lge_path, sep=\"\\t\")" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>tome</th>\n", + " <th>filename</th>\n", + " <th>content</th>\n", + " <th>nb_words</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>T1article_1</td>\n", + " <td>T1</td>\n", + " <td>article_1</td>\n", + " <td>F.-Camille DREYFUS, député de la Seine.\\n</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>T1article_10</td>\n", + " <td>T1</td>\n", + " <td>article_10</td>\n", + " <td>quimarque un mouvement en avant de l’esprit hu...</td>\n", + " <td>212</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>T1article_100</td>\n", + " <td>T1</td>\n", + " <td>article_100</td>\n", + " <td>ABACUS. L’abacus ou abaque était un instrument...</td>\n", + " <td>1345</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>T1article_1000</td>\n", + " <td>T1</td>\n", + " <td>article_1000</td>\n", + " <td>H6SS6)\\n1780-1793 Choiseul-Goufficr\\n1780-1793...</td>\n", + " <td>218</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>T1article_1001</td>\n", + " <td>T1</td>\n", + " <td>article_1001</td>\n", + " <td>1803Le Brun.\\n</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id tome filename \\\n", + "0 T1article_1 T1 article_1 \n", + "1 T1article_10 T1 article_10 \n", + "2 T1article_100 T1 article_100 \n", + "3 T1article_1000 T1 article_1000 \n", + "4 T1article_1001 T1 article_1001 \n", + "\n", + " content nb_words \n", + "0 F.-Camille DREYFUS, député de la Seine.\\n 6 \n", + "1 quimarque un mouvement en avant de l’esprit hu... 212 \n", + "2 ABACUS. L’abacus ou abaque était un instrument... 1345 \n", + "3 H6SS6)\\n1780-1793 Choiseul-Goufficr\\n1780-1793... 218 \n", + "4 1803Le Brun.\\n 2 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "input_path = \"/Users/lmoncla/Documents/Data/Corpus/LGE/Text\"\n", - "#input_path = \"./Text\"" + "df_LGE.head()" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5201/5201 [00:00<00:00, 6137.32it/s]\n", - "100%|██████████| 5704/5704 [00:00<00:00, 6325.35it/s]\n", - "100%|██████████| 5214/5214 [00:00<00:00, 5986.96it/s]\n", - "100%|██████████| 5528/5528 [00:00<00:00, 6213.04it/s]\n", - "100%|██████████| 6963/6963 [00:01<00:00, 5686.82it/s]\n", - "100%|██████████| 5983/5983 [00:00<00:00, 6120.28it/s]\n", - "100%|██████████| 13713/13713 [00:01<00:00, 7057.45it/s]\n", - "100%|██████████| 9202/9202 [00:01<00:00, 7161.23it/s]\n", - "100%|██████████| 10704/10704 [00:01<00:00, 7208.53it/s]\n", - "100%|██████████| 6378/6378 [00:00<00:00, 6988.23it/s]\n", - "100%|██████████| 8476/8476 [00:01<00:00, 7098.48it/s]\n", - "100%|██████████| 6576/6576 [00:00<00:00, 6996.19it/s]\n", - "100%|██████████| 7797/7797 [00:01<00:00, 6981.47it/s]\n", - "100%|██████████| 9027/9027 [00:01<00:00, 6563.44it/s]\n", - "100%|██████████| 8383/8383 [00:01<00:00, 7017.88it/s]\n", - "100%|██████████| 7319/7319 [00:01<00:00, 7064.77it/s]\n", - "100%|██████████| 10269/10269 [00:01<00:00, 6864.36it/s]\n", - "100%|██████████| 7512/7512 [00:01<00:00, 6854.61it/s]\n", - "100%|██████████| 6701/6701 [00:01<00:00, 6501.17it/s]\n", - "100%|██████████| 7343/7343 [00:01<00:00, 6933.17it/s]\n", - "100%|██████████| 7273/7273 [00:01<00:00, 6877.68it/s]\n", - "100%|██████████| 10877/10877 [00:01<00:00, 6410.62it/s]\n", - "100%|██████████| 4731/4731 [00:00<00:00, 6429.83it/s]\n", - "100%|██████████| 8698/8698 [00:01<00:00, 6076.43it/s]\n", - "100%|██████████| 9675/9675 [00:01<00:00, 6399.53it/s]\n", - "100%|██████████| 5710/5710 [00:00<00:00, 6343.15it/s]\n", - "100%|██████████| 5664/5664 [00:00<00:00, 6450.75it/s]\n", - "100%|██████████| 5828/5828 [00:00<00:00, 6425.49it/s]\n", - "100%|██████████| 5721/5721 [00:00<00:00, 6536.62it/s]\n", - "100%|██████████| 6110/6110 [00:00<00:00, 6391.42it/s]\n", - "100%|██████████| 5195/5195 [00:00<00:00, 6016.13it/s]\n" - ] + "data": { + "text/plain": [ + "(229475, 5)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "df_LGE = text_folder_to_dataframe(input_path)\n", - "#df_LGE = pd.read_csv(path + \"data/LGE_withContent.tsv\", sep=\"\\t\")\n", - "data_LGE = df_LGE[\"content\"].values" + "df_LGE.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 LGE Parallel" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "lge_par_path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_parallel_dataset_articles.tsv\"\n", + "df_LGE_par = pd.read_csv(lge_par_path, sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -418,107 +476,279 @@ " <th></th>\n", " <th>id</th>\n", " <th>tome</th>\n", - " <th>rank</th>\n", - " <th>domain</th>\n", - " <th>remark</th>\n", + " <th>filename</th>\n", " <th>content</th>\n", + " <th>nb_words</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>abrabeses-0</td>\n", - " <td>1</td>\n", - " <td>623</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n", + " <td>T1aam-0</td>\n", + " <td>T1</td>\n", + " <td>aam-0</td>\n", + " <td>AAM. Mesure de capacité pour les liquides en u...</td>\n", + " <td>38</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>accius-0</td>\n", - " <td>1</td>\n", - " <td>1076</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n", + " <td>T1abaco-0</td>\n", + " <td>T1</td>\n", + " <td>abaco-0</td>\n", + " <td>ABACO, architecte italien du xvi siècle (V. La...</td>\n", + " <td>8</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>achenbach-2</td>\n", - " <td>1</td>\n", - " <td>1357</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n", + " <td>T1abacot-0</td>\n", + " <td>T1</td>\n", + " <td>abacot-0</td>\n", + " <td>ABACOT. Double couronne que portaient autrefoi...</td>\n", + " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>acireale-0</td>\n", - " <td>1</td>\n", - " <td>1513</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n", + " <td>T1abaddon-0</td>\n", + " <td>T1</td>\n", + " <td>abaddon-0</td>\n", + " <td>ABADDONou APOLYON le Destructeur. « Elles\\nava...</td>\n", + " <td>109</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>actée-0</td>\n", - " <td>1</td>\n", - " <td>1731</td>\n", - " <td>botany</td>\n", - " <td>NaN</td>\n", - " <td>ACTÉE(Actœa L.). Genre de plantes de la famill...</td>\n", + " <td>T1abandonnement-0</td>\n", + " <td>T1</td>\n", + " <td>abandonnement-0</td>\n", + " <td>ABANDONNEMENT. I. Droit civil. — Ce mot est un...</td>\n", + " <td>76</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " id tome rank domain remark \\\n", - "0 abrabeses-0 1 623 geography NaN \n", - "1 accius-0 1 1076 biography NaN \n", - "2 achenbach-2 1 1357 biography NaN \n", - "3 acireale-0 1 1513 geography NaN \n", - "4 actée-0 1 1731 botany NaN \n", + " id tome filename \\\n", + "0 T1aam-0 T1 aam-0 \n", + "1 T1abaco-0 T1 abaco-0 \n", + "2 T1abacot-0 T1 abacot-0 \n", + "3 T1abaddon-0 T1 abaddon-0 \n", + "4 T1abandonnement-0 T1 abandonnement-0 \n", "\n", - " content \n", - "0 ABRABESES. Village d’Espagne de la prov. de Za... \n", - "1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po... \n", - "2 ACHENBACH(Henri), administrateur prussien, né ... \n", - "3 ACIREALE. Yille de Sicile, de la province et d... \n", - "4 ACTÉE(Actœa L.). Genre de plantes de la famill... " + " content nb_words \n", + "0 AAM. Mesure de capacité pour les liquides en u... 38 \n", + "1 ABACO, architecte italien du xvi siècle (V. La... 8 \n", + "2 ABACOT. Double couronne que portaient autrefoi... 33 \n", + "3 ABADDONou APOLYON le Destructeur. « Elles\\nava... 109 \n", + "4 ABANDONNEMENT. I. Droit civil. — Ce mot est un... 76 " ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_LGE.head()" + "df_LGE_par.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3 EDdA (ARTFL)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "edda_path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv\"\n", + "df_EDdA = pd.read_csv(edda_path, sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>151</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>208</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", + " <td>44669</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>711</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>238</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], "text/plain": [ - "(310, 6)" + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", + "\n", + " edda_class enccre_id enccre_class \\\n", + "0 unclassified NaN NaN \n", + "1 unclassified NaN NaN \n", + "2 unclassified NaN NaN \n", + "3 Grammaire v1-1-0 Grammaire \n", + "4 unclassified v1-1-1 Grammaire \n", + "\n", + " content \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "\n", + " content_without_designant \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "\n", + " first_paragraph nb_words \n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... 238 " ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_LGE.shape" + "df_EDdA.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.4 EDdA Parallel" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, -- GitLab