From c83c5d0dfaba790e7a0c026d4ce3c5e786bd0bbc Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Tue, 14 Mar 2023 22:16:29 +0100 Subject: [PATCH] Update Predict.ipynb --- notebooks/Predict.ipynb | 1949 +++++++++++++++++++++++++++------------ 1 file changed, 1382 insertions(+), 567 deletions(-) diff --git a/notebooks/Predict.ipynb b/notebooks/Predict.ipynb index 57eced6..e9e6378 100644 --- a/notebooks/Predict.ipynb +++ b/notebooks/Predict.ipynb @@ -169,20 +169,7 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "M2awiee1r0zV" - }, - "outputs": [], - "source": [ - "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n", - "drive_path = \"../\"\n", - "path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -194,21 +181,29 @@ "source": [ "#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv\n", "#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv\n", - "!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv" + "#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { - "id": "eea7F4vato1x" + "id": "M2awiee1r0zV" }, "outputs": [], "source": [ - "#filepath = \"data/LGE_withContent.tsv\"\n", - "#filepath = \"EDdA_dataset_articles_no_superdomain.tsv\"\n", + "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n", + "drive_path = \"../\"\n", + "#path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\"\n", + "path = \"/Users/lmoncla/git/gitlab.liris/GEODE/LGE/output/\"\n", + "\n", + "\n", "#filepath = \"Parallel_datatset_articles_230215.tsv\"\n", - "filepath = \"EDdA_dataset_articles.tsv\"" + "#filepath = \"EDdA_dataset_articles.tsv\"\n", + "filepath = \"LGE_dataset_articles_230314.tsv\"\n", + "\n", + "corpus = 'lge'\n", + "#corpus = ''" ] }, { @@ -244,135 +239,90 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>author</th>\n", - " <th>edda_class</th>\n", - " <th>enccre_id</th>\n", - " <th>enccre_class</th>\n", - " <th>content</th>\n", - " <th>content_without_designant</th>\n", - " <th>first_paragraph</th>\n", - " <th>nb_words</th>\n", - " <th>super_domain</th>\n", + " <th>uid</th>\n", + " <th>lge-volume</th>\n", + " <th>lge-numero</th>\n", + " <th>lge-head</th>\n", + " <th>lge-page</th>\n", + " <th>lge-id</th>\n", + " <th>lge-content</th>\n", + " <th>lge-nbWords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", + " <td>lge_1_a-0</td>\n", " <td>1</td>\n", " <td>1</td>\n", - " <td>Title Page</td>\n", - " <td>unsigned</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>151</td>\n", - " <td>Unclassified</td>\n", + " <td>A</td>\n", + " <td>0</td>\n", + " <td>a-0</td>\n", + " <td>A(Ling.). Son vocal et première lettre de notr...</td>\n", + " <td>1761.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", + " <td>lge_1_a-1</td>\n", " <td>1</td>\n", " <td>2</td>\n", - " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", - " <td>Diderot & d'Alembert</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>208</td>\n", - " <td>Unclassified</td>\n", + " <td>A</td>\n", + " <td>1</td>\n", + " <td>a-1</td>\n", + " <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n", + " <td>839.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", + " <td>lge_1_a-2</td>\n", " <td>1</td>\n", " <td>3</td>\n", - " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", - " <td>d'Alembert</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", - " <td>44669</td>\n", - " <td>Unclassified</td>\n", + " <td>A</td>\n", + " <td>4</td>\n", + " <td>a-2</td>\n", + " <td>A(Log.). Cette voyelle désigne les proposition...</td>\n", + " <td>56.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", + " <td>lge_1_a-3</td>\n", " <td>1</td>\n", - " <td>5</td>\n", - " <td>A, a & a</td>\n", - " <td>Dumarsais5</td>\n", - " <td>Grammaire</td>\n", - " <td>v1-1-0</td>\n", - " <td>Grammaire</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>711</td>\n", - " <td>Philosophie</td>\n", + " <td>4</td>\n", + " <td>A</td>\n", + " <td>4</td>\n", + " <td>a-3</td>\n", + " <td>A(Mus.). La lettre a est employée par les musi...</td>\n", + " <td>267.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", + " <td>lge_1_a-4</td>\n", " <td>1</td>\n", - " <td>6</td>\n", + " <td>5</td>\n", " <td>A</td>\n", - " <td>Dumarsais5</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-1</td>\n", - " <td>Grammaire</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>238</td>\n", - " <td>Unclassified</td>\n", + " <td>4</td>\n", + " <td>a-4</td>\n", + " <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n", + " <td>67.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume numero head author \\\n", - "0 1 1 Title Page unsigned \n", - "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", - "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", - "3 1 5 A, a & a Dumarsais5 \n", - "4 1 6 A Dumarsais5 \n", - "\n", - " edda_class enccre_id enccre_class \\\n", - "0 unclassified NaN NaN \n", - "1 unclassified NaN NaN \n", - "2 unclassified NaN NaN \n", - "3 Grammaire v1-1-0 Grammaire \n", - "4 unclassified v1-1-1 Grammaire \n", - "\n", - " content \\\n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + " uid lge-volume lge-numero lge-head lge-page lge-id \\\n", + "0 lge_1_a-0 1 1 A 0 a-0 \n", + "1 lge_1_a-1 1 2 A 1 a-1 \n", + "2 lge_1_a-2 1 3 A 4 a-2 \n", + "3 lge_1_a-3 1 4 A 4 a-3 \n", + "4 lge_1_a-4 1 5 A 4 a-4 \n", "\n", - " content_without_designant \\\n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... \n", - "\n", - " first_paragraph nb_words super_domain \n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 Unclassified \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 Unclassified \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 Unclassified \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... 238 Unclassified " + " lge-content lge-nbWords \n", + "0 A(Ling.). Son vocal et première lettre de notr... 1761.0 \n", + "1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0 \n", + "2 A(Log.). Cette voyelle désigne les proposition... 56.0 \n", + "3 A(Mus.). La lettre a est employée par les musi... 267.0 \n", + "4 A(Numis.). Dans la numismatique grecque, la le... 67.0 " ] }, "execution_count": 5, @@ -393,9 +343,7 @@ }, "outputs": [], "source": [ - "#corpus = 'LGE'\n", - "corpus = ''\n", - "data = df['content'+corpus].values\n" + "data = df[corpus+'-content'].values" ] }, { @@ -411,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": { "id": "0qDZ86qTr0zX" }, @@ -426,7 +374,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "id": "KEljGX0br0zX" }, @@ -532,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -607,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "id": "-O6NspVTr0zZ" }, @@ -616,7 +564,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Token indices sequence length is longer than the specified maximum sequence length for this model (75311 > 512). Running this sequence through the model will result in indexing errors\n" + "Token indices sequence length is longer than the specified maximum sequence length for this model (3408 > 512). Running this sequence through the model will result in indexing errors\n" ] } ], @@ -638,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": { "id": "CN8EZst-r0zZ" }, @@ -653,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": { "id": "_fzgS5USJeAF" }, @@ -664,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -672,14 +620,1025 @@ "id": "ISkijyclr0za", "outputId": "8120e858-9950-4380-f887-70ca47360c76" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[13,\n", + " 6,\n", + " 13,\n", + " 10,\n", + " 7,\n", + " 4,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 11,\n", + " 7,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 4,\n", + " 8,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 10,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 11,\n", + " 3,\n", + " 9,\n", + " 7,\n", + " 4,\n", + " 6,\n", + " 7,\n", + " 14,\n", + " 1,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 5,\n", + " 7,\n", + " 14,\n", + " 6,\n", + " 3,\n", + " 16,\n", + " 9,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 8,\n", + " 0,\n", + " 9,\n", + " 14,\n", + " 6,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 9,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 5,\n", + " 8,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 4,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 4,\n", + " 0,\n", + " 4,\n", + " 4,\n", + " 0,\n", + " 8,\n", + " 9,\n", + " 1,\n", + " 1,\n", + " 6,\n", + " 7,\n", + " 1,\n", + " 9,\n", + " 5,\n", + " 7,\n", + " 5,\n", + " 8,\n", + " 2,\n", + " 6,\n", + " 7,\n", + " 8,\n", + " 5,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 3,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 2,\n", + " 7,\n", + " 2,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 1,\n", + " 11,\n", + " 1,\n", + " 1,\n", + " 7,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 10,\n", + " 6,\n", + " 16,\n", + " 12,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 3,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 16,\n", + " 2,\n", + " 9,\n", + " 11,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 1,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 13,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 8,\n", + " 9,\n", + " 11,\n", + " 8,\n", + " 7,\n", + " 11,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 13,\n", + " 8,\n", + " 7,\n", + " 12,\n", + " 6,\n", + " 7,\n", + " 5,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 14,\n", + " 2,\n", + " 11,\n", + " 1,\n", + " 7,\n", + " 10,\n", + " 11,\n", + " 8,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 1,\n", + " 8,\n", + " 10,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 1,\n", + " 1,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 9,\n", + " 13,\n", + " 8,\n", + " 16,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 16,\n", + " 13,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 11,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 11,\n", + " 1,\n", + " 6,\n", + " 11,\n", + " 14,\n", + " 6,\n", + " 10,\n", + " 6,\n", + " 6,\n", + " 8,\n", + " 5,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 13,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 9,\n", + " 13,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 5,\n", + " 7,\n", + " 8,\n", + " 3,\n", + " 14,\n", + " 8,\n", + " 14,\n", + " 8,\n", + " 7,\n", + " 5,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 3,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 6,\n", + " 6,\n", + " 5,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 9,\n", + " 6,\n", + " 16,\n", + " 6,\n", + " 7,\n", + " 5,\n", + " 6,\n", + " 8,\n", + " 11,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 5,\n", + " 2,\n", + " 7,\n", + " 8,\n", + " 6,\n", + " 13,\n", + " 11,\n", + " 14,\n", + " 7,\n", + " 8,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 9,\n", + " 0,\n", + " 2,\n", + " 6,\n", + " 8,\n", + " 3,\n", + " 6,\n", + " 1,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 16,\n", + " 7,\n", + " 3,\n", + " 16,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 13,\n", + " 5,\n", + " 7,\n", + " 9,\n", + " 7,\n", + " 2,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 13,\n", + " 6,\n", + " 14,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 9,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 7,\n", + " 11,\n", + " 7,\n", + " 4,\n", + " 6,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 3,\n", + " 6,\n", + " 12,\n", + " 9,\n", + " 7,\n", + " 1,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 13,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 13,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 1,\n", + " 2,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 5,\n", + " 9,\n", + " 7,\n", + " 2,\n", + " 6,\n", + " 3,\n", + " 4,\n", + " 6,\n", + " 16,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 13,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 1,\n", + " 8,\n", + " 7,\n", + " 2,\n", + " 14,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 11,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 12,\n", + " 8,\n", + " 11,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 10,\n", + " 14,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 14,\n", + " 7,\n", + " 5,\n", + " 7,\n", + " 0,\n", + " 5,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 1,\n", + " 0,\n", + " 8,\n", + " 8,\n", + " 9,\n", + " 9,\n", + " 3,\n", + " 6,\n", + " 13,\n", + " 6,\n", + " 5,\n", + " 4,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 1,\n", + " 8,\n", + " 7,\n", + " 8,\n", + " 3,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 0,\n", + " 6,\n", + " 9,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 14,\n", + " 5,\n", + " 5,\n", + " 1,\n", + " 1,\n", + " 12,\n", + " 8,\n", + " 11,\n", + " 11,\n", + " 7,\n", + " 13,\n", + " 16,\n", + " 13,\n", + " 14,\n", + " 14,\n", + " 11,\n", + " 14,\n", + " 11,\n", + " 14,\n", + " 16,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 5,\n", + " 13,\n", + " 11,\n", + " 16,\n", + " 7,\n", + " 13,\n", + " 14,\n", + " 14,\n", + " 13,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 10,\n", + " 9,\n", + " 4,\n", + " 8,\n", + " 2,\n", + " 9,\n", + " 8,\n", + " 8,\n", + " 3,\n", + " 5,\n", + " 13,\n", + " 5,\n", + " 5,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 6,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 13,\n", + " 6,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 13,\n", + " 7,\n", + " 13,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 5,\n", + " 1,\n", + " 7,\n", + " 1,\n", + " 7,\n", + " 6,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 7,\n", + " 6,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 7,\n", + " 8,\n", + " 7,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 3,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 7,\n", + " 8,\n", + " 5,\n", + " 6,\n", + " 6,\n", + " 11,\n", + " 8,\n", + " 8,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 13,\n", + " 7,\n", + " 7,\n", + " 13,\n", + " 7,\n", + " 7,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 7,\n", + " 9,\n", + " 7,\n", + " 7,\n", + " 7,\n", + " 10,\n", + " 9,\n", + " 10,\n", + " 14,\n", + " 3,\n", + " 14,\n", + " 14,\n", + " 9,\n", + " 16,\n", + " 5,\n", + " 7,\n", + " 13,\n", + " 8,\n", + " 13,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 13,\n", + " 16,\n", + " 5,\n", + " 13,\n", + " 2,\n", + " 11,\n", + " 8,\n", + " 10,\n", + " 7,\n", + " 1,\n", + " 14,\n", + " 14,\n", + " 10,\n", + " 9,\n", + " 5,\n", + " 8,\n", + " 8,\n", + " 4,\n", + " 2,\n", + " 7,\n", + " 13,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 1,\n", + " 8,\n", + " 7,\n", + " 0,\n", + " 6,\n", + " 9,\n", + " 2,\n", + " 1,\n", + " 8,\n", + " 11,\n", + " 12,\n", + " 9,\n", + " 10,\n", + " 7,\n", + " 13,\n", + " 11,\n", + " 13,\n", + " 1,\n", + " 5,\n", + " 10,\n", + " 10,\n", + " 10,\n", + " 10,\n", + " 2,\n", + " 9,\n", + " 3,\n", + " 9,\n", + " 6,\n", + " 1,\n", + " 13,\n", + " 11,\n", + " 11,\n", + " 11,\n", + " 1,\n", + " 1,\n", + " 13,\n", + " 3,\n", + " 1,\n", + " 9,\n", + " 6,\n", + " 12,\n", + " 7,\n", + " 3,\n", + " 8,\n", + " 12,\n", + " 12,\n", + " 12,\n", + " 12,\n", + " 8,\n", + " 0,\n", + " 3,\n", + " 7,\n", + " 7,\n", + " 3,\n", + " 9,\n", + " 9,\n", + " 9,\n", + " 14,\n", + " 14,\n", + " 8,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 5,\n", + " 5,\n", + " 13,\n", + " 5,\n", + " 5,\n", + " 5,\n", + " 16,\n", + " 14,\n", + " 11,\n", + " 8,\n", + " 9,\n", + " 11,\n", + " 11,\n", + " 11,\n", + " 8,\n", + " 11,\n", + " 11,\n", + " 11,\n", + " 11,\n", + " 11,\n", + " 8,\n", + " 8,\n", + " 12,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 11,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 8,\n", + " 8,\n", + " 8,\n", + " 6,\n", + " 7,\n", + " 13,\n", + " ...]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pred" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": { "id": "fo6k4li1r0za" }, @@ -704,7 +1663,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": { "id": "UU7qg7zVr0zb" }, @@ -715,18 +1674,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "metadata": { "id": "w4eHpBztr0zb" }, "outputs": [], "source": [ - "df['superdomainBert'+corpus] = p2" + "df[corpus+'-superdomainBert'] = p2" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -734,14 +1693,27 @@ "id": "KsJQMhCBxpSF", "outputId": "2ffa7475-e6de-4c42-a413-22c0d4b2d45f" }, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'numero'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/qm/v_b1md29221_cnpcxf5qc43c0000gn/T/ipykernel_3552/1621721301.py\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumero\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2835\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'-content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5900\u001b[0m ):\n\u001b[1;32m 5901\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5902\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5904\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'numero'" + ] + } + ], "source": [ - "df[df.numero == 2835]['content'+corpus].values" + "df[df.numero == 2835][corpus+'-content'].values" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -772,261 +1744,181 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>author</th>\n", - " <th>edda_class</th>\n", - " <th>enccre_id</th>\n", - " <th>enccre_class</th>\n", - " <th>content</th>\n", - " <th>content_without_designant</th>\n", - " <th>first_paragraph</th>\n", - " <th>nb_words</th>\n", - " <th>super_domain</th>\n", - " <th>superdomainBert</th>\n", + " <th>uid</th>\n", + " <th>lge-volume</th>\n", + " <th>lge-numero</th>\n", + " <th>lge-head</th>\n", + " <th>lge-page</th>\n", + " <th>lge-id</th>\n", + " <th>lge-content</th>\n", + " <th>lge-nbWords</th>\n", + " <th>lge-superdomainBert</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", + " <td>lge_1_a-0</td>\n", " <td>1</td>\n", " <td>1</td>\n", - " <td>Title Page</td>\n", - " <td>unsigned</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", - " <td>151</td>\n", - " <td>Unclassified</td>\n", + " <td>A</td>\n", + " <td>0</td>\n", + " <td>a-0</td>\n", + " <td>A(Ling.). Son vocal et première lettre de notr...</td>\n", + " <td>1761.0</td>\n", " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", + " <td>lge_1_a-1</td>\n", " <td>1</td>\n", " <td>2</td>\n", - " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", - " <td>Diderot & d'Alembert</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", - " <td>208</td>\n", - " <td>Unclassified</td>\n", - " <td>Philosophie</td>\n", + " <td>A</td>\n", + " <td>1</td>\n", + " <td>a-1</td>\n", + " <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n", + " <td>839.0</td>\n", + " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", + " <td>lge_1_a-2</td>\n", " <td>1</td>\n", " <td>3</td>\n", - " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", - " <td>d'Alembert</td>\n", - " <td>unclassified</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", - " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", - " <td>44669</td>\n", - " <td>Unclassified</td>\n", - " <td>Belles-lettres</td>\n", + " <td>A</td>\n", + " <td>4</td>\n", + " <td>a-2</td>\n", + " <td>A(Log.). Cette voyelle désigne les proposition...</td>\n", + " <td>56.0</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", + " <td>lge_1_a-3</td>\n", " <td>1</td>\n", - " <td>5</td>\n", - " <td>A, a & a</td>\n", - " <td>Dumarsais5</td>\n", - " <td>Grammaire</td>\n", - " <td>v1-1-0</td>\n", - " <td>Grammaire</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", - " <td>711</td>\n", - " <td>Philosophie</td>\n", - " <td>Philosophie</td>\n", + " <td>4</td>\n", + " <td>A</td>\n", + " <td>4</td>\n", + " <td>a-3</td>\n", + " <td>A(Mus.). La lettre a est employée par les musi...</td>\n", + " <td>267.0</td>\n", + " <td>Musique</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", + " <td>lge_1_a-4</td>\n", " <td>1</td>\n", - " <td>6</td>\n", + " <td>5</td>\n", " <td>A</td>\n", - " <td>Dumarsais5</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-1</td>\n", - " <td>Grammaire</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", - " <td>238</td>\n", - " <td>Unclassified</td>\n", - " <td>Philosophie</td>\n", + " <td>4</td>\n", + " <td>a-4</td>\n", + " <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n", + " <td>67.0</td>\n", + " <td>Histoire</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", + " <td>lge_1_aa-0</td>\n", " <td>1</td>\n", - " <td>7</td>\n", - " <td>A</td>\n", - " <td>Dumarsais</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-2</td>\n", - " <td>Grammaire</td>\n", - " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", - " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", - " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", - " <td>1980</td>\n", - " <td>Unclassified</td>\n", - " <td>Philosophie</td>\n", + " <td>6</td>\n", + " <td>AA</td>\n", + " <td>4</td>\n", + " <td>aa-0</td>\n", + " <td>AA. Ces deux lettres désignent l’atelier monét...</td>\n", + " <td>14.0</td>\n", + " <td>Commerce</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", + " <td>lge_1_aa-1</td>\n", " <td>1</td>\n", - " <td>8</td>\n", - " <td>A</td>\n", - " <td>Mallet</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-3</td>\n", - " <td>NaN</td>\n", - " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", - " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", - " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", - " <td>200</td>\n", - " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", + " <td>7</td>\n", + " <td>AA</td>\n", + " <td>4</td>\n", + " <td>aa-1</td>\n", + " <td>AA. Nom de plusieurs cours d’eau de l’Europe o...</td>\n", + " <td>75.0</td>\n", + " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", + " <td>lge_1_aa-2</td>\n", " <td>1</td>\n", - " <td>9</td>\n", - " <td>A, lettre symbolique</td>\n", - " <td>Mallet</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-4</td>\n", - " <td>NaN</td>\n", - " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", - " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", - " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", - " <td>82</td>\n", - " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", + " <td>8</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-2</td>\n", + " <td>AA. Rivière de France, prend sa source aux Tro...</td>\n", + " <td>165.0</td>\n", + " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", + " <td>lge_1_aa-3</td>\n", " <td>1</td>\n", - " <td>10</td>\n", - " <td>A, numismatique ou monétaire</td>\n", - " <td>Mallet</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-5</td>\n", - " <td>Médailles</td>\n", - " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", - " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", - " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", - " <td>112</td>\n", - " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", + " <td>9</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-3</td>\n", + " <td>AA. Rivière de Hollande, affluent de la Dommel...</td>\n", + " <td>17.0</td>\n", + " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", + " <td>lge_1_aa-4</td>\n", " <td>1</td>\n", - " <td>11</td>\n", - " <td>A, lapidaire</td>\n", - " <td>Mallet</td>\n", - " <td>unclassified</td>\n", - " <td>v1-1-6</td>\n", - " <td>Histoire</td>\n", - " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", - " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", - " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", - " <td>80</td>\n", - " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", + " <td>10</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-4</td>\n", + " <td>AA. Nom de deux fleuves de la Russie. Le premi...</td>\n", + " <td>71.0</td>\n", + " <td>Géographie</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume numero head author \\\n", - "0 1 1 Title Page unsigned \n", - "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", - "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", - "3 1 5 A, a & a Dumarsais5 \n", - "4 1 6 A Dumarsais5 \n", - "5 1 7 A Dumarsais \n", - "6 1 8 A Mallet \n", - "7 1 9 A, lettre symbolique Mallet \n", - "8 1 10 A, numismatique ou monétaire Mallet \n", - "9 1 11 A, lapidaire Mallet \n", - "\n", - " edda_class enccre_id enccre_class \\\n", - "0 unclassified NaN NaN \n", - "1 unclassified NaN NaN \n", - "2 unclassified NaN NaN \n", - "3 Grammaire v1-1-0 Grammaire \n", - "4 unclassified v1-1-1 Grammaire \n", - "5 unclassified v1-1-2 Grammaire \n", - "6 unclassified v1-1-3 NaN \n", - "7 unclassified v1-1-4 NaN \n", - "8 unclassified v1-1-5 Médailles \n", - "9 unclassified v1-1-6 Histoire \n", - "\n", - " content \\\n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... \n", - "5 \\nA, préposition vient du latin à , à dextris, ... \n", - "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", - "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", - "8 \\nA, numismatique ou monétaire, sur le revers ... \n", - "9 \\nA, lapidaire, dans les anciennes inscription... \n", + " uid lge-volume lge-numero lge-head lge-page lge-id \\\n", + "0 lge_1_a-0 1 1 A 0 a-0 \n", + "1 lge_1_a-1 1 2 A 1 a-1 \n", + "2 lge_1_a-2 1 3 A 4 a-2 \n", + "3 lge_1_a-3 1 4 A 4 a-3 \n", + "4 lge_1_a-4 1 5 A 4 a-4 \n", + "5 lge_1_aa-0 1 6 AA 4 aa-0 \n", + "6 lge_1_aa-1 1 7 AA 4 aa-1 \n", + "7 lge_1_aa-2 1 8 AA 5 aa-2 \n", + "8 lge_1_aa-3 1 9 AA 5 aa-3 \n", + "9 lge_1_aa-4 1 10 AA 5 aa-4 \n", "\n", - " content_without_designant \\\n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... \n", - "5 \\nA, préposition vient du latin à , à dextris, ... \n", - "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", - "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", - "8 \\nA, numismatique ou monétaire, sur le revers ... \n", - "9 \\nA, lapidaire, dans les anciennes inscription... \n", + " lge-content lge-nbWords \\\n", + "0 A(Ling.). Son vocal et première lettre de notr... 1761.0 \n", + "1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0 \n", + "2 A(Log.). Cette voyelle désigne les proposition... 56.0 \n", + "3 A(Mus.). La lettre a est employée par les musi... 267.0 \n", + "4 A(Numis.). Dans la numismatique grecque, la le... 67.0 \n", + "5 AA. Ces deux lettres désignent l’atelier monét... 14.0 \n", + "6 AA. Nom de plusieurs cours d’eau de l’Europe o... 75.0 \n", + "7 AA. Rivière de France, prend sa source aux Tro... 165.0 \n", + "8 AA. Rivière de Hollande, affluent de la Dommel... 17.0 \n", + "9 AA. Nom de deux fleuves de la Russie. Le premi... 71.0 \n", "\n", - " first_paragraph nb_words super_domain \\\n", - "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 Unclassified \n", - "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 Unclassified \n", - "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 Unclassified \n", - "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", - "4 \\nA, mot, est 1. la troisieme personne du prés... 238 Unclassified \n", - "5 \\nA, préposition vient du latin à , à dextris, ... 1980 Unclassified \n", - "6 \\nA, étoit une lettre numérale parmi les Ancie... 200 Unclassified \n", - "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... 82 Unclassified \n", - "8 \\nA, numismatique ou monétaire, sur le revers ... 112 Unclassified \n", - "9 \\nA, lapidaire, dans les anciennes inscription... 80 Unclassified \n", - "\n", - " superdomainBert \n", - "0 Philosophie \n", - "1 Philosophie \n", - "2 Belles-lettres \n", - "3 Philosophie \n", - "4 Philosophie \n", - "5 Philosophie \n", - "6 Histoire \n", - "7 Histoire \n", - "8 Histoire \n", - "9 Histoire " + " lge-superdomainBert \n", + "0 Philosophie \n", + "1 Géographie \n", + "2 Philosophie \n", + "3 Musique \n", + "4 Histoire \n", + "5 Commerce \n", + "6 Géographie \n", + "7 Géographie \n", + "8 Géographie \n", + "9 Géographie " ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1037,40 +1929,30 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": { "id": "J9rObbvVr0zc" }, "outputs": [], "source": [ - "df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")" + "#df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")\n", + "df.to_csv(drive_path + \"predictions/LGE_dataset_articles_superdomainBERT_230314.tsv\", sep=\"\\t\", index=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "8cX6XBq8_F5T" }, "outputs": [], "source": [ - "df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7fx6BPpg0iNc" - }, - "outputs": [], - "source": [ - "df.to_csv(drive_path + \"predictions/metadata_parallel_predictions_superdomain.csv\", sep=\",\", index=False)" + "#df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "id": "7TD1mbKj_fXH" }, @@ -1096,100 +1978,76 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>author</th>\n", - " <th>edda_class</th>\n", - " <th>enccre_id</th>\n", - " <th>enccre_class</th>\n", - " <th>content</th>\n", - " <th>content_without_designant</th>\n", - " <th>first_paragraph</th>\n", - " <th>nb_words</th>\n", - " <th>super_domain</th>\n", - " <th>superdomainBert</th>\n", + " <th>uid</th>\n", + " <th>lge-volume</th>\n", + " <th>lge-numero</th>\n", + " <th>lge-head</th>\n", + " <th>lge-page</th>\n", + " <th>lge-id</th>\n", + " <th>lge-content</th>\n", + " <th>lge-nbWords</th>\n", + " <th>lge-superdomainBert</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>24</th>\n", + " <th>1</th>\n", + " <td>lge_1_a-1</td>\n", " <td>1</td>\n", - " <td>26</td>\n", + " <td>2</td>\n", " <td>A</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>v1-9-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n", - " <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n", - " <td>\\n* A, s. petite riviere de France, qui a sa s...</td>\n", - " <td>15</td>\n", - " <td>Unclassified</td>\n", + " <td>1</td>\n", + " <td>a-1</td>\n", + " <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n", + " <td>839.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>25</th>\n", + " <th>6</th>\n", + " <td>lge_1_aa-1</td>\n", " <td>1</td>\n", - " <td>27</td>\n", + " <td>7</td>\n", " <td>AA</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>v1-10-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n", - " <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n", - " <td>\\n* AA, s. f. riviere de France, qui prend sa ...</td>\n", - " <td>46</td>\n", - " <td>Unclassified</td>\n", + " <td>4</td>\n", + " <td>aa-1</td>\n", + " <td>AA. Nom de plusieurs cours d’eau de l’Europe o...</td>\n", + " <td>75.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>27</th>\n", + " <th>7</th>\n", + " <td>lge_1_aa-2</td>\n", " <td>1</td>\n", - " <td>29</td>\n", - " <td>AACH ou ACH</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>v1-12-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n", - " <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n", - " <td>\\n* AACH ou ACH, s. f. petite ville d'Allemagn...</td>\n", - " <td>24</td>\n", - " <td>Unclassified</td>\n", + " <td>8</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-2</td>\n", + " <td>AA. Rivière de France, prend sa source aux Tro...</td>\n", + " <td>165.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>28</th>\n", + " <th>8</th>\n", + " <td>lge_1_aa-3</td>\n", " <td>1</td>\n", - " <td>30</td>\n", - " <td>AAHUS</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>v1-13-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n", - " <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n", - " <td>\\n* AAHUS, s. petite ville d'Allemagne dans le...</td>\n", - " <td>21</td>\n", - " <td>Unclassified</td>\n", + " <td>9</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-3</td>\n", + " <td>AA. Rivière de Hollande, affluent de la Dommel...</td>\n", + " <td>17.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>30</th>\n", + " <th>9</th>\n", + " <td>lge_1_aa-4</td>\n", " <td>1</td>\n", - " <td>32</td>\n", - " <td>AAR</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>v1-15-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n", - " <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n", - " <td>\\n* AAR, s. grande riviere qui a sa source pro...</td>\n", - " <td>30</td>\n", - " <td>Unclassified</td>\n", + " <td>10</td>\n", + " <td>AA</td>\n", + " <td>5</td>\n", + " <td>aa-4</td>\n", + " <td>AA. Nom de deux fleuves de la Russie. Le premi...</td>\n", + " <td>71.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", @@ -1203,176 +2061,133 @@ " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", " </tr>\n", " <tr>\n", - " <th>74051</th>\n", - " <td>17</td>\n", - " <td>3070</td>\n", - " <td>ZYGRIS</td>\n", - " <td>Jaucourt</td>\n", - " <td>Géographie ancienne</td>\n", - " <td>v17-2068-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\...</td>\n", - " <td>\\nZYGRIS, ville du nôme de Lybie\\nsur la côte...</td>\n", - " <td>\\nZYGRIS, ville du nôme de Lybie\\nsur la côte...</td>\n", - " <td>38</td>\n", - " <td>Géographie</td>\n", + " <th>134800</th>\n", + " <td>lge_31_zvornix-0</td>\n", + " <td>31</td>\n", + " <td>7757</td>\n", + " <td>ZVORNIX</td>\n", + " <td>1370</td>\n", + " <td>zvornix-0</td>\n", + " <td>ZVORNIX. Ville de Bosnie, sur la r. g. de la D...</td>\n", + " <td>27.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>74054</th>\n", - " <td>17</td>\n", - " <td>3073</td>\n", - " <td>ZYRAS</td>\n", - " <td>Jaucourt</td>\n", - " <td>Géographie ancienne</td>\n", - " <td>v17-2071-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,...</td>\n", - " <td>\\nZYRAS, fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n", - " <td>\\nZYRAS, fleuve de Thrace. Pline,\\nliv. IV. c...</td>\n", - " <td>28</td>\n", - " <td>Géographie</td>\n", + " <th>134801</th>\n", + " <td>lge_31_zweibrücken-0</td>\n", + " <td>31</td>\n", + " <td>7758</td>\n", + " <td>ZWEIBRÜCKEN</td>\n", + " <td>1370</td>\n", + " <td>zweibrücken-0</td>\n", + " <td>ZWEIBRÜCKEN. Ville de Bavière (V. Deux-Ponts).\\n</td>\n", + " <td>6.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>74055</th>\n", - " <td>17</td>\n", - " <td>3074</td>\n", - " <td>ZZUÉNÉ ou ZZEUENE</td>\n", - " <td>Jaucourt</td>\n", - " <td>Géographie ancienne</td>\n", - " <td>v17-2072-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située...</td>\n", - " <td>\\nZZUÉNÉ ou ZZEUENE, ville située\\nsur la riv...</td>\n", - " <td>\\nZZUÉNÉ ou ZZEUENE, ville située\\nsur la riv...</td>\n", - " <td>149</td>\n", - " <td>Géographie</td>\n", + " <th>134803</th>\n", + " <td>lge_31_zwickau-0</td>\n", + " <td>31</td>\n", + " <td>7760</td>\n", + " <td>ZWICKAU</td>\n", + " <td>1370</td>\n", + " <td>zwickau-0</td>\n", + " <td>ZWICKAU. Ville de Saxe, ch.-l. d’un cercle, su...</td>\n", + " <td>92.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>74080</th>\n", - " <td>17</td>\n", - " <td>3099</td>\n", - " <td>CABOTAGE</td>\n", - " <td>Jaucourt</td>\n", - " <td>Navigation</td>\n", - " <td>v17-2097-0</td>\n", - " <td>Marine</td>\n", - " <td>\\nCABOTAGE, s. m. (Navigation.) le cabotage es...</td>\n", - " <td>\\nCABOTAGE, s. m. le cabotage est\\nune naviga...</td>\n", - " <td>\\nCABOTAGE, s. m. le cabotage est\\nune naviga...</td>\n", - " <td>192</td>\n", - " <td>Géographie</td>\n", + " <th>134806</th>\n", + " <td>lge_31_zwolle-0</td>\n", + " <td>31</td>\n", + " <td>7763</td>\n", + " <td>ZWOLLE</td>\n", + " <td>1371</td>\n", + " <td>zwolle-0</td>\n", + " <td>ZWOLLE. Ville des Pays-Bas, ch.-l. de la prov....</td>\n", + " <td>115.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", - " <th>74165</th>\n", - " <td>17</td>\n", - " <td>3184</td>\n", - " <td>GUAYAQUIL</td>\n", - " <td>La Condamine</td>\n", - " <td>Géographie</td>\n", - " <td>v17-2177-0</td>\n", - " <td>Géographie</td>\n", - " <td>\\nGUAYAQUIL, (Géograph.) nom d'une ville &\\nd'...</td>\n", - " <td>\\nGUAYAQUIL, nom d'une ville &\\nd'une grande ...</td>\n", - " <td>\\nGUAYAQUIL, nom d'une ville &\\nd'une grande ...</td>\n", - " <td>446</td>\n", - " <td>Géographie</td>\n", + " <th>134819</th>\n", + " <td>lge_31_zyrmi-0</td>\n", + " <td>31</td>\n", + " <td>7776</td>\n", + " <td>ZYRMI</td>\n", + " <td>1372</td>\n", + " <td>zyrmi-0</td>\n", + " <td>ZYRMI. Ville du Soudan. Ancienne capitale du p...</td>\n", + " <td>16.0</td>\n", " <td>Géographie</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", - "<p>15383 rows × 13 columns</p>\n", + "<p>50917 rows × 9 columns</p>\n", "</div>" ], "text/plain": [ - " volume numero head author edda_class \\\n", - "24 1 26 A Diderot unclassified \n", - "25 1 27 AA Diderot unclassified \n", - "27 1 29 AACH ou ACH Diderot unclassified \n", - "28 1 30 AAHUS Diderot unclassified \n", - "30 1 32 AAR Diderot unclassified \n", - "... ... ... ... ... ... \n", - "74051 17 3070 ZYGRIS Jaucourt Géographie ancienne \n", - "74054 17 3073 ZYRAS Jaucourt Géographie ancienne \n", - "74055 17 3074 ZZUÉNÉ ou ZZEUENE Jaucourt Géographie ancienne \n", - "74080 17 3099 CABOTAGE Jaucourt Navigation \n", - "74165 17 3184 GUAYAQUIL La Condamine Géographie \n", - "\n", - " enccre_id enccre_class \\\n", - "24 v1-9-0 Géographie \n", - "25 v1-10-0 Géographie \n", - "27 v1-12-0 Géographie \n", - "28 v1-13-0 Géographie \n", - "30 v1-15-0 Géographie \n", - "... ... ... \n", - "74051 v17-2068-0 Géographie \n", - "74054 v17-2071-0 Géographie \n", - "74055 v17-2072-0 Géographie \n", - "74080 v17-2097-0 Marine \n", - "74165 v17-2177-0 Géographie \n", - "\n", - " content \\\n", - "24 \\n* A, s. petite riviere de France, qui a sa s... \n", - "25 \\n* AA, s. f. riviere de France, qui prend sa ... \n", - "27 \\n* AACH ou ACH, s. f. petite ville d'Allemagn... \n", - "28 \\n* AAHUS, s. petite ville d'Allemagne dans le... \n", - "30 \\n* AAR, s. grande riviere qui a sa source pro... \n", - "... ... \n", - "74051 \\nZYGRIS, (Géog. anc.) ville du nôme de Lybie\\... \n", - "74054 \\nZYRAS, (Géog. anc.) fleuve de Thrace. Pline,... \n", - "74055 \\nZZUÉNÉ ou ZZEUENE, (Géog. anc.) ville située... \n", - "74080 \\nCABOTAGE, s. m. (Navigation.) le cabotage es... \n", - "74165 \\nGUAYAQUIL, (Géograph.) nom d'une ville &\\nd'... \n", - "\n", - " content_without_designant \\\n", - "24 \\n* A, s. petite riviere de France, qui a sa s... \n", - "25 \\n* AA, s. f. riviere de France, qui prend sa ... \n", - "27 \\n* AACH ou ACH, s. f. petite ville d'Allemagn... \n", - "28 \\n* AAHUS, s. petite ville d'Allemagne dans le... \n", - "30 \\n* AAR, s. grande riviere qui a sa source pro... \n", - "... ... \n", - "74051 \\nZYGRIS, ville du nôme de Lybie\\nsur la côte... \n", - "74054 \\nZYRAS, fleuve de Thrace. Pline,\\nliv. IV. c... \n", - "74055 \\nZZUÉNÉ ou ZZEUENE, ville située\\nsur la riv... \n", - "74080 \\nCABOTAGE, s. m. le cabotage est\\nune naviga... \n", - "74165 \\nGUAYAQUIL, nom d'une ville &\\nd'une grande ... \n", + " uid lge-volume lge-numero lge-head lge-page \\\n", + "1 lge_1_a-1 1 2 A 1 \n", + "6 lge_1_aa-1 1 7 AA 4 \n", + "7 lge_1_aa-2 1 8 AA 5 \n", + "8 lge_1_aa-3 1 9 AA 5 \n", + "9 lge_1_aa-4 1 10 AA 5 \n", + "... ... ... ... ... ... \n", + "134800 lge_31_zvornix-0 31 7757 ZVORNIX 1370 \n", + "134801 lge_31_zweibrücken-0 31 7758 ZWEIBRÜCKEN 1370 \n", + "134803 lge_31_zwickau-0 31 7760 ZWICKAU 1370 \n", + "134806 lge_31_zwolle-0 31 7763 ZWOLLE 1371 \n", + "134819 lge_31_zyrmi-0 31 7776 ZYRMI 1372 \n", "\n", - " first_paragraph nb_words \\\n", - "24 \\n* A, s. petite riviere de France, qui a sa s... 15 \n", - "25 \\n* AA, s. f. riviere de France, qui prend sa ... 46 \n", - "27 \\n* AACH ou ACH, s. f. petite ville d'Allemagn... 24 \n", - "28 \\n* AAHUS, s. petite ville d'Allemagne dans le... 21 \n", - "30 \\n* AAR, s. grande riviere qui a sa source pro... 30 \n", - "... ... ... \n", - "74051 \\nZYGRIS, ville du nôme de Lybie\\nsur la côte... 38 \n", - "74054 \\nZYRAS, fleuve de Thrace. Pline,\\nliv. IV. c... 28 \n", - "74055 \\nZZUÉNÉ ou ZZEUENE, ville située\\nsur la riv... 149 \n", - "74080 \\nCABOTAGE, s. m. le cabotage est\\nune naviga... 192 \n", - "74165 \\nGUAYAQUIL, nom d'une ville &\\nd'une grande ... 446 \n", + " lge-id lge-content \\\n", + "1 a-1 A(Paléogr.). C’est à l’alphabet phénicien, on ... \n", + "6 aa-1 AA. Nom de plusieurs cours d’eau de l’Europe o... \n", + "7 aa-2 AA. Rivière de France, prend sa source aux Tro... \n", + "8 aa-3 AA. Rivière de Hollande, affluent de la Dommel... \n", + "9 aa-4 AA. Nom de deux fleuves de la Russie. Le premi... \n", + "... ... ... \n", + "134800 zvornix-0 ZVORNIX. Ville de Bosnie, sur la r. g. de la D... \n", + "134801 zweibrücken-0 ZWEIBRÜCKEN. Ville de Bavière (V. Deux-Ponts).\\n \n", + "134803 zwickau-0 ZWICKAU. Ville de Saxe, ch.-l. d’un cercle, su... \n", + "134806 zwolle-0 ZWOLLE. Ville des Pays-Bas, ch.-l. de la prov.... \n", + "134819 zyrmi-0 ZYRMI. Ville du Soudan. Ancienne capitale du p... \n", "\n", - " super_domain superdomainBert \n", - "24 Unclassified Géographie \n", - "25 Unclassified Géographie \n", - "27 Unclassified Géographie \n", - "28 Unclassified Géographie \n", - "30 Unclassified Géographie \n", - "... ... ... \n", - "74051 Géographie Géographie \n", - "74054 Géographie Géographie \n", - "74055 Géographie Géographie \n", - "74080 Géographie Géographie \n", - "74165 Géographie Géographie \n", + " lge-nbWords lge-superdomainBert \n", + "1 839.0 Géographie \n", + "6 75.0 Géographie \n", + "7 165.0 Géographie \n", + "8 17.0 Géographie \n", + "9 71.0 Géographie \n", + "... ... ... \n", + "134800 27.0 Géographie \n", + "134801 6.0 Géographie \n", + "134803 92.0 Géographie \n", + "134806 115.0 Géographie \n", + "134819 16.0 Géographie \n", "\n", - "[15383 rows x 13 columns]" + "[50917 rows x 9 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[(df[corpus+'-superdomainBert'] == 'Géographie')]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(134820, 9)" ] }, "execution_count": 22, @@ -1381,7 +2196,7 @@ } ], "source": [ - "df.loc[(df['superdomainBert'] == 'Géographie')]" + "df.shape" ] }, { -- GitLab