diff --git a/notebooks/Predict.ipynb b/notebooks/Predict.ipynb index 1ea2070ad60244e6583b5d8c2da8849c4d135d62..a3dcfe00df981474d9c6b31e59b89d9f9e92f3bf 100644 --- a/notebooks/Predict.ipynb +++ b/notebooks/Predict.ipynb @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 1, "metadata": { "id": "SkErnwgMMbRj" }, @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 2, "metadata": { "id": "M2awiee1r0zV" }, @@ -199,12 +199,12 @@ "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n", "drive_path = \"../\"\n", "#path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\"\n", - "path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/\"\n", - "\n", + "path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/\"\n", + "#path = \"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/\"\n", "\n", "#filepath = \"Parallel_datatset_articles_230215.tsv\"\n", - "#filepath = \"EDdA_dataset_articles.tsv\"\n", - "filepath = 'LGE_dataset_articles_230314.tsv'\n", + "filepath = \"EDdA_dataset_articles_221208.tsv\"\n", + "#filepath = 'LGE_dataset_articles_230314.tsv'\n", "\n", "corpus = 'lge'\n", "#corpus = ''" @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -243,93 +243,138 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>uid</th>\n", - " <th>lge-volume</th>\n", - " <th>lge-numero</th>\n", - " <th>lge-head</th>\n", - " <th>lge-page</th>\n", - " <th>lge-id</th>\n", - " <th>lge-content</th>\n", - " <th>lge-nbWords</th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " <th>super_domain</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>lge_1_a-0</td>\n", " <td>1</td>\n", " <td>1</td>\n", - " <td>A</td>\n", - " <td>0</td>\n", - " <td>a-0</td>\n", - " <td>A(Ling.). Son vocal et première lettre de notr...</td>\n", - " <td>1761.0</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>151</td>\n", + " <td>unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>lge_1_a-1</td>\n", " <td>1</td>\n", " <td>2</td>\n", - " <td>A</td>\n", - " <td>1</td>\n", - " <td>a-1</td>\n", - " <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n", - " <td>839.0</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>208</td>\n", + " <td>unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>lge_1_a-2</td>\n", " <td>1</td>\n", " <td>3</td>\n", - " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-2</td>\n", - " <td>A(Log.). Cette voyelle désigne les proposition...</td>\n", - " <td>56.0</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", + " <td>44669</td>\n", + " <td>unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>lge_1_a-3</td>\n", " <td>1</td>\n", - " <td>4</td>\n", - " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-3</td>\n", - " <td>A(Mus.). La lettre a est employée par les musi...</td>\n", - " <td>267.0</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>711</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>lge_1_a-4</td>\n", " <td>1</td>\n", - " <td>5</td>\n", + " <td>6</td>\n", " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-4</td>\n", - " <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n", - " <td>67.0</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>238</td>\n", + " <td>unclassified</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " uid lge-volume lge-numero lge-head lge-page lge-id \\\n", - "0 lge_1_a-0 1 1 A 0 a-0 \n", - "1 lge_1_a-1 1 2 A 1 a-1 \n", - "2 lge_1_a-2 1 3 A 4 a-2 \n", - "3 lge_1_a-3 1 4 A 4 a-3 \n", - "4 lge_1_a-4 1 5 A 4 a-4 \n", + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", + "\n", + " edda_class enccre_id enccre_class \\\n", + "0 unclassified NaN NaN \n", + "1 unclassified NaN NaN \n", + "2 unclassified NaN NaN \n", + "3 Grammaire v1-1-0 Grammaire \n", + "4 unclassified v1-1-1 Grammaire \n", + "\n", + " content \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", "\n", - " lge-content lge-nbWords \n", - "0 A(Ling.). Son vocal et première lettre de notr... 1761.0 \n", - "1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0 \n", - "2 A(Log.). Cette voyelle désigne les proposition... 56.0 \n", - "3 A(Mus.). La lettre a est employée par les musi... 267.0 \n", - "4 A(Numis.). Dans la numismatique grecque, la le... 67.0 " + " content_without_designant \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "\n", + " first_paragraph nb_words super_domain \n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 unclassified \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 unclassified \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 unclassified \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... 238 unclassified " ] }, - "execution_count": 43, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -341,13 +386,14 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 4, "metadata": { "id": "Ndw4UtgWt_MJ" }, "outputs": [], "source": [ - "dataset = df[corpus+'-content'].values" + "dataset = df['content'].values\n", + "#dataset = df[corpus+'-content'].values" ] }, { @@ -363,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 5, "metadata": { "id": "0qDZ86qTr0zX" }, @@ -378,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "KEljGX0br0zX" }, @@ -488,7 +534,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -575,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 10, "metadata": { "id": "CN8EZst-r0zZ" }, @@ -608,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -635,14 +681,14 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "134820it [1:07:31, 33.27it/s]\n" + "74190it [41:03, 30.12it/s]\n" ] } ], @@ -659,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 13, "metadata": { "id": "fo6k4li1r0za" }, @@ -685,7 +731,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 14, "metadata": { "id": "UU7qg7zVr0zb" }, @@ -698,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -708,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 16, "metadata": { "id": "w4eHpBztr0zb" }, @@ -724,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -755,14 +801,18 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>uid</th>\n", - " <th>lge-volume</th>\n", - " <th>lge-numero</th>\n", - " <th>lge-head</th>\n", - " <th>lge-page</th>\n", - " <th>lge-id</th>\n", - " <th>lge-content</th>\n", - " <th>lge-nbWords</th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " <th>super_domain</th>\n", " <th>lge-superdomainPred1</th>\n", " <th>lge-superdomainProba1</th>\n", " <th>lge-superdomainPred2</th>\n", @@ -774,229 +824,305 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>lge_1_a-0</td>\n", " <td>1</td>\n", " <td>1</td>\n", - " <td>A</td>\n", - " <td>0</td>\n", - " <td>a-0</td>\n", - " <td>A(Ling.). Son vocal et première lettre de notr...</td>\n", - " <td>1761.0</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>151</td>\n", + " <td>unclassified</td>\n", " <td>Philosophie</td>\n", - " <td>0.937586</td>\n", + " <td>0.986489</td>\n", " <td>Belles-lettres</td>\n", - " <td>0.021192</td>\n", - " <td>Histoire</td>\n", - " <td>0.012657</td>\n", + " <td>0.002821</td>\n", + " <td>Politique</td>\n", + " <td>0.001780</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>lge_1_a-1</td>\n", " <td>1</td>\n", " <td>2</td>\n", - " <td>A</td>\n", - " <td>1</td>\n", - " <td>a-1</td>\n", - " <td>A(Paléogr.). C’est à l’alphabet phénicien, on ...</td>\n", - " <td>839.0</td>\n", - " <td>Géographie</td>\n", - " <td>0.992606</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>208</td>\n", + " <td>unclassified</td>\n", + " <td>Philosophie</td>\n", + " <td>0.943809</td>\n", " <td>Histoire</td>\n", - " <td>0.002934</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>0.001019</td>\n", + " <td>0.014932</td>\n", + " <td>Politique</td>\n", + " <td>0.014871</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>lge_1_a-2</td>\n", " <td>1</td>\n", " <td>3</td>\n", - " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-2</td>\n", - " <td>A(Log.). Cette voyelle désigne les proposition...</td>\n", - " <td>56.0</td>\n", - " <td>Philosophie</td>\n", - " <td>0.982367</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", + " <td>44669</td>\n", + " <td>unclassified</td>\n", " <td>Belles-lettres</td>\n", - " <td>0.004124</td>\n", + " <td>0.926219</td>\n", + " <td>Histoire</td>\n", + " <td>0.019612</td>\n", " <td>Beaux-arts</td>\n", - " <td>0.002203</td>\n", + " <td>0.011769</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>lge_1_a-3</td>\n", " <td>1</td>\n", - " <td>4</td>\n", - " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-3</td>\n", - " <td>A(Mus.). La lettre a est employée par les musi...</td>\n", - " <td>267.0</td>\n", - " <td>Musique</td>\n", - " <td>0.905895</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>711</td>\n", + " <td>Philosophie</td>\n", + " <td>Philosophie</td>\n", + " <td>0.978732</td>\n", + " <td>Politique</td>\n", + " <td>0.004091</td>\n", " <td>Belles-lettres</td>\n", - " <td>0.029459</td>\n", - " <td>Histoire</td>\n", - " <td>0.014980</td>\n", + " <td>0.002425</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>lge_1_a-4</td>\n", " <td>1</td>\n", - " <td>5</td>\n", + " <td>6</td>\n", " <td>A</td>\n", - " <td>4</td>\n", - " <td>a-4</td>\n", - " <td>A(Numis.). Dans la numismatique grecque, la le...</td>\n", - " <td>67.0</td>\n", - " <td>Histoire</td>\n", - " <td>0.986111</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>238</td>\n", + " <td>unclassified</td>\n", + " <td>Philosophie</td>\n", + " <td>0.988337</td>\n", " <td>Belles-lettres</td>\n", - " <td>0.003949</td>\n", - " <td>Géographie</td>\n", - " <td>0.001527</td>\n", + " <td>0.003174</td>\n", + " <td>Beaux-arts</td>\n", + " <td>0.001221</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>lge_1_aa-0</td>\n", " <td>1</td>\n", - " <td>6</td>\n", - " <td>AA</td>\n", - " <td>4</td>\n", - " <td>aa-0</td>\n", - " <td>AA. Ces deux lettres désignent l’atelier monét...</td>\n", - " <td>14.0</td>\n", - " <td>Commerce</td>\n", - " <td>0.986866</td>\n", - " <td>Droit Jurisprudence</td>\n", - " <td>0.002140</td>\n", - " <td>Politique</td>\n", - " <td>0.001812</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>Dumarsais</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-2</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>1980</td>\n", + " <td>unclassified</td>\n", + " <td>Philosophie</td>\n", + " <td>0.988102</td>\n", + " <td>Belles-lettres</td>\n", + " <td>0.002661</td>\n", + " <td>Beaux-arts</td>\n", + " <td>0.001391</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>lge_1_aa-1</td>\n", " <td>1</td>\n", - " <td>7</td>\n", - " <td>AA</td>\n", - " <td>4</td>\n", - " <td>aa-1</td>\n", - " <td>AA. Nom de plusieurs cours d’eau de l’Europe o...</td>\n", - " <td>75.0</td>\n", - " <td>Géographie</td>\n", - " <td>0.954104</td>\n", + " <td>8</td>\n", + " <td>A</td>\n", + " <td>Mallet</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-3</td>\n", + " <td>NaN</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>200</td>\n", + " <td>unclassified</td>\n", " <td>Histoire</td>\n", - " <td>0.025117</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>0.008872</td>\n", + " <td>0.631214</td>\n", + " <td>Belles-lettres</td>\n", + " <td>0.320553</td>\n", + " <td>Physique</td>\n", + " <td>0.007173</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>lge_1_aa-2</td>\n", " <td>1</td>\n", - " <td>8</td>\n", - " <td>AA</td>\n", - " <td>5</td>\n", - " <td>aa-2</td>\n", - " <td>AA. Rivière de France, prend sa source aux Tro...</td>\n", - " <td>165.0</td>\n", - " <td>Géographie</td>\n", - " <td>0.998200</td>\n", + " <td>9</td>\n", + " <td>A, lettre symbolique</td>\n", + " <td>Mallet</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-4</td>\n", + " <td>NaN</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>82</td>\n", + " <td>unclassified</td>\n", " <td>Histoire</td>\n", - " <td>0.000280</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>0.000190</td>\n", + " <td>0.979700</td>\n", + " <td>Belles-lettres</td>\n", + " <td>0.012630</td>\n", + " <td>Religion</td>\n", + " <td>0.001750</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>lge_1_aa-3</td>\n", " <td>1</td>\n", - " <td>9</td>\n", - " <td>AA</td>\n", - " <td>5</td>\n", - " <td>aa-3</td>\n", - " <td>AA. Rivière de Hollande, affluent de la Dommel...</td>\n", - " <td>17.0</td>\n", - " <td>Géographie</td>\n", - " <td>0.995858</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>0.001078</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>Mallet</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-5</td>\n", + " <td>Médailles</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>112</td>\n", + " <td>unclassified</td>\n", " <td>Histoire</td>\n", - " <td>0.000548</td>\n", + " <td>0.947388</td>\n", + " <td>Commerce</td>\n", + " <td>0.027528</td>\n", + " <td>Belles-lettres</td>\n", + " <td>0.010894</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>lge_1_aa-4</td>\n", " <td>1</td>\n", - " <td>10</td>\n", - " <td>AA</td>\n", - " <td>5</td>\n", - " <td>aa-4</td>\n", - " <td>AA. Nom de deux fleuves de la Russie. Le premi...</td>\n", - " <td>71.0</td>\n", - " <td>Géographie</td>\n", - " <td>0.997916</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>Mallet</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-6</td>\n", " <td>Histoire</td>\n", - " <td>0.000561</td>\n", - " <td>Militaire</td>\n", - " <td>0.000186</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>80</td>\n", + " <td>unclassified</td>\n", + " <td>Histoire</td>\n", + " <td>0.738804</td>\n", + " <td>Belles-lettres</td>\n", + " <td>0.193938</td>\n", + " <td>Beaux-arts</td>\n", + " <td>0.019706</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " uid lge-volume lge-numero lge-head lge-page lge-id \\\n", - "0 lge_1_a-0 1 1 A 0 a-0 \n", - "1 lge_1_a-1 1 2 A 1 a-1 \n", - "2 lge_1_a-2 1 3 A 4 a-2 \n", - "3 lge_1_a-3 1 4 A 4 a-3 \n", - "4 lge_1_a-4 1 5 A 4 a-4 \n", - "5 lge_1_aa-0 1 6 AA 4 aa-0 \n", - "6 lge_1_aa-1 1 7 AA 4 aa-1 \n", - "7 lge_1_aa-2 1 8 AA 5 aa-2 \n", - "8 lge_1_aa-3 1 9 AA 5 aa-3 \n", - "9 lge_1_aa-4 1 10 AA 5 aa-4 \n", + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", + "5 1 7 A Dumarsais \n", + "6 1 8 A Mallet \n", + "7 1 9 A, lettre symbolique Mallet \n", + "8 1 10 A, numismatique ou monétaire Mallet \n", + "9 1 11 A, lapidaire Mallet \n", "\n", - " lge-content lge-nbWords \\\n", - "0 A(Ling.). Son vocal et première lettre de notr... 1761.0 \n", - "1 A(Paléogr.). C’est à l’alphabet phénicien, on ... 839.0 \n", - "2 A(Log.). Cette voyelle désigne les proposition... 56.0 \n", - "3 A(Mus.). La lettre a est employée par les musi... 267.0 \n", - "4 A(Numis.). Dans la numismatique grecque, la le... 67.0 \n", - "5 AA. Ces deux lettres désignent l’atelier monét... 14.0 \n", - "6 AA. Nom de plusieurs cours d’eau de l’Europe o... 75.0 \n", - "7 AA. Rivière de France, prend sa source aux Tro... 165.0 \n", - "8 AA. Rivière de Hollande, affluent de la Dommel... 17.0 \n", - "9 AA. Nom de deux fleuves de la Russie. Le premi... 71.0 \n", + " edda_class enccre_id enccre_class \\\n", + "0 unclassified NaN NaN \n", + "1 unclassified NaN NaN \n", + "2 unclassified NaN NaN \n", + "3 Grammaire v1-1-0 Grammaire \n", + "4 unclassified v1-1-1 Grammaire \n", + "5 unclassified v1-1-2 Grammaire \n", + "6 unclassified v1-1-3 NaN \n", + "7 unclassified v1-1-4 NaN \n", + "8 unclassified v1-1-5 Médailles \n", + "9 unclassified v1-1-6 Histoire \n", + "\n", + " content \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "5 \\nA, préposition vient du latin à , à dextris, ... \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " content_without_designant \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "5 \\nA, préposition vient du latin à , à dextris, ... \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " first_paragraph nb_words super_domain \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 unclassified \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 unclassified \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 unclassified \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... 238 unclassified \n", + "5 \\nA, préposition vient du latin à , à dextris, ... 1980 unclassified \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... 200 unclassified \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... 82 unclassified \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... 112 unclassified \n", + "9 \\nA, lapidaire, dans les anciennes inscription... 80 unclassified \n", "\n", " lge-superdomainPred1 lge-superdomainProba1 lge-superdomainPred2 \\\n", - "0 Philosophie 0.937586 Belles-lettres \n", - "1 Géographie 0.992606 Histoire \n", - "2 Philosophie 0.982367 Belles-lettres \n", - "3 Musique 0.905895 Belles-lettres \n", - "4 Histoire 0.986111 Belles-lettres \n", - "5 Commerce 0.986866 Droit Jurisprudence \n", - "6 Géographie 0.954104 Histoire \n", - "7 Géographie 0.998200 Histoire \n", - "8 Géographie 0.995858 Histoire naturelle \n", - "9 Géographie 0.997916 Histoire \n", + "0 Philosophie 0.986489 Belles-lettres \n", + "1 Philosophie 0.943809 Histoire \n", + "2 Belles-lettres 0.926219 Histoire \n", + "3 Philosophie 0.978732 Politique \n", + "4 Philosophie 0.988337 Belles-lettres \n", + "5 Philosophie 0.988102 Belles-lettres \n", + "6 Histoire 0.631214 Belles-lettres \n", + "7 Histoire 0.979700 Belles-lettres \n", + "8 Histoire 0.947388 Commerce \n", + "9 Histoire 0.738804 Belles-lettres \n", "\n", " lge-superdomainProba2 lge-superdomainPred3 lge-superdomainProba3 \n", - "0 0.021192 Histoire 0.012657 \n", - "1 0.002934 Histoire naturelle 0.001019 \n", - "2 0.004124 Beaux-arts 0.002203 \n", - "3 0.029459 Histoire 0.014980 \n", - "4 0.003949 Géographie 0.001527 \n", - "5 0.002140 Politique 0.001812 \n", - "6 0.025117 Histoire naturelle 0.008872 \n", - "7 0.000280 Histoire naturelle 0.000190 \n", - "8 0.001078 Histoire 0.000548 \n", - "9 0.000561 Militaire 0.000186 " + "0 0.002821 Politique 0.001780 \n", + "1 0.014932 Politique 0.014871 \n", + "2 0.019612 Beaux-arts 0.011769 \n", + "3 0.004091 Belles-lettres 0.002425 \n", + "4 0.003174 Beaux-arts 0.001221 \n", + "5 0.002661 Beaux-arts 0.001391 \n", + "6 0.320553 Physique 0.007173 \n", + "7 0.012630 Religion 0.001750 \n", + "8 0.027528 Belles-lettres 0.010894 \n", + "9 0.193938 Beaux-arts 0.019706 " ] }, - "execution_count": 57, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1007,22 +1133,40 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 18, "metadata": { "id": "J9rObbvVr0zc" }, "outputs": [], "source": [ - "#df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")\n", - "df.to_csv(drive_path + \"predictions/LGE_dataset_articles_superdomainBERT_230321.tsv\", sep=\"\\t\", index=False)" + "df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230327.tsv\", sep=\"\\t\", index=False)\n", + "#df.to_csv(drive_path + \"predictions/LGE_dataset_articles_superdomainBERT_230321.tsv\", sep=\"\\t\", index=False)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "metadata": { "id": "7TD1mbKj_fXH" }, + "outputs": [], + "source": [ + "df.loc[(df[corpus+'-superdomainProba1'] == 'Géographie')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, "outputs": [ { "data": { @@ -1045,14 +1189,18 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>uid</th>\n", - " <th>lge-volume</th>\n", - " <th>lge-numero</th>\n", - " <th>lge-head</th>\n", - " <th>lge-page</th>\n", - " <th>lge-id</th>\n", - " <th>lge-content</th>\n", - " <th>lge-nbWords</th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " <th>super_domain</th>\n", " <th>lge-superdomainPred1</th>\n", " <th>lge-superdomainProba1</th>\n", " <th>lge-superdomainPred2</th>\n", @@ -1062,43 +1210,61 @@ " </tr>\n", " </thead>\n", " <tbody>\n", + " <tr>\n", + " <th>73362</th>\n", + " <td>17</td>\n", + " <td>2381</td>\n", + " <td>WOLSTROPE</td>\n", + " <td>Jaucourt</td>\n", + " <td>Géographie moderne</td>\n", + " <td>v17-1454-0</td>\n", + " <td>Géographie</td>\n", + " <td>\\nWOLSTROPE, (Géog. mod.) bourg d'Angleterre,\\...</td>\n", + " <td>\\nWOLSTROPE, bourg d'Angleterre,\\ndans le com...</td>\n", + " <td>\\nWOLSTROPE, bourg d'Angleterre,\\ndans le com...</td>\n", + " <td>5530</td>\n", + " <td>None</td>\n", + " <td>Géographie</td>\n", + " <td>0.998638</td>\n", + " <td>Histoire</td>\n", + " <td>0.00016</td>\n", + " <td>Militaire</td>\n", + " <td>0.000113</td>\n", + " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [uid, lge-volume, lge-numero, lge-head, lge-page, lge-id, lge-content, lge-nbWords, lge-superdomainPred1, lge-superdomainProba1, lge-superdomainPred2, lge-superdomainProba2, lge-superdomainPred3, lge-superdomainProba3]\n", - "Index: []" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[(df[corpus+'-superdomainProba1'] == 'Géographie')]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(134820, 14)" + " volume numero head author edda_class enccre_id \\\n", + "73362 17 2381 WOLSTROPE Jaucourt Géographie moderne v17-1454-0 \n", + "\n", + " enccre_class content \\\n", + "73362 Géographie \\nWOLSTROPE, (Géog. mod.) bourg d'Angleterre,\\... \n", + "\n", + " content_without_designant \\\n", + "73362 \\nWOLSTROPE, bourg d'Angleterre,\\ndans le com... \n", + "\n", + " first_paragraph nb_words \\\n", + "73362 \\nWOLSTROPE, bourg d'Angleterre,\\ndans le com... 5530 \n", + "\n", + " super_domain lge-superdomainPred1 lge-superdomainProba1 \\\n", + "73362 None Géographie 0.998638 \n", + "\n", + " lge-superdomainPred2 lge-superdomainProba2 lge-superdomainPred3 \\\n", + "73362 Histoire 0.00016 Militaire \n", + "\n", + " lge-superdomainProba3 \n", + "73362 0.000113 " ] }, - "execution_count": 60, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.shape" + "df[(df['head'] == 'WOLSTROPE')]" ] }, {