From 3bba32055665ed0e66292d6d8028f0d2e873efea Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Mon, 13 Mar 2023 23:02:10 +0100 Subject: [PATCH] Update Predict.ipynb --- notebooks/Predict.ipynb | 2816 +++++++++++---------------------------- 1 file changed, 757 insertions(+), 2059 deletions(-) diff --git a/notebooks/Predict.ipynb b/notebooks/Predict.ipynb index 6ccb683..0da102c 100644 --- a/notebooks/Predict.ipynb +++ b/notebooks/Predict.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -25,56 +25,7 @@ "id": "pwmZ5bBvgGNh", "outputId": "1a080856-4e47-4e1d-81d1-d38bb58948a5" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting transformers==4.10.3\n", - " Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)\n", - "\u001b[2K \u001b[90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m46.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (2.25.1)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (1.21.6)\n", - "Collecting sacremoses\n", - " Downloading sacremoses-0.0.53.tar.gz (880 kB)\n", - "\u001b[2K \u001b[90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”\u001b[0m \u001b[32m880.6/880.6 KB\u001b[0m \u001b[31m45.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (23.0)\n", - "Collecting tokenizers<0.11,>=0.10.1\n", - " Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", - "\u001b[2K \u001b[90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”\u001b[0m \u001b[32m3.3/3.3 MB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (4.64.1)\n", - "Collecting huggingface-hub>=0.0.12\n", - " Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)\n", - "\u001b[2K \u001b[90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (3.9.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (2022.6.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers==4.10.3) (6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.0.12->transformers==4.10.3) (4.4.0)\n", - "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.10.3) (4.0.0)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.10.3) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.10.3) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.10.3) (2022.12.7)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from sacremoses->transformers==4.10.3) (1.15.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from sacremoses->transformers==4.10.3) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from sacremoses->transformers==4.10.3) (1.2.0)\n", - "Building wheels for collected packages: sacremoses\n", - " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=1a6d3101ab60a657a64074bebed597b1987c115de1092b993a013ae317d882f9\n", - " Stored in directory: /root/.cache/pip/wheels/82/ab/9b/c15899bf659ba74f623ac776e861cf2eb8608c1825ddec66a4\n", - "Successfully built sacremoses\n", - "Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers\n", - "Successfully installed huggingface-hub-0.12.0 sacremoses-0.0.53 tokenizers-0.10.3 transformers-4.10.3\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting sentencepiece\n", - " Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m33.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: sentencepiece\n", - "Successfully installed sentencepiece-0.1.97\n" - ] - } - ], + "outputs": [], "source": [ "!pip install transformers==4.10.3\n", "!pip install sentencepiece" @@ -91,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -99,17 +50,7 @@ "id": "WF0qFN_g3ekz", "outputId": "56e76858-932c-42fd-ace0-37bf11c7b4ce" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Your runtime has 27.3 gigabytes of available RAM\n", - "\n", - "You are using a high-RAM runtime!\n" - ] - } - ], + "outputs": [], "source": [ "from psutil import virtual_memory\n", "ram_gb = virtual_memory().total / 1e9\n", @@ -132,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -140,15 +81,7 @@ "id": "vL0S-s9Uofvn", "outputId": "dbe3e901-da63-48b5-d8c6-b8cbda503fef" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n" - ] - } - ], + "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" @@ -165,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -175,11 +108,10 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "There are 1 GPU(s) available.\n", - "We will use the GPU: Tesla T4\n" + "We will use the GPU\n" ] } ], @@ -213,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": { "id": "SkErnwgMMbRj" }, @@ -237,19 +169,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": { "id": "M2awiee1r0zV" }, "outputs": [], "source": [ - "drive_path = \"drive/MyDrive/Classification-EDdA/\"\n", - "path = \"./\"" + "#drive_path = \"drive/MyDrive/Classification-EDdA/\"\n", + "drive_path = \"../\"\n", + "path = \"/Users/lmoncla/git/gitlab.liris/GEODE/EDdA/output/\"" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -257,25 +190,7 @@ "id": "X1A_J8MGr0zV", "outputId": "ca5c966c-00a2-4d74-cd1c-576c18f98d3d" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2023-02-15 07:14:06-- https://geode.liris.cnrs.fr/EDdA-Classification/datasets/Parallel_datatset_articles_230215.tsv\n", - "Resolving geode.liris.cnrs.fr (geode.liris.cnrs.fr)... 134.214.142.28\n", - "Connecting to geode.liris.cnrs.fr (geode.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 42343065 (40M) [text/tab-separated-values]\n", - "Saving to: ‘Parallel_datatset_articles_230215.tsv’\n", - "\n", - "Parallel_datatset_a 100%[===================>] 40.38M 74.9MB/s in 0.5s \n", - "\n", - "2023-02-15 07:14:07 (74.9 MB/s) - ‘Parallel_datatset_articles_230215.tsv’ saved [42343065/42343065]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "#!wget https://geode.liris.cnrs.fr/files/datasets/EDdA/Classification/LGE_withContent.tsv\n", "#!wget https://geode.liris.cnrs.fr/EDdA-Classification/datasets/EDdA_dataset_articles_no_superdomain.tsv\n", @@ -284,20 +199,21 @@ }, { "cell_type": "code", - "source": [ - "#filepath = \"data/LGE_withContent.tsv\"\n", - "#filepath = \"EDdA_dataset_articles_no_superdomain.tsv\"\n", - "filepath = \"Parallel_datatset_articles_230215.tsv\"" - ], + "execution_count": 4, "metadata": { "id": "eea7F4vato1x" }, - "execution_count": 12, - "outputs": [] + "outputs": [], + "source": [ + "#filepath = \"data/LGE_withContent.tsv\"\n", + "#filepath = \"EDdA_dataset_articles_no_superdomain.tsv\"\n", + "#filepath = \"Parallel_datatset_articles_230215.tsv\"\n", + "filepath = \"EDdA_dataset_articles.tsv\"" + ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -308,49 +224,9 @@ }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " idLGE tomeLGE rankLGE \\\n", - "0 aam-0 1 63 \n", - "1 abaco-0 1 92 \n", - "2 abacot-0 1 96 \n", - "3 abaddon-0 1 104 \n", - "4 abandonnement-0 1 138 \n", - "\n", - " contentLGE volumeEDdA numeroEDdA \\\n", - "0 AAM. Mesure de capacité pour les liquides en u... 1 31 \n", - "1 ABACO, architecte italien du xvi siècle (V. La... 1 42 \n", - "2 ABACOT. Double couronne que portaient autrefoi... 1 44 \n", - "3 ABADDONou APOLYON le Destructeur. « Elles\\nava... 1 46 \n", - "4 ABANDONNEMENT. I. Droit civil. — Ce mot est un... 1 75 \n", - "\n", - " headEDdA authorEDdA normclassEDdA \\\n", - "0 AAM Diderot unclassified \n", - "1 ABACO d'Alembert unclassified \n", - "2 ABACOT Diderot unclassified \n", - "3 ABADDON Diderot unclassified \n", - "4 ABANDONNEMENT Toussaint Droit \n", - "\n", - " contentEDdA nbWordsEDdA \\\n", - "0 \\n* AAM, s. mesure des Liquides, en usage à Am... 18 \n", - "1 \\nABACO, s. m. Quelques anciens Auteurs se ser... 26 \n", - "2 \\n* ABACOT, s. m. nom de l'ancienne parure dè\\... 22 \n", - "3 \\n* ABADDON, s. m. vient d'abad, perdre. C'est... 25 \n", - "4 \\nABANDONNEMENT, s. m. en Droit, est le délais... 77 \n", - "\n", - " superdomainEDdA \n", - "0 Unclassified \n", - "1 Unclassified \n", - "2 Unclassified \n", - "3 Unclassified \n", - "4 Droit Jurisprudence " - ], "text/html": [ - "\n", - " <div id=\"df-be30bfa5-3524-40b4-abed-43faebfa6628\">\n", - " <div class=\"colab-df-container\">\n", - " <div>\n", + "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", @@ -368,179 +244,140 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>idLGE</th>\n", - " <th>tomeLGE</th>\n", - " <th>rankLGE</th>\n", - " <th>contentLGE</th>\n", - " <th>volumeEDdA</th>\n", - " <th>numeroEDdA</th>\n", - " <th>headEDdA</th>\n", - " <th>authorEDdA</th>\n", - " <th>normclassEDdA</th>\n", - " <th>contentEDdA</th>\n", - " <th>nbWordsEDdA</th>\n", - " <th>superdomainEDdA</th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " <th>super_domain</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>aam-0</td>\n", " <td>1</td>\n", - " <td>63</td>\n", - " <td>AAM. Mesure de capacité pour les liquides en u...</td>\n", " <td>1</td>\n", - " <td>31</td>\n", - " <td>AAM</td>\n", - " <td>Diderot</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", " <td>unclassified</td>\n", - " <td>\\n* AAM, s. mesure des Liquides, en usage à Am...</td>\n", - " <td>18</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>151</td>\n", " <td>Unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>abaco-0</td>\n", " <td>1</td>\n", - " <td>92</td>\n", - " <td>ABACO, architecte italien du xvi siècle (V. La...</td>\n", - " <td>1</td>\n", - " <td>42</td>\n", - " <td>ABACO</td>\n", - " <td>d'Alembert</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", " <td>unclassified</td>\n", - " <td>\\nABACO, s. m. Quelques anciens Auteurs se ser...</td>\n", - " <td>26</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>208</td>\n", " <td>Unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>abacot-0</td>\n", " <td>1</td>\n", - " <td>96</td>\n", - " <td>ABACOT. Double couronne que portaient autrefoi...</td>\n", - " <td>1</td>\n", - " <td>44</td>\n", - " <td>ABACOT</td>\n", - " <td>Diderot</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABACOT, s. m. nom de l'ancienne parure dè\\...</td>\n", - " <td>22</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", + " <td>44669</td>\n", " <td>Unclassified</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>abaddon-0</td>\n", " <td>1</td>\n", - " <td>104</td>\n", - " <td>ABADDONou APOLYON le Destructeur. « Elles\\nava...</td>\n", - " <td>1</td>\n", - " <td>46</td>\n", - " <td>ABADDON</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>\\n* ABADDON, s. m. vient d'abad, perdre. C'est...</td>\n", - " <td>25</td>\n", - " <td>Unclassified</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>711</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>abandonnement-0</td>\n", - " <td>1</td>\n", - " <td>138</td>\n", - " <td>ABANDONNEMENT. I. Droit civil. — Ce mot est un...</td>\n", " <td>1</td>\n", - " <td>75</td>\n", - " <td>ABANDONNEMENT</td>\n", - " <td>Toussaint</td>\n", - " <td>Droit</td>\n", - " <td>\\nABANDONNEMENT, s. m. en Droit, est le délais...</td>\n", - " <td>77</td>\n", - " <td>Droit Jurisprudence</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>238</td>\n", + " <td>Unclassified</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", - "</div>\n", - " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-be30bfa5-3524-40b4-abed-43faebfa6628')\"\n", - " title=\"Convert this dataframe to an interactive table.\"\n", - " style=\"display:none;\">\n", - " \n", - " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", - " width=\"24px\">\n", - " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", - " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", - " </svg>\n", - " </button>\n", - " \n", - " <style>\n", - " .colab-df-container {\n", - " display:flex;\n", - " flex-wrap:wrap;\n", - " gap: 12px;\n", - " }\n", - "\n", - " .colab-df-convert {\n", - " background-color: #E8F0FE;\n", - " border: none;\n", - " border-radius: 50%;\n", - " cursor: pointer;\n", - " display: none;\n", - " fill: #1967D2;\n", - " height: 32px;\n", - " padding: 0 0 0 0;\n", - " width: 32px;\n", - " }\n", - "\n", - " .colab-df-convert:hover {\n", - " background-color: #E2EBFA;\n", - " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", - " fill: #174EA6;\n", - " }\n", - "\n", - " [theme=dark] .colab-df-convert {\n", - " background-color: #3B4455;\n", - " fill: #D2E3FC;\n", - " }\n", + "</div>" + ], + "text/plain": [ + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", "\n", - " [theme=dark] .colab-df-convert:hover {\n", - " background-color: #434B5C;\n", - " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", - " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", - " fill: #FFFFFF;\n", - " }\n", - " </style>\n", + " edda_class enccre_id enccre_class \\\n", + "0 unclassified NaN NaN \n", + "1 unclassified NaN NaN \n", + "2 unclassified NaN NaN \n", + "3 Grammaire v1-1-0 Grammaire \n", + "4 unclassified v1-1-1 Grammaire \n", "\n", - " <script>\n", - " const buttonEl =\n", - " document.querySelector('#df-be30bfa5-3524-40b4-abed-43faebfa6628 button.colab-df-convert');\n", - " buttonEl.style.display =\n", - " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + " content \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", "\n", - " async function convertToInteractive(key) {\n", - " const element = document.querySelector('#df-be30bfa5-3524-40b4-abed-43faebfa6628');\n", - " const dataTable =\n", - " await google.colab.kernel.invokeFunction('convertToInteractive',\n", - " [key], {});\n", - " if (!dataTable) return;\n", + " content_without_designant \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", "\n", - " const docLinkHtml = 'Like what you see? Visit the ' +\n", - " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", - " + ' to learn more about interactive tables.';\n", - " element.innerHTML = '';\n", - " dataTable['output_type'] = 'display_data';\n", - " await google.colab.output.renderOutput(dataTable, element);\n", - " const docLink = document.createElement('div');\n", - " docLink.innerHTML = docLinkHtml;\n", - " element.appendChild(docLink);\n", - " }\n", - " </script>\n", - " </div>\n", - " </div>\n", - " " + " first_paragraph nb_words super_domain \n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 Unclassified \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 Unclassified \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 Unclassified \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... 238 Unclassified " ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 13 + "output_type": "execute_result" } ], "source": [ @@ -550,16 +387,16 @@ }, { "cell_type": "code", - "source": [ - "corpus = 'LGE'\n", - "#corpus = 'EDdA'\n", - "data = df['content'+corpus].values\n" - ], + "execution_count": 6, "metadata": { "id": "Ndw4UtgWt_MJ" }, - "execution_count": 28, - "outputs": [] + "outputs": [], + "source": [ + "#corpus = 'LGE'\n", + "corpus = ''\n", + "data = df['content'+corpus].values\n" + ] }, { "cell_type": "markdown", @@ -574,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": { "id": "0qDZ86qTr0zX" }, @@ -589,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": { "id": "KEljGX0br0zX" }, @@ -695,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -752,67 +589,11 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Loading Bert Tokenizer...\n" ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Downloading: 0%| | 0.00/996k [00:00<?, ?B/s]" - ], - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "11c285bed74e46a08fbb7bf88715aafa" - } - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Downloading: 0%| | 0.00/29.0 [00:00<?, ?B/s]" - ], - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "9be44ba364a344f2b6b2546ae9d61ba8" - } - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Downloading: 0%| | 0.00/1.96M [00:00<?, ?B/s]" - ], - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "aa6a7a9106554f85a91150bd65c271d0" - } - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Downloading: 0%| | 0.00/625 [00:00<?, ?B/s]" - ], - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "4c46904f8e944d2b834ba9d384b00a8c" - } - }, - "metadata": {} } ], "source": [ @@ -826,11 +607,19 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": { "id": "-O6NspVTr0zZ" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (75311 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ], "source": [ "data_loader = generate_dataloader(tokenizer, data)" ] @@ -849,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 15, "metadata": { "id": "CN8EZst-r0zZ" }, @@ -858,12 +647,13 @@ "#model = torch.load(model_path, map_location=torch.device('mps'))\n", "#model.load_state_dict(torch.load(model_path, map_location=torch.device('mps')))\n", "\n", - "model = BertForSequenceClassification.from_pretrained(model_path).to(\"cuda\")" + "#model = BertForSequenceClassification.from_pretrained(model_path).to(\"cuda\")\n", + "model = BertForSequenceClassification.from_pretrained(model_path).to(\"mps\")" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 16, "metadata": { "id": "_fzgS5USJeAF" }, @@ -874,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -882,1029 +672,28 @@ "id": "ISkijyclr0za", "outputId": "8120e858-9950-4380-f887-70ca47360c76" }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[4,\n", - " 1,\n", - " 7,\n", - " 16,\n", - " 5,\n", - " 7,\n", - " 7,\n", - " 8,\n", - " 6,\n", - " 6,\n", - " 0,\n", - " 9,\n", - " 7,\n", - " 5,\n", - " 6,\n", - " 3,\n", - " 11,\n", - " 7,\n", - " 11,\n", - " 9,\n", - " 12,\n", - " 5,\n", - " 5,\n", - " 13,\n", - " 9,\n", - " 16,\n", - " 6,\n", - " 5,\n", - " 9,\n", - " 1,\n", - " 7,\n", - " 11,\n", - " 4,\n", - " 5,\n", - " 6,\n", - " 8,\n", - " 14,\n", - " 1,\n", - " 8,\n", - " 13,\n", - " 14,\n", - " 16,\n", - " 16,\n", - " 13,\n", - " 8,\n", - " 8,\n", - " 8,\n", - " 8,\n", - " 6,\n", - " 8,\n", - " 13,\n", - " 10,\n", - " 13,\n", - " 5,\n", - " 5,\n", - " 13,\n", - " 13,\n", - " 2,\n", - " 1,\n", - " 14,\n", - " 4,\n", - " 13,\n", - " 7,\n", - " 0,\n", - " 1,\n", - " 11,\n", - " 12,\n", - " 9,\n", - " 10,\n", - " 7,\n", - " 12,\n", - " 3,\n", - " 9,\n", - " 5,\n", - " 5,\n", - " 13,\n", - " 11,\n", - " 8,\n", - " 7,\n", - " 6,\n", - " 4,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 11,\n", - " 7,\n", - " 14,\n", - " 6,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 4,\n", - " 16,\n", - " 2,\n", - " 13,\n", - " 7,\n", - " 14,\n", - " 2,\n", - " 10,\n", - " 7,\n", - " 8,\n", - " 14,\n", - " 5,\n", - " 1,\n", - " 6,\n", - " 16,\n", - " 14,\n", - " 13,\n", - " 6,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 11,\n", - " 5,\n", - " 0,\n", - " 6,\n", - " 5,\n", - " 13,\n", - " 9,\n", - " 4,\n", - " 8,\n", - " 7,\n", - " 6,\n", - " 5,\n", - " 13,\n", - " 6,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 7,\n", - " 11,\n", - " 2,\n", - " 7,\n", - " 8,\n", - " 7,\n", - " 13,\n", - " 5,\n", - " 4,\n", - " 8,\n", - " 6,\n", - " 6,\n", - " 5,\n", - " 12,\n", - " 8,\n", - " 7,\n", - " 13,\n", - " 6,\n", - " 7,\n", - " 9,\n", - " 10,\n", - " 13,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 9,\n", - " 9,\n", - " 8,\n", - " 8,\n", - " 6,\n", - " 8,\n", - " 13,\n", - " 14,\n", - " 11,\n", - " 13,\n", - " 6,\n", - " 1,\n", - " 11,\n", - " 1,\n", - " 4,\n", - " 8,\n", - " 6,\n", - " 1,\n", - " 9,\n", - " 2,\n", - " 8,\n", - " 6,\n", - " 5,\n", - " 4,\n", - " 8,\n", - " 7,\n", - " 4,\n", - " 7,\n", - " 14,\n", - " 14,\n", - " 8,\n", - " 7,\n", - " 7,\n", - " 16,\n", - " 6,\n", - " 13,\n", - " 9,\n", - " 9,\n", - " 9,\n", - " 16,\n", - " 6,\n", - " 6,\n", - " 14,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 14,\n", - " 7,\n", - " 8,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 14,\n", - " 14,\n", - " 6,\n", - " 0,\n", - " 4,\n", - " 10,\n", - " 6,\n", - " 10,\n", - " 14,\n", - " 8,\n", - " 6,\n", - " 2,\n", - " 3,\n", - " 8,\n", - " 3,\n", - " 2,\n", - " 8,\n", - " 8,\n", - " 13,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 4,\n", - " 8,\n", - " 8,\n", - " 6,\n", - " 13,\n", - " 11,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 13,\n", - " 5,\n", - " 9,\n", - " 12,\n", - " 11,\n", - " 7,\n", - " 2,\n", - " 11,\n", - " 8,\n", - " 3,\n", - " 3,\n", - " 9,\n", - " 2,\n", - " 8,\n", - " 7,\n", - " 5,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 13,\n", - " 9,\n", - " 6,\n", - " 14,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 5,\n", - " 6,\n", - " 7,\n", - " 3,\n", - " 8,\n", - " 7,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 5,\n", - " 5,\n", - " 8,\n", - " 14,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 11,\n", - " 8,\n", - " 7,\n", - " 6,\n", - " 1,\n", - " 7,\n", - " 5,\n", - " 2,\n", - " 11,\n", - " 6,\n", - " 11,\n", - " 16,\n", - " 2,\n", - " 7,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 7,\n", - " 13,\n", - " 6,\n", - " 11,\n", - " 13,\n", - " 13,\n", - " 2,\n", - " 13,\n", - " 11,\n", - " 11,\n", - " 6,\n", - " 11,\n", - " 6,\n", - " 8,\n", - " 8,\n", - " 1,\n", - " 6,\n", - " 9,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 0,\n", - " 6,\n", - " 11,\n", - " 6,\n", - " 7,\n", - " 11,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 12,\n", - " 9,\n", - " 11,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 14,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 5,\n", - " 7,\n", - " 6,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 14,\n", - " 5,\n", - " 8,\n", - " 14,\n", - " 8,\n", - " 11,\n", - " 9,\n", - " 9,\n", - " 11,\n", - " 2,\n", - " 14,\n", - " 7,\n", - " 9,\n", - " 7,\n", - " 8,\n", - " 16,\n", - " 11,\n", - " 13,\n", - " 14,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 14,\n", - " 10,\n", - " 7,\n", - " 7,\n", - " 8,\n", - " 6,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 13,\n", - " 13,\n", - " 10,\n", - " 6,\n", - " 11,\n", - " 16,\n", - " 6,\n", - " 6,\n", - " 12,\n", - " 2,\n", - " 6,\n", - " 11,\n", - " 13,\n", - " 6,\n", - " 11,\n", - " 2,\n", - " 6,\n", - " 5,\n", - " 13,\n", - " 7,\n", - " 6,\n", - " 11,\n", - " 11,\n", - " 7,\n", - " 6,\n", - " 14,\n", - " 8,\n", - " 8,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 2,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 16,\n", - " 2,\n", - " 2,\n", - " 11,\n", - " 11,\n", - " 10,\n", - " 11,\n", - " 16,\n", - " 3,\n", - " 16,\n", - " 11,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 3,\n", - " 6,\n", - " 8,\n", - " 1,\n", - " 11,\n", - " 6,\n", - " 13,\n", - " 14,\n", - " 5,\n", - " 5,\n", - " 12,\n", - " 9,\n", - " 14,\n", - " 5,\n", - " 13,\n", - " 6,\n", - " 8,\n", - " 11,\n", - " 14,\n", - " 8,\n", - " 9,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 3,\n", - " 1,\n", - " 1,\n", - " 6,\n", - " 14,\n", - " 6,\n", - " 5,\n", - " 13,\n", - " 6,\n", - " 8,\n", - " 12,\n", - " 1,\n", - " 6,\n", - " 7,\n", - " 3,\n", - " 7,\n", - " 16,\n", - " 14,\n", - " 3,\n", - " 7,\n", - " 10,\n", - " 5,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 9,\n", - " 7,\n", - " 3,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 5,\n", - " 10,\n", - " 5,\n", - " 7,\n", - " 12,\n", - " 12,\n", - " 6,\n", - " 14,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 6,\n", - " 8,\n", - " 7,\n", - " 14,\n", - " 8,\n", - " 7,\n", - " 2,\n", - " 12,\n", - " 7,\n", - " 16,\n", - " 6,\n", - " 10,\n", - " 8,\n", - " 7,\n", - " 14,\n", - " 6,\n", - " 9,\n", - " 1,\n", - " 9,\n", - " 9,\n", - " 16,\n", - " 13,\n", - " 5,\n", - " 7,\n", - " 6,\n", - " 9,\n", - " 7,\n", - " 6,\n", - " 11,\n", - " 8,\n", - " 9,\n", - " 9,\n", - " 5,\n", - " 2,\n", - " 5,\n", - " 5,\n", - " 9,\n", - " 3,\n", - " 0,\n", - " 5,\n", - " 8,\n", - " 7,\n", - " 2,\n", - " 2,\n", - " 7,\n", - " 11,\n", - " 11,\n", - " 13,\n", - " 13,\n", - " 14,\n", - " 3,\n", - " 13,\n", - " 1,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 14,\n", - " 7,\n", - " 11,\n", - " 8,\n", - " 16,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 8,\n", - " 13,\n", - " 7,\n", - " 8,\n", - " 4,\n", - " 11,\n", - " 6,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 4,\n", - " 5,\n", - " 6,\n", - " 5,\n", - " 8,\n", - " 2,\n", - " 13,\n", - " 6,\n", - " 13,\n", - " 12,\n", - " 16,\n", - " 8,\n", - " 14,\n", - " 7,\n", - " 3,\n", - " 13,\n", - " 11,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 7,\n", - " 11,\n", - " 14,\n", - " 7,\n", - " 11,\n", - " 1,\n", - " 9,\n", - " 0,\n", - " 11,\n", - " 5,\n", - " 1,\n", - " 0,\n", - " 5,\n", - " 12,\n", - " 1,\n", - " 14,\n", - " 12,\n", - " 8,\n", - " 13,\n", - " 13,\n", - " 4,\n", - " 12,\n", - " 3,\n", - " 1,\n", - " 6,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 13,\n", - " 5,\n", - " 12,\n", - " 7,\n", - " 8,\n", - " 6,\n", - " 2,\n", - " 5,\n", - " 6,\n", - " 9,\n", - " 13,\n", - " 7,\n", - " 16,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 4,\n", - " 11,\n", - " 6,\n", - " 12,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 2,\n", - " 14,\n", - " 7,\n", - " 7,\n", - " 14,\n", - " 13,\n", - " 11,\n", - " 5,\n", - " 6,\n", - " 7,\n", - " 13,\n", - " 7,\n", - " 7,\n", - " 8,\n", - " 13,\n", - " 8,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 11,\n", - " 7,\n", - " 6,\n", - " 0,\n", - " 9,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 9,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 4,\n", - " 1,\n", - " 7,\n", - " 6,\n", - " 1,\n", - " 8,\n", - " 9,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 8,\n", - " 7,\n", - " 0,\n", - " 10,\n", - " 9,\n", - " 9,\n", - " 3,\n", - " 6,\n", - " 9,\n", - " 9,\n", - " 1,\n", - " 9,\n", - " 0,\n", - " 2,\n", - " 2,\n", - " 6,\n", - " 3,\n", - " 8,\n", - " 7,\n", - " 7,\n", - " 3,\n", - " 1,\n", - " 1,\n", - " 6,\n", - " 5,\n", - " 6,\n", - " 1,\n", - " 6,\n", - " 11,\n", - " 9,\n", - " 8,\n", - " 7,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 5,\n", - " 1,\n", - " 13,\n", - " 6,\n", - " 5,\n", - " 6,\n", - " 7,\n", - " 2,\n", - " 6,\n", - " 6,\n", - " 13,\n", - " 1,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 13,\n", - " 8,\n", - " 8,\n", - " 1,\n", - " 6,\n", - " 2,\n", - " 3,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 14,\n", - " 6,\n", - " 4,\n", - " 8,\n", - " 11,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 3,\n", - " 6,\n", - " 14,\n", - " 6,\n", - " 6,\n", - " 10,\n", - " 1,\n", - " 14,\n", - " 4,\n", - " 11,\n", - " 12,\n", - " 1,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 9,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 5,\n", - " 12,\n", - " 7,\n", - " 1,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 10,\n", - " 16,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 0,\n", - " 12,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 13,\n", - " 6,\n", - " 6,\n", - " 9,\n", - " 3,\n", - " 7,\n", - " 3,\n", - " 13,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 8,\n", - " 8,\n", - " 7,\n", - " 7,\n", - " 10,\n", - " 6,\n", - " 16,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 13,\n", - " 6,\n", - " 2,\n", - " 6,\n", - " 5,\n", - " 3,\n", - " 12,\n", - " 6,\n", - " 8,\n", - " 4,\n", - " 6,\n", - " 10,\n", - " 11,\n", - " 11,\n", - " 8,\n", - " 5,\n", - " 1,\n", - " 1,\n", - " 13,\n", - " 5,\n", - " 14,\n", - " 6,\n", - " 12,\n", - " 6,\n", - " 11,\n", - " 12,\n", - " 6,\n", - " 0,\n", - " 0,\n", - " 9,\n", - " 11,\n", - " 1,\n", - " 6,\n", - " 6,\n", - " 3,\n", - " 3,\n", - " 8,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 12,\n", - " 8,\n", - " 9,\n", - " 6,\n", - " 8,\n", - " 7,\n", - " 8,\n", - " 8,\n", - " 1,\n", - " 9,\n", - " 12,\n", - " 8,\n", - " 6,\n", - " 14,\n", - " 12,\n", - " 0,\n", - " 4,\n", - " 6,\n", - " 6,\n", - " 5,\n", - " 7,\n", - " 3,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 9,\n", - " 9,\n", - " 9,\n", - " 12,\n", - " 3,\n", - " 6,\n", - " 8,\n", - " 8,\n", - " 8,\n", - " 12,\n", - " 12,\n", - " 9,\n", - " 7,\n", - " 7,\n", - " 8,\n", - " 6,\n", - " 14,\n", - " 3,\n", - " 3,\n", - " 5,\n", - " 9,\n", - " 3,\n", - " 1,\n", - " 7,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 7,\n", - " 7,\n", - " 6,\n", - " 1,\n", - " 6,\n", - " 6,\n", - " 6,\n", - " 1,\n", - " 9,\n", - " 14,\n", - " 9,\n", - " 0,\n", - " 9,\n", - " 1,\n", - " 3,\n", - " 6,\n", - " 7,\n", - " 5,\n", - " 6,\n", - " 8,\n", - " 7,\n", - " 7,\n", - " 4,\n", - " 0,\n", - " 6,\n", - " 6,\n", - " 9,\n", - " 9,\n", - " 9,\n", - " 0,\n", - " 11,\n", - " 8,\n", - " 7,\n", - " 4,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 6,\n", - " 9,\n", - " 10,\n", - " 12,\n", - " 4,\n", - " 14,\n", - " 6,\n", - " 10,\n", - " 5,\n", - " 7,\n", - " 1,\n", - " 1,\n", - " 6,\n", - " 0,\n", - " 6,\n", - " 3,\n", - " 6,\n", - " ...]" - ] - }, - "metadata": {}, - "execution_count": 32 - } - ], + "outputs": [], "source": [ "pred" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 17, "metadata": { "id": "fo6k4li1r0za" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/geode-classification-py39/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator LabelEncoder from version 1.0.2 when using version 1.1.3. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", + "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", + " warnings.warn(\n" + ] + } + ], "source": [ "import pickle \n", "#encoder_filename = \"models/label_encoder.pkl\"\n", @@ -1915,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 18, "metadata": { "id": "UU7qg7zVr0zb" }, @@ -1926,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 19, "metadata": { "id": "w4eHpBztr0zb" }, @@ -1937,9 +726,7 @@ }, { "cell_type": "code", - "source": [ - "df[df.numero == 2835]['content'+corpus].values" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1947,25 +734,14 @@ "id": "KsJQMhCBxpSF", "outputId": "2ffa7475-e6de-4c42-a413-22c0d4b2d45f" }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([\"\\nQueue, terme de Chancellerie, ce mot se dit de la\\nmaniere de sceller les lettres. Une lettre est scellée à \\nsimple queue, quand le sceau est attaché à un coin du\\nparchemin de la lettre qu'on a fendu exprès ; & elle\\nest scellée à double queue, quand le sceau est pendant\\nà une bande en double de parchemin passée au-travers de la lettre, comme on fait dans les expéditions\\nimportantes.\\n\",\n", - " \"\\nPiquer, v. act. (Charp. & Maçon.) piquer en Charpenterie, c'est marquer un piece de bois, pour la\\ntailler & la façonner. Piquer en Maçonnerie, c'est\\nrustiquer le parement ou les lits d'une pierre, c'est-à -dire que piquer signifie en fait de moilon le tailler\\ngrossierement ; on emploie le moilon piqué de la sorte\\naux voûtes de caves, aux puits & aux murs de clôture.\\nPiquer signifie aussi faire sur les matériaux destinés à \\nla construction extérieure les bâtimens, les petits\\npoints ou creux nécessaires pour leur servir d'ornement ; \\non pique de cette maniere la pierre de taille,\\n\\nle grès & le moilon particulierement pour l'ordre\\ntoscan. (D. J.)\\n\"],\n", - " dtype=object)" - ] - }, - "metadata": {}, - "execution_count": 34 - } + "outputs": [], + "source": [ + "df[df.numero == 2835]['content'+corpus].values" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1976,74 +752,9 @@ }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " idLGE tomeLGE rankLGE \\\n", - "0 aam-0 1 63 \n", - "1 abaco-0 1 92 \n", - "2 abacot-0 1 96 \n", - "3 abaddon-0 1 104 \n", - "4 abandonnement-0 1 138 \n", - "5 abantes-0 1 143 \n", - "6 abaque-0 1 146 \n", - "7 abaremo-temo-0 1 152 \n", - "8 abares-0 1 153 \n", - "9 abarim-0 1 154 \n", - "\n", - " contentLGE volumeEDdA numeroEDdA \\\n", - "0 AAM. Mesure de capacité pour les liquides en u... 1 31 \n", - "1 ABACO, architecte italien du xvi siècle (V. La... 1 42 \n", - "2 ABACOT. Double couronne que portaient autrefoi... 1 44 \n", - "3 ABADDONou APOLYON le Destructeur. « Elles\\nava... 1 46 \n", - "4 ABANDONNEMENT. I. Droit civil. — Ce mot est un... 1 75 \n", - "5 ABANTES. Peuplade d’origine douteuse que l’on ... 1 81 \n", - "6 ABAQUE. I. Antiquité.— Dans l’antiquité on don... 1 84 \n", - "7 ABAREMO-TEMO(Bot.). Nom sous lequel Pison\\n(Br... 1 90 \n", - "8 ABARES. Nom de deux peuples distincts, habitan... 1 91 \n", - "9 ABARIM. Chaîne de montagnes de la Palestine au... 1 92 \n", - "\n", - " headEDdA authorEDdA normclassEDdA \\\n", - "0 AAM Diderot unclassified \n", - "1 ABACO d'Alembert unclassified \n", - "2 ABACOT Diderot unclassified \n", - "3 ABADDON Diderot unclassified \n", - "4 ABANDONNEMENT Toussaint Droit \n", - "5 ABANTES Diderot unclassified \n", - "6 ABAQUE d'Alembert2 unclassified \n", - "7 ABAREMO-TEMO Diderot unclassified \n", - "8 ABARES Diderot unclassified \n", - "9 ABARIM Diderot unclassified \n", - "\n", - " contentEDdA nbWordsEDdA \\\n", - "0 \\n* AAM, s. mesure des Liquides, en usage à Am... 18 \n", - "1 \\nABACO, s. m. Quelques anciens Auteurs se ser... 26 \n", - "2 \\n* ABACOT, s. m. nom de l'ancienne parure dè\\... 22 \n", - "3 \\n* ABADDON, s. m. vient d'abad, perdre. C'est... 25 \n", - "4 \\nABANDONNEMENT, s. m. en Droit, est le délais... 77 \n", - "5 \\n* ABANTES, s. m. pl. Peuples de Thrace qui p... 26 \n", - "6 \\nABAQUE, s. m. chez les anciens Mathématicien... 52 \n", - "7 \\n* ABAREMO-TEMO, s. m. arbre qui croît, dit-o... 55 \n", - "8 \\n* ABARES, restes de la Nation des Huns qui s... 24 \n", - "9 \\n* ABARIM, montagne de l'Arabie d'où Moyse vi... 23 \n", - "\n", - " superdomainEDdA superdomainBertEDdA superdomainBertLGE \n", - "0 Unclassified Commerce Commerce \n", - "1 Unclassified Physique Beaux-arts \n", - "2 Unclassified Histoire Histoire \n", - "3 Unclassified Histoire Religion \n", - "4 Droit Jurisprudence Droit Jurisprudence Droit Jurisprudence \n", - "5 Unclassified Histoire Histoire \n", - "6 Unclassified Physique Histoire \n", - "7 Unclassified Histoire naturelle Histoire naturelle \n", - "8 Unclassified Histoire Géographie \n", - "9 Unclassified Géographie Géographie " - ], "text/html": [ - "\n", - " <div id=\"df-825c5672-f5f9-49ed-95eb-fdcae67ba1f1\">\n", - " <div class=\"colab-df-container\">\n", - " <div>\n", + "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", @@ -2061,276 +772,263 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>idLGE</th>\n", - " <th>tomeLGE</th>\n", - " <th>rankLGE</th>\n", - " <th>contentLGE</th>\n", - " <th>volumeEDdA</th>\n", - " <th>numeroEDdA</th>\n", - " <th>headEDdA</th>\n", - " <th>authorEDdA</th>\n", - " <th>normclassEDdA</th>\n", - " <th>contentEDdA</th>\n", - " <th>nbWordsEDdA</th>\n", - " <th>superdomainEDdA</th>\n", - " <th>superdomainBertEDdA</th>\n", - " <th>superdomainBertLGE</th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>edda_class</th>\n", + " <th>enccre_id</th>\n", + " <th>enccre_class</th>\n", + " <th>content</th>\n", + " <th>content_without_designant</th>\n", + " <th>first_paragraph</th>\n", + " <th>nb_words</th>\n", + " <th>super_domain</th>\n", + " <th>superdomainBert</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>aam-0</td>\n", " <td>1</td>\n", - " <td>63</td>\n", - " <td>AAM. Mesure de capacité pour les liquides en u...</td>\n", " <td>1</td>\n", - " <td>31</td>\n", - " <td>AAM</td>\n", - " <td>Diderot</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", " <td>unclassified</td>\n", - " <td>\\n* AAM, s. mesure des Liquides, en usage à Am...</td>\n", - " <td>18</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>\\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ...</td>\n", + " <td>151</td>\n", " <td>Unclassified</td>\n", - " <td>Commerce</td>\n", - " <td>Commerce</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>abaco-0</td>\n", - " <td>1</td>\n", - " <td>92</td>\n", - " <td>ABACO, architecte italien du xvi siècle (V. La...</td>\n", " <td>1</td>\n", - " <td>42</td>\n", - " <td>ABACO</td>\n", - " <td>d'Alembert</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", " <td>unclassified</td>\n", - " <td>\\nABACO, s. m. Quelques anciens Auteurs se ser...</td>\n", - " <td>26</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>\\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS...</td>\n", + " <td>208</td>\n", " <td>Unclassified</td>\n", - " <td>Physique</td>\n", - " <td>Beaux-arts</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>abacot-0</td>\n", - " <td>1</td>\n", - " <td>96</td>\n", - " <td>ABACOT. Double couronne que portaient autrefoi...</td>\n", " <td>1</td>\n", - " <td>44</td>\n", - " <td>ABACOT</td>\n", - " <td>Diderot</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABACOT, s. m. nom de l'ancienne parure dè\\...</td>\n", - " <td>22</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n...</td>\n", + " <td>\\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n</td>\n", + " <td>44669</td>\n", " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", - " <td>Histoire</td>\n", + " <td>Belles-lettres</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>abaddon-0</td>\n", - " <td>1</td>\n", - " <td>104</td>\n", - " <td>ABADDONou APOLYON le Destructeur. « Elles\\nava...</td>\n", " <td>1</td>\n", - " <td>46</td>\n", - " <td>ABADDON</td>\n", - " <td>Diderot</td>\n", - " <td>unclassified</td>\n", - " <td>\\n* ABADDON, s. m. vient d'abad, perdre. C'est...</td>\n", - " <td>25</td>\n", - " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", - " <td>Religion</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>\\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc...</td>\n", + " <td>711</td>\n", + " <td>Philosophie</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>abandonnement-0</td>\n", - " <td>1</td>\n", - " <td>138</td>\n", - " <td>ABANDONNEMENT. I. Droit civil. — Ce mot est un...</td>\n", " <td>1</td>\n", - " <td>75</td>\n", - " <td>ABANDONNEMENT</td>\n", - " <td>Toussaint</td>\n", - " <td>Droit</td>\n", - " <td>\\nABANDONNEMENT, s. m. en Droit, est le délais...</td>\n", - " <td>77</td>\n", - " <td>Droit Jurisprudence</td>\n", - " <td>Droit Jurisprudence</td>\n", - " <td>Droit Jurisprudence</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>\\nA, mot, est 1. la troisieme personne du prés...</td>\n", + " <td>238</td>\n", + " <td>Unclassified</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>abantes-0</td>\n", " <td>1</td>\n", - " <td>143</td>\n", - " <td>ABANTES. Peuplade d’origine douteuse que l’on ...</td>\n", - " <td>1</td>\n", - " <td>81</td>\n", - " <td>ABANTES</td>\n", - " <td>Diderot</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>Dumarsais</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABANTES, s. m. pl. Peuples de Thrace qui p...</td>\n", - " <td>26</td>\n", + " <td>v1-1-2</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>\\nA, préposition vient du latin à , à dextris, ...</td>\n", + " <td>1980</td>\n", " <td>Unclassified</td>\n", - " <td>Histoire</td>\n", - " <td>Histoire</td>\n", + " <td>Philosophie</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>abaque-0</td>\n", - " <td>1</td>\n", - " <td>146</td>\n", - " <td>ABAQUE. I. Antiquité.— Dans l’antiquité on don...</td>\n", " <td>1</td>\n", - " <td>84</td>\n", - " <td>ABAQUE</td>\n", - " <td>d'Alembert2</td>\n", + " <td>8</td>\n", + " <td>A</td>\n", + " <td>Mallet</td>\n", " <td>unclassified</td>\n", - " <td>\\nABAQUE, s. m. chez les anciens Mathématicien...</td>\n", - " <td>52</td>\n", + " <td>v1-1-3</td>\n", + " <td>NaN</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>\\nA, étoit une lettre numérale parmi les Ancie...</td>\n", + " <td>200</td>\n", " <td>Unclassified</td>\n", - " <td>Physique</td>\n", " <td>Histoire</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>abaremo-temo-0</td>\n", " <td>1</td>\n", - " <td>152</td>\n", - " <td>ABAREMO-TEMO(Bot.). Nom sous lequel Pison\\n(Br...</td>\n", - " <td>1</td>\n", - " <td>90</td>\n", - " <td>ABAREMO-TEMO</td>\n", - " <td>Diderot</td>\n", + " <td>9</td>\n", + " <td>A, lettre symbolique</td>\n", + " <td>Mallet</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABAREMO-TEMO, s. m. arbre qui croît, dit-o...</td>\n", - " <td>55</td>\n", + " <td>v1-1-4</td>\n", + " <td>NaN</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>\\nA, lettre symbolique, étoit un hiéroglyphe c...</td>\n", + " <td>82</td>\n", " <td>Unclassified</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>Histoire naturelle</td>\n", + " <td>Histoire</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>abares-0</td>\n", " <td>1</td>\n", - " <td>153</td>\n", - " <td>ABARES. Nom de deux peuples distincts, habitan...</td>\n", - " <td>1</td>\n", - " <td>91</td>\n", - " <td>ABARES</td>\n", - " <td>Diderot</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>Mallet</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABARES, restes de la Nation des Huns qui s...</td>\n", - " <td>24</td>\n", + " <td>v1-1-5</td>\n", + " <td>Médailles</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>112</td>\n", " <td>Unclassified</td>\n", " <td>Histoire</td>\n", - " <td>Géographie</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>abarim-0</td>\n", - " <td>1</td>\n", - " <td>154</td>\n", - " <td>ABARIM. Chaîne de montagnes de la Palestine au...</td>\n", " <td>1</td>\n", - " <td>92</td>\n", - " <td>ABARIM</td>\n", - " <td>Diderot</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>Mallet</td>\n", " <td>unclassified</td>\n", - " <td>\\n* ABARIM, montagne de l'Arabie d'où Moyse vi...</td>\n", - " <td>23</td>\n", + " <td>v1-1-6</td>\n", + " <td>Histoire</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>80</td>\n", " <td>Unclassified</td>\n", - " <td>Géographie</td>\n", - " <td>Géographie</td>\n", + " <td>Histoire</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", - "</div>\n", - " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-825c5672-f5f9-49ed-95eb-fdcae67ba1f1')\"\n", - " title=\"Convert this dataframe to an interactive table.\"\n", - " style=\"display:none;\">\n", - " \n", - " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", - " width=\"24px\">\n", - " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", - " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", - " </svg>\n", - " </button>\n", - " \n", - " <style>\n", - " .colab-df-container {\n", - " display:flex;\n", - " flex-wrap:wrap;\n", - " gap: 12px;\n", - " }\n", - "\n", - " .colab-df-convert {\n", - " background-color: #E8F0FE;\n", - " border: none;\n", - " border-radius: 50%;\n", - " cursor: pointer;\n", - " display: none;\n", - " fill: #1967D2;\n", - " height: 32px;\n", - " padding: 0 0 0 0;\n", - " width: 32px;\n", - " }\n", - "\n", - " .colab-df-convert:hover {\n", - " background-color: #E2EBFA;\n", - " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", - " fill: #174EA6;\n", - " }\n", + "</div>" + ], + "text/plain": [ + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", + "5 1 7 A Dumarsais \n", + "6 1 8 A Mallet \n", + "7 1 9 A, lettre symbolique Mallet \n", + "8 1 10 A, numismatique ou monétaire Mallet \n", + "9 1 11 A, lapidaire Mallet \n", "\n", - " [theme=dark] .colab-df-convert {\n", - " background-color: #3B4455;\n", - " fill: #D2E3FC;\n", - " }\n", + " edda_class enccre_id enccre_class \\\n", + "0 unclassified NaN NaN \n", + "1 unclassified NaN NaN \n", + "2 unclassified NaN NaN \n", + "3 Grammaire v1-1-0 Grammaire \n", + "4 unclassified v1-1-1 Grammaire \n", + "5 unclassified v1-1-2 Grammaire \n", + "6 unclassified v1-1-3 NaN \n", + "7 unclassified v1-1-4 NaN \n", + "8 unclassified v1-1-5 Médailles \n", + "9 unclassified v1-1-6 Histoire \n", "\n", - " [theme=dark] .colab-df-convert:hover {\n", - " background-color: #434B5C;\n", - " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", - " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", - " fill: #FFFFFF;\n", - " }\n", - " </style>\n", + " content \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "5 \\nA, préposition vient du latin à , à dextris, ... \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", "\n", - " <script>\n", - " const buttonEl =\n", - " document.querySelector('#df-825c5672-f5f9-49ed-95eb-fdcae67ba1f1 button.colab-df-convert');\n", - " buttonEl.style.display =\n", - " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + " content_without_designant \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n\\n... \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... \n", + "5 \\nA, préposition vient du latin à , à dextris, ... \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", "\n", - " async function convertToInteractive(key) {\n", - " const element = document.querySelector('#df-825c5672-f5f9-49ed-95eb-fdcae67ba1f1');\n", - " const dataTable =\n", - " await google.colab.kernel.invokeFunction('convertToInteractive',\n", - " [key], {});\n", - " if (!dataTable) return;\n", + " first_paragraph nb_words super_domain \\\n", + "0 \\n\\nENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES ... 151 Unclassified \n", + "1 \\n\\nA MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINIS... 208 Unclassified \n", + "2 \\n\\nDISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\n\\n 44669 Unclassified \n", + "3 \\nA, a & a s.m. (ordre Encyclopéd.\\nEntend. Sc... 711 Philosophie \n", + "4 \\nA, mot, est 1. la troisieme personne du prés... 238 Unclassified \n", + "5 \\nA, préposition vient du latin à , à dextris, ... 1980 Unclassified \n", + "6 \\nA, étoit une lettre numérale parmi les Ancie... 200 Unclassified \n", + "7 \\nA, lettre symbolique, étoit un hiéroglyphe c... 82 Unclassified \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... 112 Unclassified \n", + "9 \\nA, lapidaire, dans les anciennes inscription... 80 Unclassified \n", "\n", - " const docLinkHtml = 'Like what you see? Visit the ' +\n", - " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", - " + ' to learn more about interactive tables.';\n", - " element.innerHTML = '';\n", - " dataTable['output_type'] = 'display_data';\n", - " await google.colab.output.renderOutput(dataTable, element);\n", - " const docLink = document.createElement('div');\n", - " docLink.innerHTML = docLinkHtml;\n", - " element.appendChild(docLink);\n", - " }\n", - " </script>\n", - " </div>\n", - " </div>\n", - " " + " superdomainBert \n", + "0 Philosophie \n", + "1 Philosophie \n", + "2 Belles-lettres \n", + "3 Philosophie \n", + "4 Philosophie \n", + "5 Philosophie \n", + "6 Histoire \n", + "7 Histoire \n", + "8 Histoire \n", + "9 Histoire " ] }, + "execution_count": 20, "metadata": {}, - "execution_count": 36 + "output_type": "execute_result" } ], "source": [ @@ -2339,45 +1037,45 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 21, "metadata": { "id": "J9rObbvVr0zc" }, "outputs": [], "source": [ - "df.to_csv(drive_path + \"/predictions/predictions_parallel_superdomain.tsv\", sep=\"\\t\")" + "df.to_csv(drive_path + \"predictions/EDdA_dataset_articles_superdomainBERT_230313.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", - "source": [ - "df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)" - ], + "execution_count": null, "metadata": { "id": "8cX6XBq8_F5T" }, - "execution_count": 39, - "outputs": [] + "outputs": [], + "source": [ + "df.drop(columns=['contentLGE', 'contentEDdA'], inplace=True)" + ] }, { "cell_type": "code", - "source": [ - "df.to_csv(drive_path + \"/predictions/metadata_parallel_predictions_superdomain.csv\", sep=\",\", index=False)" - ], + "execution_count": null, "metadata": { "id": "7fx6BPpg0iNc" }, - "execution_count": 41, - "outputs": [] + "outputs": [], + "source": [ + "df.to_csv(drive_path + \"predictions/metadata_parallel_predictions_superdomain.csv\", sep=\",\", index=False)" + ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "7TD1mbKj_fXH" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } ], "metadata": { @@ -2410,77 +1108,147 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "11c285bed74e46a08fbb7bf88715aafa": { + "0180ffc200e8466191a11a723c82e43f": { "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", + "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3fde7318ebc3458cb64f8927fdcbaee3", - "IPY_MODEL_8d57eb44d9394604981a8f8f97f48b7c", - "IPY_MODEL_1cb6ed877c2b455b9463b12c2da877d8" - ], - "layout": "IPY_MODEL_5e03651dca944a5f91b675c503feeeac" + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c4ea841cb43747cdbce35f8f9c711cde", + "max": 29, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2d937fce2e6c4b69816352bd264ded41", + "value": 29 } }, - "3fde7318ebc3458cb64f8927fdcbaee3": { + "04a86b4164fa49de8fd47d4d373e1d81": { "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", + "_view_name": "ProgressView", + "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_0521c3cc6abd44ae989ac0701100045d", - "placeholder": "​", - "style": "IPY_MODEL_d12a8ef069af4d79870bd783f2343184", - "value": "Downloading: 100%" + "layout": "IPY_MODEL_4edc5b66f0eb44a0b05876fda90f0d1b", + "max": 1961828, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5285a390fb42415289d89585e04c8994", + "value": 1961828 } }, - "8d57eb44d9394604981a8f8f97f48b7c": { + "0521c3cc6abd44ae989ac0701100045d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "087ebcb093bb41c28485bdc762fb5da6": { "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "11c285bed74e46a08fbb7bf88715aafa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", + "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_28d38094dcd54d6694e2efad7fea6abb", - "max": 995526, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6f80ea06220b4a498e6169e55cd8800f", - "value": 995526 + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3fde7318ebc3458cb64f8927fdcbaee3", + "IPY_MODEL_8d57eb44d9394604981a8f8f97f48b7c", + "IPY_MODEL_1cb6ed877c2b455b9463b12c2da877d8" + ], + "layout": "IPY_MODEL_5e03651dca944a5f91b675c503feeeac" } }, "1cb6ed877c2b455b9463b12c2da877d8": { "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", "model_module_version": "1.5.0", + "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -2498,10 +1266,10 @@ "value": " 996k/996k [00:00<00:00, 2.00MB/s]" } }, - "5e03651dca944a5f91b675c503feeeac": { + "209ff109c8e142dfba37baea2d3d5de7": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2550,10 +1318,10 @@ "width": null } }, - "0521c3cc6abd44ae989ac0701100045d": { + "28d38094dcd54d6694e2efad7fea6abb": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2602,10 +1370,26 @@ "width": null } }, - "d12a8ef069af4d79870bd783f2343184": { + "2924cdc1348942cfb23f28a5383af3e4": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2b9b4eac7994405ca9bce38332df2629": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -2617,10 +1401,26 @@ "description_width": "" } }, - "28d38094dcd54d6694e2efad7fea6abb": { + "2d937fce2e6c4b69816352bd264ded41": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "39636049d60a4bb4bde7d0ef1af25d78": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2669,26 +1469,10 @@ "width": null } }, - "6f80ea06220b4a498e6169e55cd8800f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, "3de8b4b0d6494c058589c535dc24dc3e": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2737,47 +1521,10 @@ "width": null } }, - "e0df5e2d4ebd4eb3b126c16dadb2ba62": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9be44ba364a344f2b6b2546ae9d61ba8": { + "3fde7318ebc3458cb64f8927fdcbaee3": { "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_fe472df31774495c83aa159e116ba2ee", - "IPY_MODEL_0180ffc200e8466191a11a723c82e43f", - "IPY_MODEL_a07ac2935a3f4d84971ae9147a854969" - ], - "layout": "IPY_MODEL_af4ae976808042bf929ab17df10530b2" - } - }, - "fe472df31774495c83aa159e116ba2ee": { - "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", - "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -2789,61 +1536,53 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_b2277b3d600c43f999b3a07215ac2e13", + "layout": "IPY_MODEL_0521c3cc6abd44ae989ac0701100045d", "placeholder": "​", - "style": "IPY_MODEL_ebe5e6f8af1e4e04a8a2b5939ac09039", + "style": "IPY_MODEL_d12a8ef069af4d79870bd783f2343184", "value": "Downloading: 100%" } }, - "0180ffc200e8466191a11a723c82e43f": { + "4203b950e245481590e8105f31301782": { "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c4ea841cb43747cdbce35f8f9c711cde", - "max": 29, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2d937fce2e6c4b69816352bd264ded41", - "value": 29 + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" } }, - "a07ac2935a3f4d84971ae9147a854969": { + "4c46904f8e944d2b834ba9d384b00a8c": { "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", "model_module_version": "1.5.0", + "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_64b57e3be2c743b3b0e58d338243c656", - "placeholder": "​", - "style": "IPY_MODEL_6ca9688ac7fa4e638994b91242c0ac87", - "value": " 29.0/29.0 [00:00<00:00, 1.88kB/s]" + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ef37bbf1f34e4765b1803a607716d0d1", + "IPY_MODEL_c2d6041cd6674043953e094791ab9659", + "IPY_MODEL_e4c43817f44743388e6fd98b8dbb2eda" + ], + "layout": "IPY_MODEL_39636049d60a4bb4bde7d0ef1af25d78" } }, - "af4ae976808042bf929ab17df10530b2": { + "4edc5b66f0eb44a0b05876fda90f0d1b": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2892,10 +1631,26 @@ "width": null } }, - "b2277b3d600c43f999b3a07215ac2e13": { + "5285a390fb42415289d89585e04c8994": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "53643db8401846f2af6f15f5cd0c9998": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2944,25 +1699,10 @@ "width": null } }, - "ebe5e6f8af1e4e04a8a2b5939ac09039": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c4ea841cb43747cdbce35f8f9c711cde": { + "5e03651dca944a5f91b675c503feeeac": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3011,26 +1751,10 @@ "width": null } }, - "2d937fce2e6c4b69816352bd264ded41": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, "64b57e3be2c743b3b0e58d338243c656": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3081,8 +1805,8 @@ }, "6ca9688ac7fa4e638994b91242c0ac87": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -3094,98 +1818,26 @@ "description_width": "" } }, - "aa6a7a9106554f85a91150bd65c271d0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ea3f471546734f5994edfdc214319368", - "IPY_MODEL_04a86b4164fa49de8fd47d4d373e1d81", - "IPY_MODEL_be067a8a406f41779e42bd35abcbfcf0" - ], - "layout": "IPY_MODEL_7df91507e47d4a6992464293ce002a29" - } - }, - "ea3f471546734f5994edfdc214319368": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ecef81814a7c4481aa49eb73807bfe4d", - "placeholder": "​", - "style": "IPY_MODEL_2b9b4eac7994405ca9bce38332df2629", - "value": "Downloading: 100%" - } - }, - "04a86b4164fa49de8fd47d4d373e1d81": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4edc5b66f0eb44a0b05876fda90f0d1b", - "max": 1961828, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_5285a390fb42415289d89585e04c8994", - "value": 1961828 - } - }, - "be067a8a406f41779e42bd35abcbfcf0": { + "6f80ea06220b4a498e6169e55cd8800f": { "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "ProgressStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_53643db8401846f2af6f15f5cd0c9998", - "placeholder": "​", - "style": "IPY_MODEL_bc4825e1a43f4a20b496d82ea3687e6f", - "value": " 1.96M/1.96M [00:00<00:00, 2.16MB/s]" + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" } }, "7df91507e47d4a6992464293ce002a29": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3234,77 +1886,99 @@ "width": null } }, - "ecef81814a7c4481aa49eb73807bfe4d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", + "8d57eb44d9394604981a8f8f97f48b7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_28d38094dcd54d6694e2efad7fea6abb", + "max": 995526, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6f80ea06220b4a498e6169e55cd8800f", + "value": 995526 + } + }, + "9be44ba364a344f2b6b2546ae9d61ba8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fe472df31774495c83aa159e116ba2ee", + "IPY_MODEL_0180ffc200e8466191a11a723c82e43f", + "IPY_MODEL_a07ac2935a3f4d84971ae9147a854969" + ], + "layout": "IPY_MODEL_af4ae976808042bf929ab17df10530b2" } }, - "2b9b4eac7994405ca9bce38332df2629": { + "a07ac2935a3f4d84971ae9147a854969": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "HTMLModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_64b57e3be2c743b3b0e58d338243c656", + "placeholder": "​", + "style": "IPY_MODEL_6ca9688ac7fa4e638994b91242c0ac87", + "value": " 29.0/29.0 [00:00<00:00, 1.88kB/s]" } }, - "4edc5b66f0eb44a0b05876fda90f0d1b": { + "aa6a7a9106554f85a91150bd65c271d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ea3f471546734f5994edfdc214319368", + "IPY_MODEL_04a86b4164fa49de8fd47d4d373e1d81", + "IPY_MODEL_be067a8a406f41779e42bd35abcbfcf0" + ], + "layout": "IPY_MODEL_7df91507e47d4a6992464293ce002a29" + } + }, + "af4ae976808042bf929ab17df10530b2": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3353,26 +2027,10 @@ "width": null } }, - "5285a390fb42415289d89585e04c8994": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "53643db8401846f2af6f15f5cd0c9998": { + "b2277b3d600c43f999b3a07215ac2e13": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3423,8 +2081,8 @@ }, "bc4825e1a43f4a20b496d82ea3687e6f": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -3436,32 +2094,10 @@ "description_width": "" } }, - "4c46904f8e944d2b834ba9d384b00a8c": { + "be067a8a406f41779e42bd35abcbfcf0": { "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ef37bbf1f34e4765b1803a607716d0d1", - "IPY_MODEL_c2d6041cd6674043953e094791ab9659", - "IPY_MODEL_e4c43817f44743388e6fd98b8dbb2eda" - ], - "layout": "IPY_MODEL_39636049d60a4bb4bde7d0ef1af25d78" - } - }, - "ef37bbf1f34e4765b1803a607716d0d1": { - "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", - "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -3473,16 +2109,16 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_c3e73d423c2c41c0a942331070fda723", + "layout": "IPY_MODEL_53643db8401846f2af6f15f5cd0c9998", "placeholder": "​", - "style": "IPY_MODEL_087ebcb093bb41c28485bdc762fb5da6", - "value": "Downloading: 100%" + "style": "IPY_MODEL_bc4825e1a43f4a20b496d82ea3687e6f", + "value": " 1.96M/1.96M [00:00<00:00, 2.16MB/s]" } }, "c2d6041cd6674043953e094791ab9659": { "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -3503,31 +2139,10 @@ "value": 625 } }, - "e4c43817f44743388e6fd98b8dbb2eda": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_209ff109c8e142dfba37baea2d3d5de7", - "placeholder": "​", - "style": "IPY_MODEL_4203b950e245481590e8105f31301782", - "value": " 625/625 [00:00<00:00, 35.2kB/s]" - } - }, - "39636049d60a4bb4bde7d0ef1af25d78": { + "c3e73d423c2c41c0a942331070fda723": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3576,10 +2191,10 @@ "width": null } }, - "c3e73d423c2c41c0a942331070fda723": { + "c4ea841cb43747cdbce35f8f9c711cde": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3628,10 +2243,10 @@ "width": null } }, - "087ebcb093bb41c28485bdc762fb5da6": { + "d12a8ef069af4d79870bd783f2343184": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -3645,8 +2260,8 @@ }, "de270f0aa8194e0bb470e693a35d7d6e": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3695,26 +2310,82 @@ "width": null } }, - "2924cdc1348942cfb23f28a5383af3e4": { + "e0df5e2d4ebd4eb3b126c16dadb2ba62": { "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", - "bar_color": null, "description_width": "" } }, - "209ff109c8e142dfba37baea2d3d5de7": { + "e4c43817f44743388e6fd98b8dbb2eda": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_209ff109c8e142dfba37baea2d3d5de7", + "placeholder": "​", + "style": "IPY_MODEL_4203b950e245481590e8105f31301782", + "value": " 625/625 [00:00<00:00, 35.2kB/s]" + } + }, + "ea3f471546734f5994edfdc214319368": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ecef81814a7c4481aa49eb73807bfe4d", + "placeholder": "​", + "style": "IPY_MODEL_2b9b4eac7994405ca9bce38332df2629", + "value": "Downloading: 100%" + } + }, + "ebe5e6f8af1e4e04a8a2b5939ac09039": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ecef81814a7c4481aa49eb73807bfe4d": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3763,19 +2434,46 @@ "width": null } }, - "4203b950e245481590e8105f31301782": { + "ef37bbf1f34e4765b1803a607716d0d1": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "HTMLModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3e73d423c2c41c0a942331070fda723", + "placeholder": "​", + "style": "IPY_MODEL_087ebcb093bb41c28485bdc762fb5da6", + "value": "Downloading: 100%" + } + }, + "fe472df31774495c83aa159e116ba2ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b2277b3d600c43f999b3a07215ac2e13", + "placeholder": "​", + "style": "IPY_MODEL_ebe5e6f8af1e4e04a8a2b5939ac09039", + "value": "Downloading: 100%" } } } @@ -3783,4 +2481,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} -- GitLab