From 9b5fa5c02397a09e503e55d37ed6b9c05a535a5d Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Sun, 10 Jul 2022 18:39:05 +0200 Subject: [PATCH] Delete CorpusTEI_EDdA_to_dataframe.ipynb --- notebooks/CorpusTEI_EDdA_to_dataframe.ipynb | 2991 ------------------- 1 file changed, 2991 deletions(-) delete mode 100644 notebooks/CorpusTEI_EDdA_to_dataframe.ipynb diff --git a/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb b/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb deleted file mode 100644 index 13d04f1..0000000 --- a/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb +++ /dev/null @@ -1,2991 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "metallic-shelf", - "metadata": {}, - "source": [ - "# Préparation du corpus EDdA pour la classification en domaine" - ] - }, - { - "cell_type": "markdown", - "id": "designing-advice", - "metadata": {}, - "source": [ - "## Preparing data" - ] - }, - { - "cell_type": "markdown", - "id": "floppy-fleet", - "metadata": {}, - "source": [ - "### Import des librairies" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "appreciated-victim", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from bs4 import BeautifulSoup\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import urllib, json\n", - "from urllib.request import urlopen" - ] - }, - { - "cell_type": "markdown", - "id": "c7fc80b7", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "framed-fossil", - "metadata": {}, - "source": [ - "### Parsing des articles TEI" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "suburban-honduras", - "metadata": {}, - "outputs": [], - "source": [ - "input_path = \"/Users/lmoncla/Documents/Data/Corpus/EDDA/Alice/EDdA/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "scenic-vermont", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Volume : 11\n", - 
"Volume : 16\n", - "Volume : 17\n", - "Volume : 10\n", - "Volume : 5\n", - "Volume : 2\n", - "Volume : 3\n", - "Volume : 4\n", - "Volume : 15\n", - "Volume : 12\n", - "Volume : 13\n", - "Volume : 14\n", - "Volume : 1\n", - "Volume : 6\n", - "Volume : 8\n", - "Volume : 9\n", - "Volume : 7\n" - ] - } - ], - "source": [ - "# récupération dans une liste des métadonnées (volume, numéro, nom de l'article, classe et auteur) à partir des fichiers TEI\n", - "data = []\n", - "\n", - "for tome in os.listdir(input_path):\n", - " volume = tome[1:]\n", - " print(\"Volume : \", volume)\n", - " \n", - " for article in os.listdir(input_path + tome +\"/\"):\n", - " #print(\"Article : \", article[7:-4])\n", - " numero = article[7:-4]\n", - " extension = article[-4:]\n", - " if extension == '.tei':\n", - "\n", - " try:\n", - " soup = BeautifulSoup(open(input_path+tome+\"/\"+article))\n", - "\n", - " head = soup.find(type=\"head\")\n", - " author = soup.find(type=\"author\")\n", - " normclass = soup.find(type=\"normclass\")\n", - " classEDdA = soup.find(type=\"class\")\n", - " \n", - " #print(volume, numero, head.get('value'), normclass.get('value'), author.get('value'))\n", - " data.append([int(volume), int(numero), head.get('value').strip(), normclass.get('value').strip(), classEDdA.get('value').strip(), author.get('value').strip()])\n", - " \n", - " except AttributeError as e:\n", - " #print('Volume : ', volume, ' Numéro : ', numero)\n", - " #print('Error : ' + str(e))\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "excess-waterproof", - "metadata": {}, - "outputs": [], - "source": [ - "# transformation de la liste en dataframe\n", - "df = pd.DataFrame(data, columns=['volume', 'numero', 'head', 'normClass', 'classEDdA', 'author'])\n", - "df = df.sort_values(['volume', 'numero']).reset_index(drop = True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "blocked-reading", - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>normClass</th>\n", - " <th>classEDdA</th>\n", - " <th>author</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>41327</th>\n", - " <td>10</td>\n", - " <td>2211</td>\n", - " <td>MILIAIRE fievre</td>\n", - " <td>Médecine</td>\n", - " <td>Medecine.</td>\n", - " <td>Jaucourt</td>\n", - " </tr>\n", - " <tr>\n", - " <th>69509</th>\n", - " <td>16</td>\n", - " <td>3317</td>\n", - " <td>TRIMONTIUM</td>\n", - " <td>Géographie ancienne</td>\n", - " <td>Géog. anc.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>32448</th>\n", - " <td>8</td>\n", - " <td>1600</td>\n", - " <td>HRADSCHIN</td>\n", - " <td>Géographie</td>\n", - " <td>Géog.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>60365</th>\n", - " <td>14</td>\n", - " <td>4069</td>\n", - " <td>Sauveur</td>\n", - " <td>Art numismatique</td>\n", - " <td>Art numismat.</td>\n", - " <td>Jaucourt</td>\n", - " </tr>\n", - " <tr>\n", - " <th>15763</th>\n", - " <td>3</td>\n", - " <td>3846</td>\n", - " <td>CONFORMISTES, (non-)</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>33076</th>\n", - " <td>8</td>\n", - " <td>2228</td>\n", - " <td>Jardin</td>\n", - " <td>Fauconnerie</td>\n", - " <td>Fauconnerie.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6245</th>\n", - " <td>2</td>\n", - " <td>985</td>\n", - " <td>Bassin</td>\n", - " <td>Boulangerie</td>\n", - " 
<td>terme de Boulanger</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1967</th>\n", - " <td>1</td>\n", - " <td>1969</td>\n", - " <td>ALTIN</td>\n", - " <td>Commerce</td>\n", - " <td>Commerce.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11512</th>\n", - " <td>2</td>\n", - " <td>6252</td>\n", - " <td>CAVER</td>\n", - " <td>Escrime</td>\n", - " <td>en Escrime.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23387</th>\n", - " <td>5</td>\n", - " <td>2106</td>\n", - " <td>EMPANNON</td>\n", - " <td>Charpenterie</td>\n", - " <td>Charpent.</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head normClass \\\n", - "41327 10 2211 MILIAIRE fievre Médecine \n", - "69509 16 3317 TRIMONTIUM Géographie ancienne \n", - "32448 8 1600 HRADSCHIN Géographie \n", - "60365 14 4069 Sauveur Art numismatique \n", - "15763 3 3846 CONFORMISTES, (non-) unclassified \n", - "33076 8 2228 Jardin Fauconnerie \n", - "6245 2 985 Bassin Boulangerie \n", - "1967 1 1969 ALTIN Commerce \n", - "11512 2 6252 CAVER Escrime \n", - "23387 5 2106 EMPANNON Charpenterie \n", - "\n", - " classEDdA author \n", - "41327 Medecine. Jaucourt \n", - "69509 Géog. anc. unsigned \n", - "32448 Géog. unsigned \n", - "60365 Art numismat. Jaucourt \n", - "15763 unclassified unsigned \n", - "33076 Fauconnerie. unsigned \n", - "6245 terme de Boulanger unsigned \n", - "1967 Commerce. unsigned \n", - "11512 en Escrime. unsigned \n", - "23387 Charpent. 
unsigned " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# affichage aléatoire de 50 lignes du dataframe\n", - "df.sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "expired-click", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "74190" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# nombre d'articles dans le dataframe\n", - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "considered-adjustment", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>classEDdA</th>\n", - " <th>author</th>\n", - " </tr>\n", - " <tr>\n", - " <th>normClass</th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th></th>\n", - " <td>44</td>\n", - " <td>44</td>\n", - " <td>44</td>\n", - " <td>44</td>\n", - " <td>44</td>\n", - " </tr>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " </tr>\n", - " <tr>\n", - " <th>Abus des langues</th>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <th>Accord de sons</th>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " 
</tr>\n", - " <tr>\n", - " <th>Acoustique</th>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head classEDdA author\n", - "normClass \n", - " 44 44 44 44 44\n", - "0 17 17 17 17 17\n", - "Abus des langues 1 1 1 1 1\n", - "Accord de sons 1 1 1 1 1\n", - "Acoustique 6 6 6 6 6" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# On regroupe les lignes du dataframe en fonction du normclass\n", - "classes = df.groupby(['normClass']).count()\n", - "classes.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "instructional-variation", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2908" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Nombre de classes \n", - "len(classes)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "handmade-contest", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12685" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# nombre d'articles 'unclassified'\n", - "len(df.loc[df['normClass']==\"unclassified\",:])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "crude-olympus", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1614" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# nombre de classes avec un seul article\n", - "len(classes.loc[classes['volume']==1])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "sized-barrier", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2656" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ 
- "# nombre de classes avec moins de 20 articles\n", - "len(classes.loc[classes['volume']<20])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "indian-selection", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "weighted-hanging", - "metadata": {}, - "source": [ - "### Enregistrement" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "stainless-stewart", - "metadata": {}, - "outputs": [], - "source": [ - "# enregistrement du résultat du groupby\n", - "#classes['volume'].to_csv('classesEDdA.tsv',sep='\\t',header=False) " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "hearing-olive", - "metadata": {}, - "outputs": [], - "source": [ - "# enregistrement du dataframe (permet de ne pas reparser tous les fichiers TEI pour recharger ce dataframe)\n", - "df.to_csv('../../../Data/EDdA-Classification/EDdA_dataframe_orginal.tsv',sep='\\t', index=False) " - ] - }, - { - "cell_type": "markdown", - "id": "stuck-courage", - "metadata": {}, - "source": [ - "### Lecture" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "thick-destiny", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('../../../Data/EDdA-Classification/EDdA_dataframe_orginal.tsv', sep='\\t') " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "typical-munich", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>volume</th>\n", - " <th>numero</th>\n", - " <th>head</th>\n", - " <th>normClass</th>\n", - " 
<th>classEDdA</th>\n", - " <th>author</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>Title Page</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>unsigned</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Diderot & d'Alembert</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>d'Alembert</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>1</td>\n", - " <td>5</td>\n", - " <td>A, a & a</td>\n", - " <td>Grammaire</td>\n", - " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", - " <td>Dumarsais5</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>1</td>\n", - " <td>6</td>\n", - " <td>A</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Dumarsais5</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head normClass \\\n", - "0 1 1 Title Page unclassified \n", - "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON unclassified \n", - "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS unclassified \n", - "3 1 5 A, a & a Grammaire \n", - "4 1 6 A unclassified \n", - "\n", - " classEDdA author \n", - "0 unclassified unsigned \n", - "1 unclassified Diderot & d'Alembert \n", - "2 unclassified d'Alembert \n", - "3 ordre Encyclopéd. Entend. Science de l'homme, ... 
Dumarsais5 \n", - "4 unclassified Dumarsais5 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "individual-protection", - "metadata": {}, - "outputs": [], - "source": [ - "# ensembles de domaines et domaines ENCCRE\n", - "domaines_regroupes = {}\n", - "domaines_regroupes['Agriculture - Economie rustique'] = ['Agriculture', 'Economie rustique', 'Fontainier', 'Graines', 'Jardinage', 'Moulin', 'Sucre', 'Tabac', 'Vigne', 'Vin']\n", - "domaines_regroupes['Anatomie'] = ['Anatomie', 'Economie animale']\n", - "domaines_regroupes['Antiquité'] = ['Antiquité', 'Iconologie', 'Mythologie']\n", - "domaines_regroupes['Architecture'] = ['Architecture', 'Carreleur', 'Carrier', 'Coupe des pierres', 'Couvreur', 'Décoration', 'Maçonnerie']\n", - "domaines_regroupes['Arts et métiers'] = ['Arts et métiers', 'Arts mécaniques', 'Manufacture']\n", - "domaines_regroupes['Beaux-arts'] = ['Beaux-arts', 'Dessin', 'Gravue', 'Peinture', 'Sculpture']\n", - "domaines_regroupes['Belles-lettres - Poésie'] = ['Belles-lettres', 'Eloquence', 'Littérature', 'Poésie', 'Rhétorique']\n", - "domaines_regroupes['Blason'] = ['Blason']\n", - "domaines_regroupes['Caractères'] = ['Caractères', 'Ecriture']\n", - "domaines_regroupes['Chasse'] = ['Chasse', 'Fauconnerie', 'Oisellerie', 'Vénerie']\n", - "domaines_regroupes['Chimie'] = ['Alchimie', 'Chimie', 'Docimasie']\n", - "domaines_regroupes['Commerce'] = ['Commerce', 'Marchand', 'Voiturier']\n", - "domaines_regroupes['Droit - Jurisprudence'] = ['Chancellerie', 'Corporation', 'Douane', 'Droit', 'Eaux et Forêts', 'Finance', 'Jurisprudence', 'Palais']\n", - "domaines_regroupes['Economie domestique'] = ['Cuisine','Economie domestique']\n", - "#domaines_regroupes['Géographie'] = ['Géographie', 'Géographie Histoire naturelle', 'Géographie ancienne', 'Géographie des Arabes', 'Géographie du moyen âge',\n", - "# 'Géographie 
ecclésiastique', 'Géographie historique', 'Géographie maritime ancienne', 'Géographie des Romains', 'Géographie morderne',\n", - "# 'Géographie naturelle', 'Géographie physique', 'Géographie sacrée', 'Géographie sainte', 'Géographie transcendante', 'Géographie transcendantee']\n", - "domaines_regroupes['Géographie'] = ['Géographie', 'Topographie']\n", - "domaines_regroupes['Grammaire'] = ['Grammaire', 'Langues', 'Synonymes']\n", - "domaines_regroupes['Histoire'] = ['Calendrier','Chevalerie','Chronologie','Coutumes','Généalogie','Histoire','Inscriptions','Inventions', 'Voyage']\n", - "domaines_regroupes['Histoire naturelle'] = ['Botanique','Conchyliologie','Fossiles','Histoire naturelle', 'Ichtyologie','Insectologie','Ophiologie','Ornithologie','Zoologie']\n", - "domaines_regroupes['Jeu'] = ['Jeu']\n", - "domaines_regroupes['Maréchage - Manège'] = ['Maréchage', 'Manège']\n", - "domaines_regroupes['Marine'] = ['Galère','Marine', 'Navigation', 'Rivière']\n", - "domaines_regroupes['Mathématiques'] = ['Algèbre','Analyse des hasards', 'Arithmétique', 'Arpentage','Géométrie', 'Mathématiques', 'Trigonométrie']\n", - "domaines_regroupes['Médailles'] = ['Médailles','Numismatique']\n", - "domaines_regroupes['Médecine - Chirurgie'] = ['Chirurgie', 'Diète', 'Gymnastique', 'Maladie', 'Matière médicale', 'Médecine', 'Pathologie', 'Physiologie', 'Séméiotique', 'Thérapeutique']\n", - "domaines_regroupes['Mesure'] = ['Balancier', 'Jaugeage', 'Mesure', 'Poids']\n", - "domaines_regroupes[\"Métiers\"] = ['Boucherie', 'Boulangerie', 'Brasserie', 'Charcuterie', 'Confiserie', 'Distillation', 'Epicerie', 'Pâtisserie', 'Rôtisserie', 'Vinaigrier']\n", - "domaines_regroupes[\"Métiers\"] += ['Bois', 'Boissellerie', 'Charpenterie', 'Charronnage', 'Coffretier', 'Ebénisterie', 'Formier', 'Layeterie', 'Menuiserie', 'Tonnelier', 'Vannerie']\n", - "domaines_regroupes[\"Métiers\"] += ['Bourrelier', 'Boyaudier', 'Cardier', 'Chamoiseur', 'Corroierie', 'Cuir', 'Gainier', 'Hongroyeur', 'Maroquinier', 
'Mégisserie', 'Parcheminerie', 'Peausserie', 'Pelleterie', 'Sellier', 'Tannerie']\n", - "domaines_regroupes[\"Métiers\"] += ['Aiguilletier-Epinglier', 'Ardoiserie', 'Argent', \"Batteur d'or\", 'Bijouterie', 'Bimblotier', 'Chaînetier', 'Chaudronnerie', 'Ciselure', 'Cloche', 'Cloutier', 'Coutellerie', 'Cuivre', 'Diamantaire', 'Dorure', 'Eperonnier', 'Fer']\n", - "domaines_regroupes[\"Métiers\"] += ['Ferblanterie', 'Fonderie', 'Forge', 'Fourbisseur', 'Glaces', 'Joaillier', 'Lapidaire', 'Lunetier', 'Marbrier', 'Maréchal-grossier', 'Métal', 'Metteur en oeuvre', 'Miroiterie', 'Or', 'Orfèvrerie']\n", - "domaines_regroupes[\"Métiers\"] += ['Pierres', 'Plomberie', \"Potier d'étain\", 'Serrurerie', 'Taillanderie', \"Tireur d'or\", 'Verrerie', 'Vitrerie']\n", - "domaines_regroupes[\"Métiers\"] += ['Cartier', 'Cartonnier', 'Imprimerie', 'Librairie', 'Marbreur de papier', 'Papeterie', 'Reliure']\n", - "domaines_regroupes[\"Métiers\"] += ['Bas au métier', 'Blanchissage des toiles', 'Blondier', 'Bonneterie', 'Bottier', 'Bourserie', 'Boutonnier', 'Broderie', 'Cardeur', 'Ceinturier', 'Chapellerie', 'Cordonnerie','Coton', 'Couture', 'Découpeur', 'Dentelle', 'Draperie']\n", - "domaines_regroupes[\"Métiers\"] += ['Etoffe', 'Fil', 'Friseur', 'Ganterie', 'Gazier', 'Laine', 'Lingerie', 'Mode', 'Ourdissage', 'Passementerie', 'Perruquier', 'Plumasserie', 'Rubanerie', 'Soierie', 'Tailleur', 'Tapisserie', 'Teinturerie', 'Tisserand', 'Toilerie', 'Tonderie de drap']\n", - "domaines_regroupes[\"Métiers\"] += ['Amidonnier', 'Blanchisserie de cire', 'Chandelier', 'Cirerie', 'Corderie', 'Emailleur', 'Eventailliste', 'Faïencier', 'Filassier', 'Fleuriste', 'Horlogerie', 'Marqueterie', 'Métiers peu attestés', 'Parfumeur', 'Paumier', 'Poterie']\n", - "domaines_regroupes[\"Métiers\"] += ['Salpêtrerie', 'Savonnerie', 'Sel', 'Tabatière', 'Tabletier-Cornetier', 'Tourneur', 'Vergetier', 'Vernisseur']\n", - "domaines_regroupes['Militaire (Art) - Guerre - Arme'] = ['Armurerie', 'Artificier', 'Artillerie', 
'Canon','Escrime','Fortification','Guerre','Milice','Militaire']\n", - "domaines_regroupes['Minéralogie'] = ['Lithologie','Métallurgie','Minéralogie']\n", - "domaines_regroupes['Monnaie'] = ['Monnaie']\n", - "domaines_regroupes['Musique'] = ['Danse', 'Lutherie','Musique','Orgue', 'Voix']\n", - "domaines_regroupes['Pêche'] = ['Pêche']\n", - "domaines_regroupes['Pharmacie'] = ['Drogues', 'Pharmacie']\n", - "domaines_regroupes['Philosophie'] = ['Education', 'Logique', 'Métaphysique', 'Morale', 'Philologie','Philosophie', 'Sciences']\n", - "domaines_regroupes['Physique - [Sciences physico-mathématiques]'] = ['Acoustique', 'Astrologie', 'Astronomie', 'Cosmographie-Cosmologie', 'Gnomonique', 'Hydraulique', 'Mécanique', 'Optique', 'Perspective', 'Physique', 'Science microscopique']\n", - "domaines_regroupes['Politique'] = ['Economie', 'Gouvernement', 'Police', 'Politique']\n", - "domaines_regroupes['Religion'] = ['Critique sacrée', 'Culte', 'Eglise', 'Histoire ecclésiastique', 'Idolâtrie', 'Religion', 'Théologie']\n", - "domaines_regroupes['Spectacle'] = ['Opéra','Spectacle', 'Théâtre']\n", - "domaines_regroupes['Superstition'] = ['Divination', 'Magie', 'Superstition']" - ] - }, - { - "cell_type": "markdown", - "id": "variable-instrument", - "metadata": {}, - "source": [ - "### Récupération correspondance EDdA / ENCCRE" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "south-equation", - "metadata": {}, - "outputs": [], - "source": [ - "df_correspondances = pd.read_csv(\"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Classification domaines EDdA/correspondances_ARTFL-ENCCRE.csv\") \n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "protecting-incentive", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " 
}\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>path</th>\n", - " <th>entreeid</th>\n", - " <th>tome</th>\n", - " <th>article</th>\n", - " <th>adresse</th>\n", - " <th>entree</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>T1/article5</td>\n", - " <td>v1-1-0</td>\n", - " <td>1</td>\n", - " <td>5</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>T1/article6</td>\n", - " <td>v1-1-1</td>\n", - " <td>1</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>T1/article7</td>\n", - " <td>v1-1-2</td>\n", - " <td>1</td>\n", - " <td>7</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>T1/article8</td>\n", - " <td>v1-1-3</td>\n", - " <td>1</td>\n", - " <td>8</td>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>T1/article9</td>\n", - " <td>v1-1-4</td>\n", - " <td>1</td>\n", - " <td>9</td>\n", - " <td>1</td>\n", - " <td>4</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " path entreeid tome article adresse entree\n", - "0 T1/article5 v1-1-0 1 5 1 0\n", - "1 T1/article6 v1-1-1 1 6 1 1\n", - "2 T1/article7 v1-1-2 1 7 1 2\n", - "3 T1/article8 v1-1-3 1 8 1 3\n", - "4 T1/article9 v1-1-4 1 9 1 4" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_correspondances.head()" - ] - }, - { - "cell_type": "markdown", - "id": "continuous-feedback", - "metadata": {}, - "source": [ - "### Test récupération données ENCCRE" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "7820200b", - "metadata": {}, - "outputs": [], - "source": [ - 
"import urllib, json\n", - "from urllib.request import urlopen" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "spread-feature", - "metadata": {}, - "outputs": [], - "source": [ - "json_url = urlopen(\"http://enccre.academie-sciences.fr/icefront/api/article/v1-544-0\")\n", - "data = json.loads(json_url.read())" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "facial-syndicate", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'géographie'" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data['annotations']['constit'][0]['domgen'][0]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "removed-nickel", - "metadata": {}, - "outputs": [], - "source": [ - "def get_key(val):\n", - " for key, value in domaines_regroupes.items():\n", - " for v in value:\n", - " v = v.replace(\" \", \"\")\n", - " if val == v.lower():\n", - " return key\n", - " \n", - " return None\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "nuclear-murder", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Histoire naturelle\n" - ] - } - ], - "source": [ - "print(get_key(\"histoirenaturelle\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "placed-homework", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null\n", - "!pip install spacy\n", - "!python -m spacy download fr_core_news_sm" - ] - }, - { - "cell_type": "markdown", - "id": "extraordinary-settlement", - "metadata": {}, - "source": [ - "### Ajout des colonnes domaines, texte, etc." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0c378939", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] /Users/lmoncla/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package wordnet to /Users/lmoncla/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import string\n", - "import nltk\n", - "from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer\n", - "\n", - "import spacy\n", - "nlp = spacy.load(\"fr_core_news_sm\")\n", - "\n", - "nltk.download('stopwords')\n", - "nltk.download('wordnet')" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "96448195", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "lst_stopwords = nltk.corpus.stopwords.words(\"french\")\n", - "lst_stopwords += ['plus', 'dun', 'deux', 'autre', 'cette', 'quelque', 'étoit', 'avoit', 'si', 'dont', 'quon', 'voyez', 'lautre', 'comme', 'fait', 'aussi', 'leurs', 'tous', 'toute', 'autres', 'dit', 'selon', 'tout']\n", - "lst_stopwords += ['étoient', 'faire', 'lon', 'celle', 'ainsi', 'quelle', 'être', 'faut', 'peut', 'entre', 'elles', 'ceux', 'donc', 'celui', 'nest', 'dautre', 'doit', 'cet', ]\n", - "lst_stopwords += [\"un\", \"deux\", \"trois\", \"quatre\", \"cinq\", \"six\", \"sept\", \"huit\", \"neuf\", \"dix\", \"très\", \"plus\", \"ni\", \"fit\", \"parce\", \"dire\"]\n", - "lst_stopwords += [\"douze\", \"toutes\", \"après\"]\n", - "lst_stopwords += [\"l\\'\", \"qu'\", \"s'\", \"c'\", \"d'\", \"n'\", \"j'\", \"m'\", \"t'\", \"jusqu'\", \"lorsqu'\", \"puisqu'\", \"quoiqu'\"]\n", - "\n", - "lem = FrenchLefffLemmatizer()\n", - "\n", - "def utils_preprocess_text(content_str):\n", 
- " #text = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\n", - " #text = nltk.tokenize.word_tokenize(text, language = \"french\")\n", - " \n", - " #text = unescape(mt.tokenize(text, return_str=True))\n", - " \n", - " text = nlp(content_str) # spacy\n", - "\n", - " return len(text), \" \".join([lem.lemmatize(word.text.lower()) for word in text if word.text.lower() not in lst_stopwords and word.text not in string.punctuation])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "de693cc7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8 indication donner aujourd'hui ...\n" - ] - } - ], - "source": [ - "a, b = utils_preprocess_text(\"L'indication qu'il faut donner aujourd'hui ...\")\n", - "print(a, b)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "cf7bb9ca", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"A, préposition vient du latin à , à dextris, à sinistris, à droite, à gauche. Plus souvent encore notre\\nà vient de la préposition latine ad, loqui ad, parler\\nà . On trouve aussi dicere ad. Cic. It lucrum ad me,\\n(Plaute) le profit en vient à moi. Sinite parvulos venire ad me, laissez venir ces enfans à moi.\\nObservez que a mot, n'est jamais que ou la troisieme \\npersonne du présent de l'indicatis du verbe\\navoir, ou une simple préposition. Ainsi à n'est jamais\\nadverbe, comme quelques Grammairiens l'ont cru,\\nquoiqu il entre dans plusieurs façons de parler adverbiales.\\nCar l'adverbe n'a pas besoin d'être suivi d'un\\nautre mot qui le détermine, ou, comme disent communément \\nles Grammairiens, l'adverbe n'a jamais\\nde régime ; parce que l'adverbe renferme en soi la\\npréposition & le nom : prudemment, avec prudence.\\n(V. Adverbe) au lieu que la préposition a toûjours\\nun régime, c'est-à -dire, qu'elle est toujours suivie\\nd'un autre mot, qui détermine la relation ou l'espece\\nde rapport que la préposition indique. 
Ainsi la préposition \\nà peut bien entrer, comme toutes les autres\\nprépositions, dans des façons de parler adverbiales:\\nmais comme elle est toûjours suivie de son complément,\\nou, comme on dit, de son régime, elle ne peut\\njamais être adverbe.\\nA n'est pas non plus une simple particule qui marque\\n\\n le datif ; parce qu'en françois nous n'avons ni\\ndéclinaison, ni cas, ni par conséquent de datif. V.\\nCas. Le rapport que les Latins marquoient par la\\nterminaison du datif, nous l'indiquons par la préposition \\nà . C'est ainsi que les Latins mêmes se sont servis\\nde la préposition ad, quod attinet ad me. Cic. Accedit\\nad, referre ad aliquem, & alicui. Ils disoient aussi également loqui ad aliquem, & loqui alicui, parler à quelqu'un, &c.\\nA l'égard des différens usages de la préposition à ,\\nil faut observer 1. que toute préposition est entre\\ndeux termes, qu'elle lie & qu'elle met en rapport.\\n2. Que ce rapport est souvent marqué par la signification \\npropre de la préposition même, comme\\navec, dans, sur, &c.\\n3. Mais que souvent aussi les prépositions, surtout\\nà , de ou du, outre le rapport qu'elles indiquent quand\\nelles sont prises dans leur sens primitif & propre, ne\\nsont ensuite par figure & par extension, que de simples \\nprépositions unitives ou indicatives, qui ne font\\nque mettre deux mots en rapport ; ensorte qu'alors\\nc'est à l'esprit même à remarquer la sorte de rapport\\nqu'il y a entre les deux termes de la relation unis entre-eux par la préposition : par exemple, approchez-vous du feu : du, lie feu avec approchez-vous, & l'esprit \\nobserve ensuite un rapport d'approximation,\\nque du ne marque pas. Eloignez-vous du feu ; du, lie\\nfeu avec éloignez-vous, & l'esprit observe-là un rapport \\nd'éloignement. Vous voyez que la même préposition \\nsert à marquer des rapports opposés. On dit\\nde même donner à & ôter à . 
Ainsi ces sortes de rapports\\ndifferent autant que les mots different entre-eux.\\nJe crois donc que lorsque les prépositions ne sont,\\nou ne paroissent pas prises dans le sens propre de leur\\npremiere destination, & que par conséquent elles\\nn'indiquent pas par elles-mêmes la sorte de rapport\\nparticulier que celui qui parle veut faire entendre ;\\nalors c'est à celui qui écoute ou qui lit, à reconnoître\\nla sorte de rapport qui se trouve entre les mots liés\\npar la préposition simplement unitive & indicative.\\nCependant quelques Grammairiens ont mieux aimé \\népuiser la Métaphysique la plus recherchée, &\\nsi je l'ose dire, la plus inutile & la plus vaine, que\\nd'abandonner le Lecteur au discernement que lui donne \\nla connoissance & l'usage de sa propre Langue.\\nRapport de cause, rapport d'effet, d'instrument, de situation,\\nd'époque, table à pieds de biche, c'est-là un rapport \\nde forme, dit M. l'Abbé Girard, tom. II. p. 199.\\nBassin à barbe, rapport de service, (id. ib.) Pierre à feu,\\nrapport de propriété productive, (id. ib.) &c. La préposition \\nà n'est point destinée à marquer par elle-même un rapport de propriété productive, ou de service,\\nou de forme, &c. quoique ces rapports se trouvent\\nentre les mots liés par la préposition à . D'ailleurs,\\nles mêmes rapports sont souvent indiqués par des\\nprépositions différentes, & souvent des rapports opposés \\nsont indiqués par la même préposition.\\nIl me paroit donc que l'on doit d'abord observer la\\npremiere & principale destination d'une préposition.\\nPar exemple : la principale destination de la préposition \\nà , est de marquer la relation d'une chose à une\\nautre, comme, le terme où l'on va, ou à quoi ce\\nqu'on fait se termine, le but, la sin, l'attribution,\\nle pourquoi. Aller à Rome, préter de l'argent à usure,\\nà gros intérét. 
Donner quelque chose à quelqu'un, &c.\\nLes autres usages de cette préposition reviennent ensuite \\nà ceux-là par catachrese, abus, extension, ou\\nimitation : mais il est bon de remarquer quelques-uns\\nde ces usages, afin d'avoir des exemples qui puissent\\nservir de regle, & aider à décider les doutes par analogie \\n& par imitation. On dit donc:\\n\\nAprès un nom substantif.\\nAir à chanter. Billet à ordre, c'est-à -dire, payable\\n\\n\\nà ordre. Chaise à deux. Doute à éclaircir. Entreprise à \\nexécuter. Femme à la hotte? (au vocatif). Grenier à \\nsel. Habit à la mode. Instrument à vent. Lettre de change\\nà vûe, à dix jours de vûe. Matiere à procès. Nez à lunette.\\nOEufs à la coque. Plaine à perte de vûe. Question\\nà juger. Route à gauche. Vache à lait.\\n\\nA après un adjectif\\nAgréable à la vûe. Bon à prendre & à laisser. Contraire à la santé. Délicieux à manger. Facile à faire.\\nObservez qu'on dit : Il est facile de faire cela.\\n\\nQuand on le veut il est facile\\nDe s'assûrer un repos plein d'appas. Quinault.\\nLa raison de cette différence est que dans le dernier \\nexemple de n'a pas rapport à facile, mais à il ; il,\\nhoc, cela, à savoir de faire, &c. est facile, est une\\nchose facile. Ainsi, il, de s'assûrer un repos plein d'appas, est le sujet de la proposition, & est facile en est\\nl'attribut.\\n\\nQu'il est doux de trouver dans un amant qu'on aime\\nUn époux que l'on doit aimer! (Idem.)\\nIl, à savoir, de trouver un époux dans un amant,\\n&c. est doux, est une chose douce. (V. Proposition).\\nIl est gauche à tout ce qu'il fait. Heureux à la guerre.\\nHabile à dessiner, à écrire. Payable a ordre. Pareil à ,\\n&c. Propre à , &c. Semblable à , &c. Utile à la santé.\\n\\nAprès un verbe.\\nS'abandonner à ses passions. S'amuser à des bagatelles.\\nApplaudir à quelqu'un. Aimer à boire, à faire du bien.\\nLes hommes n'aiment point à admirer les autres ; ils\\ncherchent eux-mêmes à être goûtés & à être applaudis.\\nLa Bruyere. 
Aller à cheval, à califourchon, c'est-à -dire, jambe deçà , jambe delà . S'appliquer à , &c. S'attacher à , &c. Blesser a, il a été blessé à la jambe. Crier\\nà l'aide, au feu, au secours. Conseiller quelque chose à \\nquelqu'un. Donner à boire à quelqu'un. Demander à \\nboire. Etre à . Il est à écrire, à jouer. Il est à jeun. Il\\nest à Rome. Il est à cent lieues. Il est long-tems à venir.\\nCela est à faire, à taire, à publier, à payer. C'est à vous\\nà mettre le prix à votre marchandise. J'ai fait cela à votre\\nconsidération, à votre intention. Il faut des livres à votre\\nfils. Joüer à Colin Maillard, joüer à l'ombre, aux échecs.\\nGarder à vûe. La dépense se monte à cent écus, & la recette \\nà , &c. Monter à cheval. Payer à quelqu'un. Payer\\nà vûe, à jour marqué. Persuader à . Préter à . Puiser à \\nla source. Prendre garde à soi. Prendre à gauche. Ils\\nvont un à un, deux à deux, trois à trois. Voyons à qui\\nl'aura, c'est-à -dire, voyons à ceci, (attendamus ad\\nhoc nempe) à savoir qui l'aura.\\n\\nA avant une autre Préposition.\\nA se trouve quelquefois avant la préposition de\\ncomme en ces exemples.\\n\\nPeut-on ne pas céder à de si puissans charmes?\\nEt peut-on refuser son coeur\\nA de beaux yeux qui le demandent?\\nJe crois qu'en ces occasions il y a une ellipse synthétique.\\nL'esprit est occupé des charmes particuliers\\nqui l'ont frappé; & il met ces charmes au rang des\\ncharmes puissans, dont on ne sauroit se garantir.\\nPeut-on ne pas céder à ces charmes qui sont du nombre \\ndes charmes si puissans, &c. Peut-on ne pas céder\\nà l'attrait, au pouvoir de si puissans charmes? Peut-on\\nrefuser son coeur à ces yeux, qui sont de la classe\\ndes beaux yeux. 
L'usage abrege ensuite l'expression,\\n& introduit des façons de parler particulieres auxquelles \\non doit se conformer, & qui ne détruisent\\npas les regles.\\nAinsi, je crois que de ou des sont toûjours des prépositions \\nextractives, & que quand on dit des Savans\\nsoûtiennent, des hommes m'ont dit, &c. des Savans, des\\nhommes, ne sont pas au nominatif. Et de même quand\\non dit, j'ai vû des hommes, j'ai vû des femmes, &c. des\\n\\nhommes, des femmes, ne sont pas à l'accusatif ; car,\\nsi l'on veut bien y prendre garde, on reconnoîtra\\nque ex hominibus, ex mulieribus, &c. ne peuvent\\nêtre ni le sujet de la proposition, ni le terme de l'action \\ndu verbe ; & que celui qui parle veut dire, que\\nquelques-uns des Savans soûtiennent, &c. quelques-uns des hommes, quelques-unes des femmes, disent, &c.\\n\\nA après des adverbes.\\nOn ne se sert de la préposition à après un adverbe,\\nque lorsque l'adverbe marque relation. Alors l'adverbe \\nexprime la sorte de relation, & la préposition\\nindique le corrélatif. Ainsi, on dit conformément à .\\nOn a jugé conformément à l'Ordonnance de 1667. On\\ndit aussi relativement à .\\nD'ailleurs l'adverbe ne marquant qu'une circonstance \\nabsolue & déterminée de l'action, n'est pas\\nsuivi de la préposition à .\\n\\nA en des façons de parler adverbiales, & en celles qui\\nsont équivalentes à des prépositions Latines, ou de\\nquelqu'autre Langue.\\nA jamais, à toûjours. A l'encontre. Tour à tour.\\nPas à pas. Vis-à -vis. A pleines mains. A fur & à mesure.\\nA la fin, tandem, aliquando, C'est-à -dire, nempe,\\nscilicet. Suivre à la piste. Faire le diable à quatre.\\nSe faire tenir à quatre. A cause, qu'on rend en latin par\\nla proposition propter. A raison de. Jusqu'à , ou jusques\\nà . Au-delà . Au-dessus. Au-dessous. A quoi bon, quorsùm.\\nA la vûe, à la présence, ou en présence, coram.\\nTelles sont les principales occasions où l'usage a\\nconsacré la préposition à . 
Les exemples que nous venons \\nde rapporter, serviront a décider par analogie\\nles difficultés que l'on pourroit avoir sur cette préposition.\\nAu reste la préposition au est la même que la préposition \\nà . La seule différence qu'il y a entre l'une\\n& l'autre, c'est que à est un mot simple, & que au\\nest un mot composé.\\nAinsi il faut considérer la préposition à en deux\\nétats différens.\\nI. Dans son état simple : 1°. Rendez à César ce\\nqui appartient à Céfar ; 2°. se prêter à l'exemple ;\\n3°. se rendre à la raison. Dans le premier exemple\\nà est devant un nom sans article. Dans le second\\nexemple à est suivi de l'article masculin, parce que\\nle mot commence par une voyelle : à l'exemple, à \\nl'esprit, à l'amour. Enfin dans le dernier, la préposition \\nà précede l'article féminin, à la raison, à l'autorité.\\nII. Hors de ces trois cas, la préposition à devient\\nun mot composé par sa jonction avec l'article le ou\\navec l'article pluriel les. L'article le à cause du son\\nsourd de l'e muet a amené au, de sorte qu'au lieu\\nde dire à le nous disons au, si le nom ne commence\\npas par une voyelle. S'adonner au bien ; & au pluriel\\nau lieu de dire à les, nous changeons l en u, ce qui\\narrive souvent dans notre Langue, & nous disons\\naux, soit que le nom commence par une voyelle ou\\npar une consonne : aux hommes, aux femmes, &c.\\nainsi au est autant que à le, & aux que à les.\\nA est aussi une préposition inséparable qui entre\\ndans la composition des mots ; donner, s'adonner,\\nporter, apporter, mener, amener, &c. ce qui sert ou à \\nl'énergie, ou à marquer d'autres points de vûe ajoûtés \\nà la premiere signification du mot.\\nIl faut encore observer qu'en Grec à marque\\n1. Privation, & alors on l'appelle alpha privatif,\\nce que les Latins ont quelquefois imité, comme dans\\namens qui est compose de mens, entendement, intelligence,\\n& de l'alpha privatif. 
def getDomaineEnccre(volume, numero, classEDDA,
                     txt_dir="/Users/lmoncla/Documents/Data/Corpus/EDDA/all_txt/",
                     api_url="http://enccre.academie-sciences.fr/icefront/api/article/"):
    """Collect the ENCCRE domain and the (pre-processed) text of one article.

    The function does two independent things for the article identified by
    (``volume``, ``numero``):

    1. Looks the article up in the module-level ``df_correspondances`` table,
       queries the ENCCRE API for the matching entry and reads the first
       ``domgen`` value of its first ``constit`` annotation. That domain is
       mapped to its domain group with the module-level ``get_key``.
    2. Reads the plain-text transcription ``volumeVV-N.txt`` from ``txt_dir``,
       removes the class designation (``classEDDA``) from it, isolates the
       first paragraph and runs both texts through ``utils_preprocess_text``.

    Parameters
    ----------
    volume : int
        Volume (tome) number; zero-padded to two digits in the file name.
    numero : int
        Article number inside the volume.
    classEDDA : str or missing value
        Class designation as printed in the article. ``None``/``NaN``/``""``
        mean "no designation": nothing is removed from the text.
    txt_dir : str, optional
        Directory holding the plain-text articles (defaults to the path the
        notebook originally hard-coded).
    api_url : str, optional
        Base URL of the ENCCRE article API; the entry id is appended to it.

    Returns
    -------
    pandas.Series
        Eight positional values: entry id, domain, domain group, raw text,
        pre-processed text without the class designation, pre-processed first
        paragraph, word count of the full text, word count of the first
        paragraph. Unknown values are ``""`` (or ``0`` for the counts).

    Notes
    -----
    A failed API request no longer aborts the whole ``DataFrame.apply``: a
    warning is emitted and the domain fields are left empty for that row.
    """
    # Local imports: the notebook's import cell does not provide these names.
    import re
    import warnings

    domaine = ""
    ensemble_domaine = ""
    entreeid = ""

    # --- 1. ENCCRE domain -------------------------------------------------
    matches = df_correspondances.loc[
        (df_correspondances['tome'] == volume) & (df_correspondances['article'] == numero),
        'entreeid',
    ]
    if not matches.empty:
        entreeid = matches.iloc[0]

        data = {}
        try:
            # The context manager closes the HTTP response even when parsing fails.
            with urlopen(api_url + entreeid) as response:
                data = json.loads(response.read())
        except (OSError, ValueError) as err:
            # OSError covers URLError/HTTPError and socket failures,
            # ValueError covers malformed JSON.
            warnings.warn("ENCCRE lookup failed for entry {!r}: {}".format(entreeid, err))

        try:
            domaine = data['annotations']['constit'][0]['domgen'][0]
            ensemble_domaine = get_key(domaine)
        except (KeyError, IndexError, TypeError):
            # Entry without usable domain annotation: keep the empty defaults.
            pass

    # --- 2. Article text --------------------------------------------------
    # "{:02d}" reproduces the original "volume0N" / "volumeNN" naming scheme.
    txt_file = os.path.join(txt_dir, "volume{:02d}-{}.txt".format(int(volume), numero))

    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(txt_file, "r", encoding="utf-8") as handle:
            txtContent = handle.read()
    except FileNotFoundError:
        return pd.Series([entreeid, domaine, ensemble_domaine, "", "", "", 0, 0])

    # Remove the class designation from the text. A missing value must not be
    # stringified: str(NaN) == "nan" would be stripped out of ordinary words.
    txtContentWithoutClass = txtContent
    if not pd.isna(classEDDA) and str(classEDDA) != "":
        classEDDA = str(classEDDA)
        txtContentWithoutClass = txtContentWithoutClass.replace('(' + classEDDA + ')', "")
        txtContentWithoutClass = txtContentWithoutClass.replace(classEDDA, "")

    # First paragraph = everything up to the first blank line. The previous
    # separator '\n \n' required a single space on the blank line and therefore
    # never matched the "\n\n" breaks found in the corpus files.
    # NOTE(review): blank lines may also mark page breaks in the transcription,
    # so this can cut mid-sentence - confirm against the corpus.
    firstParagraph = re.split(r'\n[ \t]*\n', txtContentWithoutClass.lstrip(), maxsplit=1)[0]

    nbWords, txtContentWithoutClass = utils_preprocess_text(txtContentWithoutClass)
    nbWords1stPara, firstParagraph = utils_preprocess_text(firstParagraph)

    return pd.Series([entreeid, domaine, ensemble_domaine, txtContent,
                      txtContentWithoutClass, firstParagraph, nbWords, nbWords1stPara])
# Quick look at a random sample of the metadata before enrichment.
df.sample(5)


def _enccre_row(row):
    """Fetch ENCCRE domain and text features for one row of the dataframe."""
    return getDomaineEnccre(row.volume, row.numero, row.classEDdA)


# One Series of eight values per article; transposing yields one array per
# output column, which is unpacked before being stored in the dataframe.
enccre_features = df.apply(_enccre_row, axis=1)
(
    id_enccre_values,
    domaine_values,
    ensemble_domaine_values,
    content_values,
    content_without_class_values,
    first_paragraph_values,
    nb_words_values,
    nb_words_1st_para_values,
) = enccre_features.T.values

df['id_enccre'] = id_enccre_values
df['domaine_enccre'] = domaine_values
df['ensemble_domaine_enccre'] = ensemble_domaine_values
df['content'] = content_values
df['contentWithoutClass'] = content_without_class_values
df['firstParagraph'] = first_paragraph_values
df['nb_words'] = nb_words_values
df['nb_words_1stPara'] = nb_words_1st_para_values
<th>normClass</th>\n", - " <th>classEDdA</th>\n", - " <th>author</th>\n", - " <th>id_enccre</th>\n", - " <th>domaine_enccre</th>\n", - " <th>ensemble_domaine_enccre</th>\n", - " <th>content</th>\n", - " <th>contentWithoutClass</th>\n", - " <th>firstParagraph</th>\n", - " <th>nb_words</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>34467</th>\n", - " <td>8</td>\n", - " <td>3619</td>\n", - " <td>JOUBARBE</td>\n", - " <td>Botanique</td>\n", - " <td>Botan.</td>\n", - " <td>Jaucourt</td>\n", - " <td>v8-2770-0</td>\n", - " <td>botanique</td>\n", - " <td>Histoire naturelle</td>\n", - " <td>JOUBARBE, s. f. (Botan.) Sedum, genre de plant...</td>\n", - " <td>joubarbe s. f. sedum genre plante \\n fleur r...</td>\n", - " <td>joubarbe s. f. sedum genre plante \\n fleur r...</td>\n", - " <td>854</td>\n", - " </tr>\n", - " <tr>\n", - " <th>67530</th>\n", - " <td>16</td>\n", - " <td>1338</td>\n", - " <td>THYONÉ</td>\n", - " <td>Mythologie</td>\n", - " <td>Mytholog.</td>\n", - " <td>unsigned</td>\n", - " <td>v16-815-0</td>\n", - " <td>mythologie</td>\n", - " <td>Antiquité</td>\n", - " <td>THYONÉ, (Mytholog.) 
c'est, selon Ovide, le\\nno...</td>\n", - " <td>thyoné ovide \\n nom sou lequel sémélé mise j...</td>\n", - " <td>thyoné ovide \\n nom sou lequel sémélé mise j...</td>\n", - " <td>47</td>\n", - " </tr>\n", - " <tr>\n", - " <th>25346</th>\n", - " <td>6</td>\n", - " <td>408</td>\n", - " <td>Evêque de la cour</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>unsigned</td>\n", - " <td>v6-181-7</td>\n", - " <td>histoireecclésiastique</td>\n", - " <td>Religion</td>\n", - " <td>Evêque de la cour ; on donne quelquetois ce\\nt...</td>\n", - " <td>evêque cour donne quelquetois \\n titre grand a...</td>\n", - " <td>evêque cour donne quelquetois \\n titre grand a...</td>\n", - " <td>20</td>\n", - " </tr>\n", - " <tr>\n", - " <th>22927</th>\n", - " <td>5</td>\n", - " <td>1646</td>\n", - " <td>EGRATIGNÉE (Maniere)</td>\n", - " <td>Peinture</td>\n", - " <td>Peint.</td>\n", - " <td>Jaucourt</td>\n", - " <td>v5-775-0</td>\n", - " <td>peinture</td>\n", - " <td>Beaux-arts</td>\n", - " <td>EGRATIGNÉE, (Maniere) Peint. espece de\\npeintu...</td>\n", - " <td>egratignée maniere espece \\n peinture fresqu...</td>\n", - " <td>egratignée maniere espece \\n peinture fresqu...</td>\n", - " <td>256</td>\n", - " </tr>\n", - " <tr>\n", - " <th>66518</th>\n", - " <td>16</td>\n", - " <td>326</td>\n", - " <td>Tenaille</td>\n", - " <td>Docimastique</td>\n", - " <td>Docimastique.</td>\n", - " <td>Jaucourt</td>\n", - " <td>v16-170-1</td>\n", - " <td>docimasie</td>\n", - " <td>Chimie</td>\n", - " <td>Tenaille, s. f. (Docimastique.) entre les uste...</td>\n", - " <td>tenaille s. f. ustensile \\n art essai rend i...</td>\n", - " <td>tenaille s. f. ustensile \\n art essai rend i...</td>\n", - " <td>753</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head normClass classEDdA \\\n", - "34467 8 3619 JOUBARBE Botanique Botan. \n", - "67530 16 1338 THYONÉ Mythologie Mytholog. 
\n", - "25346 6 408 Evêque de la cour unclassified unclassified \n", - "22927 5 1646 EGRATIGNÉE (Maniere) Peinture Peint. \n", - "66518 16 326 Tenaille Docimastique Docimastique. \n", - "\n", - " author id_enccre domaine_enccre ensemble_domaine_enccre \\\n", - "34467 Jaucourt v8-2770-0 botanique Histoire naturelle \n", - "67530 unsigned v16-815-0 mythologie Antiquité \n", - "25346 unsigned v6-181-7 histoireecclésiastique Religion \n", - "22927 Jaucourt v5-775-0 peinture Beaux-arts \n", - "66518 Jaucourt v16-170-1 docimasie Chimie \n", - "\n", - " content \\\n", - "34467 JOUBARBE, s. f. (Botan.) Sedum, genre de plant... \n", - "67530 THYONÉ, (Mytholog.) c'est, selon Ovide, le\\nno... \n", - "25346 Evêque de la cour ; on donne quelquetois ce\\nt... \n", - "22927 EGRATIGNÉE, (Maniere) Peint. espece de\\npeintu... \n", - "66518 Tenaille, s. f. (Docimastique.) entre les uste... \n", - "\n", - " contentWithoutClass \\\n", - "34467 joubarbe s. f. sedum genre plante \\n fleur r... \n", - "67530 thyoné ovide \\n nom sou lequel sémélé mise j... \n", - "25346 evêque cour donne quelquetois \\n titre grand a... \n", - "22927 egratignée maniere espece \\n peinture fresqu... \n", - "66518 tenaille s. f. ustensile \\n art essai rend i... \n", - "\n", - " firstParagraph nb_words \n", - "34467 joubarbe s. f. sedum genre plante \\n fleur r... 854 \n", - "67530 thyoné ovide \\n nom sou lequel sémélé mise j... 47 \n", - "25346 evêque cour donne quelquetois \\n titre grand a... 20 \n", - "22927 egratignée maniere espece \\n peinture fresqu... 256 \n", - "66518 tenaille s. f. ustensile \\n art essai rend i... 
# Random sample of the enriched dataframe.
df.sample(5)

# Number of articles left without a domain by ENCCRE
# (based on the automatic correspondence table).
without_enccre_domain = df['domaine_enccre'] == ""
len(df.loc[without_enccre_domain])

# Number of articles left unclassified by ARTFL.
without_artfl_class = df['normClass'] == "unclassified"
len(df.loc[without_artfl_class])
<th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th></th>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11636</td>\n", - " <td>11637</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " <td>11640</td>\n", - " </tr>\n", - " <tr>\n", - " <th>acoustique</th>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " </tr>\n", - " <tr>\n", - " <th>agriculture</th>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " <td>112</td>\n", - " </tr>\n", - " <tr>\n", - " <th>aiguilletierepinglier</th>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>0</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " <td>114</td>\n", - " </tr>\n", - " <tr>\n", - " <th>alchimie</th>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head normClass classEDdA author \\\n", - "domaine_enccre \n", - " 11640 11640 11640 11636 11637 11640 \n", - "acoustique 12 12 12 12 12 12 \n", - "agriculture 112 112 112 112 112 112 \n", - "aiguilletierepinglier 
# Number of ENCCRE classes: one row per distinct domain, counting non-null cells.
classes_enccre = df.groupby(['domaine_enccre']).count()
classes_enccre.head()

len(classes_enccre)

# Save the enriched dataframe as a TSV file, then load it back.
dataframe_tsv = '../../../Data/EDdA-Classification/EDdA_dataframe_withContent.tsv'
df.to_csv(dataframe_tsv, sep='\t', index=False)

df = pd.read_csv(dataframe_tsv, sep='\t')

len(df)

# Keep only the articles that have text and an ENCCRE domain group.
required_columns = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre']
df = df.dropna(subset=required_columns)

len(df)

df.shape
<th>author</th>\n", - " <th>id_enccre</th>\n", - " <th>domaine_enccre</th>\n", - " <th>ensemble_domaine_enccre</th>\n", - " <th>content</th>\n", - " <th>contentWithoutClass</th>\n", - " <th>firstParagraph</th>\n", - " <th>nb_words</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>1</td>\n", - " <td>5</td>\n", - " <td>A, a & a</td>\n", - " <td>Grammaire</td>\n", - " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", - " <td>Dumarsais5</td>\n", - " <td>v1-1-0</td>\n", - " <td>grammaire</td>\n", - " <td>Grammaire</td>\n", - " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", - " <td>a a a s.m ordre encyclopéd \\n entend science h...</td>\n", - " <td>a a a s.m ordre encyclopéd \\n entend science h...</td>\n", - " <td>1092</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>1</td>\n", - " <td>6</td>\n", - " <td>A</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Dumarsais5</td>\n", - " <td>v1-1-1</td>\n", - " <td>grammaire</td>\n", - " <td>Grammaire</td>\n", - " <td>A, mot, est 1. 
la troisieme personne du présen...</td>\n", - " <td>a mot 1 troisieme personne présent \\n indicati...</td>\n", - " <td>a mot 1 troisieme personne présent \\n indicati...</td>\n", - " <td>381</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>1</td>\n", - " <td>7</td>\n", - " <td>A</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Dumarsais</td>\n", - " <td>v1-1-2</td>\n", - " <td>grammaire</td>\n", - " <td>Grammaire</td>\n", - " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", - " <td>a préposition vient latin dextris sinistris dr...</td>\n", - " <td>a préposition vient latin dextris sinistris dr...</td>\n", - " <td>3067</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>1</td>\n", - " <td>10</td>\n", - " <td>A, numismatique ou monétaire</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Mallet</td>\n", - " <td>v1-1-5</td>\n", - " <td>numismatique</td>\n", - " <td>Médailles</td>\n", - " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", - " <td>a numismatique monétaire revers \\n ancien méda...</td>\n", - " <td>a numismatique monétaire revers \\n ancien méda...</td>\n", - " <td>156</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>1</td>\n", - " <td>11</td>\n", - " <td>A, lapidaire</td>\n", - " <td>unclassified</td>\n", - " <td>unclassified</td>\n", - " <td>Mallet</td>\n", - " <td>v1-1-6</td>\n", - " <td>inscriptions</td>\n", - " <td>Histoire</td>\n", - " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", - " <td>a lapidaire ancien inscription \\n marbre c. si...</td>\n", - " <td>a lapidaire ancien inscription \\n marbre c. 
si...</td>\n", - " <td>122</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " volume numero head normClass \\\n", - "3 1 5 A, a & a Grammaire \n", - "4 1 6 A unclassified \n", - "5 1 7 A unclassified \n", - "8 1 10 A, numismatique ou monétaire unclassified \n", - "9 1 11 A, lapidaire unclassified \n", - "\n", - " classEDdA author id_enccre \\\n", - "3 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", - "4 unclassified Dumarsais5 v1-1-1 \n", - "5 unclassified Dumarsais v1-1-2 \n", - "8 unclassified Mallet v1-1-5 \n", - "9 unclassified Mallet v1-1-6 \n", - "\n", - " domaine_enccre ensemble_domaine_enccre \\\n", - "3 grammaire Grammaire \n", - "4 grammaire Grammaire \n", - "5 grammaire Grammaire \n", - "8 numismatique Médailles \n", - "9 inscriptions Histoire \n", - "\n", - " content \\\n", - "3 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", - "4 A, mot, est 1. la troisieme personne du présen... \n", - "5 A, préposition vient du latin à , à dextris, à ... \n", - "8 A, numismatique ou monétaire, sur le revers de... \n", - "9 A, lapidaire, dans les anciennes inscriptions ... \n", - "\n", - " contentWithoutClass \\\n", - "3 a a a s.m ordre encyclopéd \\n entend science h... \n", - "4 a mot 1 troisieme personne présent \\n indicati... \n", - "5 a préposition vient latin dextris sinistris dr... \n", - "8 a numismatique monétaire revers \\n ancien méda... \n", - "9 a lapidaire ancien inscription \\n marbre c. si... \n", - "\n", - " firstParagraph nb_words \n", - "3 a a a s.m ordre encyclopéd \\n entend science h... 1092 \n", - "4 a mot 1 troisieme personne présent \\n indicati... 381 \n", - "5 a préposition vient latin dextris sinistris dr... 3067 \n", - "8 a numismatique monétaire revers \\n ancien méda... 156 \n", - "9 a lapidaire ancien inscription \\n marbre c. si... 
122 " - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "corrected-batman", - "metadata": {}, - "outputs": [], - "source": [ - "df = df.loc[(df['nb_words']>=15)]" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "documentary-prince", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(58509, 13)" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "opened-november", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "\n", - "train_x, validation_x, train_y, validation_y = train_test_split(df, df[\"ensemble_domaine_enccre\"], test_size=0.2, random_state=42, stratify = df[\"ensemble_domaine_enccre\"] )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "noticed-evanescence", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(46807, 13)" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "welcome-homework", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(11702, 13)" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "validation_x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "hearing-moses", - "metadata": {}, - "outputs": [], - "source": [ - "train_x.to_csv('../../../Data/EDdA-Classification/training_set.tsv',sep='\\t',index=False) \n", - "validation_x.to_csv('../../../Data/EDdA-Classification/test_set.tsv',sep='\\t',index=False) \n" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - 
"id": "exterior-praise", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>ensemble_domaine_enccre</th>\n", - " <th>counts</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>Agriculture - Economie rustique</td>\n", - " <td>1163</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Anatomie</td>\n", - " <td>1073</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>Antiquité</td>\n", - " <td>1362</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>Architecture</td>\n", - " <td>1389</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>Arts et métiers</td>\n", - " <td>563</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>Beaux-arts</td>\n", - " <td>429</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>Belles-lettres - Poésie</td>\n", - " <td>1031</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>Blason</td>\n", - " <td>539</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>Caractères</td>\n", - " <td>114</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>Chasse</td>\n", - " <td>581</td>\n", - " </tr>\n", - " <tr>\n", - " <th>10</th>\n", - " <td>Chimie</td>\n", - " <td>520</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>Commerce</td>\n", - " <td>1879</td>\n", - " </tr>\n", - " <tr>\n", - " <th>12</th>\n", - " <td>Droit - Jurisprudence</td>\n", - " <td>6419</td>\n", - " </tr>\n", - " <tr>\n", - " <th>13</th>\n", - " <td>Economie domestique</td>\n", - " <td>135</td>\n", - " 
</tr>\n", - " <tr>\n", - " <th>14</th>\n", - " <td>Grammaire</td>\n", - " <td>2258</td>\n", - " </tr>\n", - " <tr>\n", - " <th>15</th>\n", - " <td>Géographie</td>\n", - " <td>13104</td>\n", - " </tr>\n", - " <tr>\n", - " <th>16</th>\n", - " <td>Histoire</td>\n", - " <td>3080</td>\n", - " </tr>\n", - " <tr>\n", - " <th>17</th>\n", - " <td>Histoire naturelle</td>\n", - " <td>4814</td>\n", - " </tr>\n", - " <tr>\n", - " <th>18</th>\n", - " <td>Jeu</td>\n", - " <td>282</td>\n", - " </tr>\n", - " <tr>\n", - " <th>19</th>\n", - " <td>Marine</td>\n", - " <td>2076</td>\n", - " </tr>\n", - " <tr>\n", - " <th>20</th>\n", - " <td>Maréchage - Manège</td>\n", - " <td>524</td>\n", - " </tr>\n", - " <tr>\n", - " <th>21</th>\n", - " <td>Mathématiques</td>\n", - " <td>698</td>\n", - " </tr>\n", - " <tr>\n", - " <th>22</th>\n", - " <td>Mesure</td>\n", - " <td>184</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23</th>\n", - " <td>Militaire (Art) - Guerre - Arme</td>\n", - " <td>1289</td>\n", - " </tr>\n", - " <tr>\n", - " <th>24</th>\n", - " <td>Minéralogie</td>\n", - " <td>111</td>\n", - " </tr>\n", - " <tr>\n", - " <th>25</th>\n", - " <td>Monnaie</td>\n", - " <td>317</td>\n", - " </tr>\n", - " <tr>\n", - " <th>26</th>\n", - " <td>Musique</td>\n", - " <td>685</td>\n", - " </tr>\n", - " <tr>\n", - " <th>27</th>\n", - " <td>Médailles</td>\n", - " <td>117</td>\n", - " </tr>\n", - " <tr>\n", - " <th>28</th>\n", - " <td>Médecine - Chirurgie</td>\n", - " <td>2275</td>\n", - " </tr>\n", - " <tr>\n", - " <th>29</th>\n", - " <td>Métiers</td>\n", - " <td>5254</td>\n", - " </tr>\n", - " <tr>\n", - " <th>30</th>\n", - " <td>Pharmacie</td>\n", - " <td>326</td>\n", - " </tr>\n", - " <tr>\n", - " <th>31</th>\n", - " <td>Philosophie</td>\n", - " <td>470</td>\n", - " </tr>\n", - " <tr>\n", - " <th>32</th>\n", - " <td>Physique - [Sciences physico-mathématiques]</td>\n", - " <td>1324</td>\n", - " </tr>\n", - " <tr>\n", - " <th>33</th>\n", - " <td>Politique</td>\n", - " <td>116</td>\n", - " </tr>\n", - " 
<tr>\n", - " <th>34</th>\n", - " <td>Pêche</td>\n", - " <td>210</td>\n", - " </tr>\n", - " <tr>\n", - " <th>35</th>\n", - " <td>Religion</td>\n", - " <td>1641</td>\n", - " </tr>\n", - " <tr>\n", - " <th>36</th>\n", - " <td>Spectacle</td>\n", - " <td>48</td>\n", - " </tr>\n", - " <tr>\n", - " <th>37</th>\n", - " <td>Superstition</td>\n", - " <td>109</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " ensemble_domaine_enccre counts\n", - "0 Agriculture - Economie rustique 1163\n", - "1 Anatomie 1073\n", - "2 Antiquité 1362\n", - "3 Architecture 1389\n", - "4 Arts et métiers 563\n", - "5 Beaux-arts 429\n", - "6 Belles-lettres - Poésie 1031\n", - "7 Blason 539\n", - "8 Caractères 114\n", - "9 Chasse 581\n", - "10 Chimie 520\n", - "11 Commerce 1879\n", - "12 Droit - Jurisprudence 6419\n", - "13 Economie domestique 135\n", - "14 Grammaire 2258\n", - "15 Géographie 13104\n", - "16 Histoire 3080\n", - "17 Histoire naturelle 4814\n", - "18 Jeu 282\n", - "19 Marine 2076\n", - "20 Maréchage - Manège 524\n", - "21 Mathématiques 698\n", - "22 Mesure 184\n", - "23 Militaire (Art) - Guerre - Arme 1289\n", - "24 Minéralogie 111\n", - "25 Monnaie 317\n", - "26 Musique 685\n", - "27 Médailles 117\n", - "28 Médecine - Chirurgie 2275\n", - "29 Métiers 5254\n", - "30 Pharmacie 326\n", - "31 Philosophie 470\n", - "32 Physique - [Sciences physico-mathématiques] 1324\n", - "33 Politique 116\n", - "34 Pêche 210\n", - "35 Religion 1641\n", - "36 Spectacle 48\n", - "37 Superstition 109" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "id": "unable-agenda", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - 
" .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>ensemble_domaine_enccre</th>\n", - " <th>counts</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>Agriculture - Economie rustique</td>\n", - " <td>930</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Anatomie</td>\n", - " <td>858</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>Antiquité</td>\n", - " <td>1090</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>Architecture</td>\n", - " <td>1111</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>Arts et métiers</td>\n", - " <td>451</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>Beaux-arts</td>\n", - " <td>343</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>Belles-lettres - Poésie</td>\n", - " <td>825</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>Blason</td>\n", - " <td>431</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>Caractères</td>\n", - " <td>91</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>Chasse</td>\n", - " <td>465</td>\n", - " </tr>\n", - " <tr>\n", - " <th>10</th>\n", - " <td>Chimie</td>\n", - " <td>416</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>Commerce</td>\n", - " <td>1503</td>\n", - " </tr>\n", - " <tr>\n", - " <th>12</th>\n", - " <td>Droit - Jurisprudence</td>\n", - " <td>5135</td>\n", - " </tr>\n", - " <tr>\n", - " <th>13</th>\n", - " <td>Economie domestique</td>\n", - " <td>108</td>\n", - " </tr>\n", - " <tr>\n", - " <th>14</th>\n", - " <td>Grammaire</td>\n", - " <td>1806</td>\n", - " </tr>\n", - " <tr>\n", - " <th>15</th>\n", - " <td>Géographie</td>\n", - " <td>10483</td>\n", - " </tr>\n", - " <tr>\n", - " 
<th>16</th>\n", - " <td>Histoire</td>\n", - " <td>2464</td>\n", - " </tr>\n", - " <tr>\n", - " <th>17</th>\n", - " <td>Histoire naturelle</td>\n", - " <td>3851</td>\n", - " </tr>\n", - " <tr>\n", - " <th>18</th>\n", - " <td>Jeu</td>\n", - " <td>226</td>\n", - " </tr>\n", - " <tr>\n", - " <th>19</th>\n", - " <td>Marine</td>\n", - " <td>1661</td>\n", - " </tr>\n", - " <tr>\n", - " <th>20</th>\n", - " <td>Maréchage - Manège</td>\n", - " <td>419</td>\n", - " </tr>\n", - " <tr>\n", - " <th>21</th>\n", - " <td>Mathématiques</td>\n", - " <td>558</td>\n", - " </tr>\n", - " <tr>\n", - " <th>22</th>\n", - " <td>Mesure</td>\n", - " <td>147</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23</th>\n", - " <td>Militaire (Art) - Guerre - Arme</td>\n", - " <td>1031</td>\n", - " </tr>\n", - " <tr>\n", - " <th>24</th>\n", - " <td>Minéralogie</td>\n", - " <td>89</td>\n", - " </tr>\n", - " <tr>\n", - " <th>25</th>\n", - " <td>Monnaie</td>\n", - " <td>254</td>\n", - " </tr>\n", - " <tr>\n", - " <th>26</th>\n", - " <td>Musique</td>\n", - " <td>548</td>\n", - " </tr>\n", - " <tr>\n", - " <th>27</th>\n", - " <td>Médailles</td>\n", - " <td>94</td>\n", - " </tr>\n", - " <tr>\n", - " <th>28</th>\n", - " <td>Médecine - Chirurgie</td>\n", - " <td>1820</td>\n", - " </tr>\n", - " <tr>\n", - " <th>29</th>\n", - " <td>Métiers</td>\n", - " <td>4203</td>\n", - " </tr>\n", - " <tr>\n", - " <th>30</th>\n", - " <td>Pharmacie</td>\n", - " <td>261</td>\n", - " </tr>\n", - " <tr>\n", - " <th>31</th>\n", - " <td>Philosophie</td>\n", - " <td>376</td>\n", - " </tr>\n", - " <tr>\n", - " <th>32</th>\n", - " <td>Physique - [Sciences physico-mathématiques]</td>\n", - " <td>1059</td>\n", - " </tr>\n", - " <tr>\n", - " <th>33</th>\n", - " <td>Politique</td>\n", - " <td>93</td>\n", - " </tr>\n", - " <tr>\n", - " <th>34</th>\n", - " <td>Pêche</td>\n", - " <td>168</td>\n", - " </tr>\n", - " <tr>\n", - " <th>35</th>\n", - " <td>Religion</td>\n", - " <td>1313</td>\n", - " </tr>\n", - " <tr>\n", - " <th>36</th>\n", - " 
<td>Spectacle</td>\n", - " <td>39</td>\n", - " </tr>\n", - " <tr>\n", - " <th>37</th>\n", - " <td>Superstition</td>\n", - " <td>87</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " ensemble_domaine_enccre counts\n", - "0 Agriculture - Economie rustique 930\n", - "1 Anatomie 858\n", - "2 Antiquité 1090\n", - "3 Architecture 1111\n", - "4 Arts et métiers 451\n", - "5 Beaux-arts 343\n", - "6 Belles-lettres - Poésie 825\n", - "7 Blason 431\n", - "8 Caractères 91\n", - "9 Chasse 465\n", - "10 Chimie 416\n", - "11 Commerce 1503\n", - "12 Droit - Jurisprudence 5135\n", - "13 Economie domestique 108\n", - "14 Grammaire 1806\n", - "15 Géographie 10483\n", - "16 Histoire 2464\n", - "17 Histoire naturelle 3851\n", - "18 Jeu 226\n", - "19 Marine 1661\n", - "20 Maréchage - Manège 419\n", - "21 Mathématiques 558\n", - "22 Mesure 147\n", - "23 Militaire (Art) - Guerre - Arme 1031\n", - "24 Minéralogie 89\n", - "25 Monnaie 254\n", - "26 Musique 548\n", - "27 Médailles 94\n", - "28 Médecine - Chirurgie 1820\n", - "29 Métiers 4203\n", - "30 Pharmacie 261\n", - "31 Philosophie 376\n", - "32 Physique - [Sciences physico-mathématiques] 1059\n", - "33 Politique 93\n", - "34 Pêche 168\n", - "35 Religion 1313\n", - "36 Spectacle 39\n", - "37 Superstition 87" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_x.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "potential-friday", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " 
<thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>ensemble_domaine_enccre</th>\n", - " <th>counts</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>Agriculture - Economie rustique</td>\n", - " <td>233</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Anatomie</td>\n", - " <td>215</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>Antiquité</td>\n", - " <td>272</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>Architecture</td>\n", - " <td>278</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>Arts et métiers</td>\n", - " <td>112</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>Beaux-arts</td>\n", - " <td>86</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>Belles-lettres - Poésie</td>\n", - " <td>206</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>Blason</td>\n", - " <td>108</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>Caractères</td>\n", - " <td>23</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>Chasse</td>\n", - " <td>116</td>\n", - " </tr>\n", - " <tr>\n", - " <th>10</th>\n", - " <td>Chimie</td>\n", - " <td>104</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>Commerce</td>\n", - " <td>376</td>\n", - " </tr>\n", - " <tr>\n", - " <th>12</th>\n", - " <td>Droit - Jurisprudence</td>\n", - " <td>1284</td>\n", - " </tr>\n", - " <tr>\n", - " <th>13</th>\n", - " <td>Economie domestique</td>\n", - " <td>27</td>\n", - " </tr>\n", - " <tr>\n", - " <th>14</th>\n", - " <td>Grammaire</td>\n", - " <td>452</td>\n", - " </tr>\n", - " <tr>\n", - " <th>15</th>\n", - " <td>Géographie</td>\n", - " <td>2621</td>\n", - " </tr>\n", - " <tr>\n", - " <th>16</th>\n", - " <td>Histoire</td>\n", - " <td>616</td>\n", - " </tr>\n", - " <tr>\n", - " <th>17</th>\n", - " <td>Histoire naturelle</td>\n", - " <td>963</td>\n", - " </tr>\n", - " <tr>\n", - " <th>18</th>\n", - " 
<td>Jeu</td>\n", - " <td>56</td>\n", - " </tr>\n", - " <tr>\n", - " <th>19</th>\n", - " <td>Marine</td>\n", - " <td>415</td>\n", - " </tr>\n", - " <tr>\n", - " <th>20</th>\n", - " <td>Maréchage - Manège</td>\n", - " <td>105</td>\n", - " </tr>\n", - " <tr>\n", - " <th>21</th>\n", - " <td>Mathématiques</td>\n", - " <td>140</td>\n", - " </tr>\n", - " <tr>\n", - " <th>22</th>\n", - " <td>Mesure</td>\n", - " <td>37</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23</th>\n", - " <td>Militaire (Art) - Guerre - Arme</td>\n", - " <td>258</td>\n", - " </tr>\n", - " <tr>\n", - " <th>24</th>\n", - " <td>Minéralogie</td>\n", - " <td>22</td>\n", - " </tr>\n", - " <tr>\n", - " <th>25</th>\n", - " <td>Monnaie</td>\n", - " <td>63</td>\n", - " </tr>\n", - " <tr>\n", - " <th>26</th>\n", - " <td>Musique</td>\n", - " <td>137</td>\n", - " </tr>\n", - " <tr>\n", - " <th>27</th>\n", - " <td>Médailles</td>\n", - " <td>23</td>\n", - " </tr>\n", - " <tr>\n", - " <th>28</th>\n", - " <td>Médecine - Chirurgie</td>\n", - " <td>455</td>\n", - " </tr>\n", - " <tr>\n", - " <th>29</th>\n", - " <td>Métiers</td>\n", - " <td>1051</td>\n", - " </tr>\n", - " <tr>\n", - " <th>30</th>\n", - " <td>Pharmacie</td>\n", - " <td>65</td>\n", - " </tr>\n", - " <tr>\n", - " <th>31</th>\n", - " <td>Philosophie</td>\n", - " <td>94</td>\n", - " </tr>\n", - " <tr>\n", - " <th>32</th>\n", - " <td>Physique - [Sciences physico-mathématiques]</td>\n", - " <td>265</td>\n", - " </tr>\n", - " <tr>\n", - " <th>33</th>\n", - " <td>Politique</td>\n", - " <td>23</td>\n", - " </tr>\n", - " <tr>\n", - " <th>34</th>\n", - " <td>Pêche</td>\n", - " <td>42</td>\n", - " </tr>\n", - " <tr>\n", - " <th>35</th>\n", - " <td>Religion</td>\n", - " <td>328</td>\n", - " </tr>\n", - " <tr>\n", - " <th>36</th>\n", - " <td>Spectacle</td>\n", - " <td>9</td>\n", - " </tr>\n", - " <tr>\n", - " <th>37</th>\n", - " <td>Superstition</td>\n", - " <td>22</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " 
ensemble_domaine_enccre counts\n", - "0 Agriculture - Economie rustique 233\n", - "1 Anatomie 215\n", - "2 Antiquité 272\n", - "3 Architecture 278\n", - "4 Arts et métiers 112\n", - "5 Beaux-arts 86\n", - "6 Belles-lettres - Poésie 206\n", - "7 Blason 108\n", - "8 Caractères 23\n", - "9 Chasse 116\n", - "10 Chimie 104\n", - "11 Commerce 376\n", - "12 Droit - Jurisprudence 1284\n", - "13 Economie domestique 27\n", - "14 Grammaire 452\n", - "15 Géographie 2621\n", - "16 Histoire 616\n", - "17 Histoire naturelle 963\n", - "18 Jeu 56\n", - "19 Marine 415\n", - "20 Maréchage - Manège 105\n", - "21 Mathématiques 140\n", - "22 Mesure 37\n", - "23 Militaire (Art) - Guerre - Arme 258\n", - "24 Minéralogie 22\n", - "25 Monnaie 63\n", - "26 Musique 137\n", - "27 Médailles 23\n", - "28 Médecine - Chirurgie 455\n", - "29 Métiers 1051\n", - "30 Pharmacie 65\n", - "31 Philosophie 94\n", - "32 Physique - [Sciences physico-mathématiques] 265\n", - "33 Politique 23\n", - "34 Pêche 42\n", - "35 Religion 328\n", - "36 Spectacle 9\n", - "37 Superstition 22" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "validation_x.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "demanding-essay", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "vanilla-italy", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "verified-compression", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab