From c33bee9cde56ea9fc00a75acae1640802974a08a Mon Sep 17 00:00:00 2001 From: lmoncla <moncla.ludovic@gmail.com> Date: Thu, 6 Jan 2022 10:38:26 +0100 Subject: [PATCH] add notebooks --- .gitignore | 1 + notebooks/CorpusTEI_EDdA_to_dataframe.ipynb | 5646 +++++++++++++++++ .../EDdA_Classification_BertFineTuning.ipynb | 4421 +++++++++++++ .../EDdA_Classification_ClassicModels.ipynb | 861 +++ .../EDdA_Classification_DeepLearning.ipynb | 1351 ++++ .../EDdA_Classification_DeepLearning_2.ipynb | 1349 ++++ ...ssification_Generate_ConfusionMatrix.ipynb | 1181 ++++ 7 files changed, 14810 insertions(+) create mode 100644 notebooks/CorpusTEI_EDdA_to_dataframe.ipynb create mode 100644 notebooks/EDdA_Classification_BertFineTuning.ipynb create mode 100644 notebooks/EDdA_Classification_ClassicModels.ipynb create mode 100644 notebooks/EDdA_Classification_DeepLearning.ipynb create mode 100644 notebooks/EDdA_Classification_DeepLearning_2.ipynb create mode 100644 notebooks/EDdA_Classification_Generate_ConfusionMatrix.ipynb diff --git a/.gitignore b/.gitignore index 71a9a39..03156e2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ dataframe_with_domaine_enccre.csv dataframe_with_normClass_artfl.csv *.pkl .DS_Store +.DS_Store diff --git a/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb b/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb new file mode 100644 index 0000000..a1321a2 --- /dev/null +++ b/notebooks/CorpusTEI_EDdA_to_dataframe.ipynb @@ -0,0 +1,5646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "metallic-shelf", + "metadata": {}, + "source": [ + "# Préparation du corpus EDdA pour la classification en domaine" + ] + }, + { + "cell_type": "markdown", + "id": "designing-advice", + "metadata": {}, + "source": [ + "## Preparing data" + ] + }, + { + "cell_type": "markdown", + "id": "floppy-fleet", + "metadata": {}, + "source": [ + "### Import des librairies" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "appreciated-victim", + "metadata": {}, + 
"outputs": [], + "source": [ + "import os\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import urllib, json\n", + "from urllib.request import urlopen" + ] + }, + { + "cell_type": "markdown", + "id": "framed-fossil", + "metadata": {}, + "source": [ + "### Parsing des articles TEI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "suburban-honduras", + "metadata": {}, + "outputs": [], + "source": [ + "input_path = \"/Users/lmoncla/Documents/Data/Corpus/EDDA/Alice/EDdA/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "scenic-vermont", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Volume : 11\n", + "Volume : 16\n", + "Volume : 17\n", + "Volume : 10\n", + "Volume : 5\n", + "Volume : 2\n", + "Volume : 3\n", + "Volume : 4\n", + "Volume : 15\n", + "Volume : 12\n", + "Volume : 13\n", + "Volume : 14\n", + "Volume : 1\n", + "Volume : 6\n", + "Volume : 8\n", + "Volume : 9\n", + "Volume : 7\n" + ] + } + ], + "source": [ + "# récupération dans une liste des métadonnées (volume, numéro, nom de l'article, classe et auteur) à partir des fichiers TEI\n", + "data = []\n", + "\n", + "for tome in os.listdir(input_path):\n", + " volume = tome[1:]\n", + " print(\"Volume : \", volume)\n", + " \n", + " for article in os.listdir(input_path + tome +\"/\"):\n", + " #print(\"Article : \", article[7:-4])\n", + " numero = article[7:-4]\n", + " extension = article[-4:]\n", + " if extension == '.tei':\n", + "\n", + " try:\n", + " soup = BeautifulSoup(open(input_path+tome+\"/\"+article))\n", + "\n", + " head = soup.find(type=\"head\")\n", + " author = soup.find(type=\"author\")\n", + " normclass = soup.find(type=\"normclass\")\n", + " classEDdA = soup.find(type=\"class\")\n", + " \n", + " #print(volume, numero, head.get('value'), normclass.get('value'), author.get('value'))\n", + " data.append([int(volume), int(numero), head.get('value').strip(), 
normclass.get('value').strip(), classEDdA.get('value').strip(), author.get('value').strip()])\n", + " \n", + " except AttributeError as e:\n", + " #print('Volume : ', volume, ' Numéro : ', numero)\n", + " #print('Error : ' + str(e))\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "excess-waterproof", + "metadata": {}, + "outputs": [], + "source": [ + "# transformation de la liste en dataframe\n", + "df = pd.DataFrame(data, columns=['volume', 'numero', 'head', 'normClass', 'classEDdA', 'author'])\n", + "df = df.sort_values(['volume', 'numero']).reset_index(drop = True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "blocked-reading", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>45529</th>\n", + " <td>11</td>\n", + " <td>2501</td>\n", + " <td>OPICIENS, les</td>\n", + " <td>Géographie ancienne</td>\n", + " <td>Géog. anc.</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>63464</th>\n", + " <td>15</td>\n", + " <td>1971</td>\n", + " <td>SOUSA, Province de, ou Souse</td>\n", + " <td>Géographie moderne</td>\n", + " <td>Géog. mod.</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38896</th>\n", + " <td>9</td>\n", + " <td>4159</td>\n", + " <td>Maison</td>\n", + " <td>Histoire moderne</td>\n", + " <td>Hist. 
mod.</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>52378</th>\n", + " <td>13</td>\n", + " <td>522</td>\n", + " <td>PORTO-FERRAIO</td>\n", + " <td>Géographie moderne</td>\n", + " <td>Geog. mod.</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>62792</th>\n", + " <td>15</td>\n", + " <td>1299</td>\n", + " <td>SNOWDON-HILLS</td>\n", + " <td>Géographie moderne</td>\n", + " <td>Géog. mod.</td>\n", + " <td>Jaucourt</td>\n", + " </tr>\n", + " <tr>\n", + " <th>54108</th>\n", + " <td>13</td>\n", + " <td>2252</td>\n", + " <td>PULO-WAY</td>\n", + " <td>Géographie moderne</td>\n", + " <td>Geog. mod.</td>\n", + " <td>Jaucourt</td>\n", + " </tr>\n", + " <tr>\n", + " <th>62965</th>\n", + " <td>15</td>\n", + " <td>1472</td>\n", + " <td>Solide</td>\n", + " <td>Anatomie</td>\n", + " <td>en Anatomie</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>61463</th>\n", + " <td>14</td>\n", + " <td>5167</td>\n", + " <td>SÉMI-PÉLAGIANISME</td>\n", + " <td>Histoire ecclésiastique</td>\n", + " <td>Hist. 
eccles.</td>\n", + " <td>Jaucourt</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29174</th>\n", + " <td>7</td>\n", + " <td>1711</td>\n", + " <td>GAS</td>\n", + " <td>Chimie</td>\n", + " <td>Chim.</td>\n", + " <td>Venel</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21200</th>\n", + " <td>4</td>\n", + " <td>5290</td>\n", + " <td>Divin, emplâtre divin, emplastrum divinum</td>\n", + " <td>Pharmacie</td>\n", + " <td>Pharmac.</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head \\\n", + "45529 11 2501 OPICIENS, les \n", + "63464 15 1971 SOUSA, Province de, ou Souse \n", + "38896 9 4159 Maison \n", + "52378 13 522 PORTO-FERRAIO \n", + "62792 15 1299 SNOWDON-HILLS \n", + "54108 13 2252 PULO-WAY \n", + "62965 15 1472 Solide \n", + "61463 14 5167 SÉMI-PÉLAGIANISME \n", + "29174 7 1711 GAS \n", + "21200 4 5290 Divin, emplâtre divin, emplastrum divinum \n", + "\n", + " normClass classEDdA author \n", + "45529 Géographie ancienne Géog. anc. unsigned \n", + "63464 Géographie moderne Géog. mod. unsigned \n", + "38896 Histoire moderne Hist. mod. unsigned \n", + "52378 Géographie moderne Geog. mod. unsigned \n", + "62792 Géographie moderne Géog. mod. Jaucourt \n", + "54108 Géographie moderne Geog. mod. Jaucourt \n", + "62965 Anatomie en Anatomie unsigned \n", + "61463 Histoire ecclésiastique Hist. eccles. Jaucourt \n", + "29174 Chimie Chim. Venel \n", + "21200 Pharmacie Pharmac. 
unsigned " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# affichage aléatoire de 10 lignes du dataframe\n", + "df.sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "expired-click", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "74190" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# nombre d'articles dans le dataframe\n", + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "considered-adjustment", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " </tr>\n", + " <tr>\n", + " <th>normClass</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th></th>\n", + " <td>44</td>\n", + " <td>44</td>\n", + " <td>44</td>\n", + " <td>44</td>\n", + " <td>44</td>\n", + " </tr>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Abus des langues</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Accord de sons</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>Acoustique</th>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head classEDdA author\n", + "normClass \n", + " 44 44 44 44 44\n", + "0 17 17 17 17 17\n", + "Abus des langues 1 1 1 1 1\n", + "Accord de sons 1 1 1 1 1\n", + "Acoustique 6 6 6 6 6" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# On regroupe les lignes du dataframe en fonction du normclass\n", + "classes = df.groupby(['normClass']).count()\n", + "classes.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "instructional-variation", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2908" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Nombre de classes \n", + "len(classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "handmade-contest", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12685" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# nombre d'articles 'unclassified'\n", + "len(df.loc[df['normClass']==\"unclassified\",:])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "crude-olympus", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1614" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# nombre de classes avec un seul article\n", + "len(classes.loc[classes['volume']==1])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "sized-barrier", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2656" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ 
+ "# nombre de classes avec moins de 20 articles\n", + "len(classes.loc[classes['volume']<20])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "indian-selection", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "weighted-hanging", + "metadata": {}, + "source": [ + "### Enregistrement" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "stainless-stewart", + "metadata": {}, + "outputs": [], + "source": [ + "# enregistrement du résultat du groupby\n", + "classes['volume'].to_csv('classesEDdA.tsv',sep='\\t',header=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "hearing-olive", + "metadata": {}, + "outputs": [], + "source": [ + "# enregistrement du dataframe (permet de ne pas reparser tous les fichiers TEI pour recharger ce dataframe)\n", + "df.to_csv('EDdA_dataframe_orginal.tsv',sep='\\t',index=False) " + ] + }, + { + "cell_type": "markdown", + "id": "stuck-courage", + "metadata": {}, + "source": [ + "### Lecture" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "thick-destiny", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('EDdA_dataframe_orginal.tsv', sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "typical-munich", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Title Page</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>d'Alembert</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 1 Title Page unclassified \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON unclassified \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS unclassified \n", + "3 1 5 A, a & a Grammaire \n", + "4 1 6 A unclassified \n", + "\n", + " classEDdA author \n", + "0 unclassified unsigned \n", + "1 unclassified Diderot & d'Alembert \n", + "2 unclassified d'Alembert \n", + "3 ordre Encyclopéd. Entend. Science de l'homme, ... 
Dumarsais5 \n", + "4 unclassified Dumarsais5 " + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baking-command", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "individual-protection", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "domaines_regroupes = {}\n", + "domaines_regroupes['Agriculture - Economie rustique'] = ['Agriculture', 'Economie rustique', 'Fontainier', 'Graines', 'Jardinage', 'Moulin', 'Sucre', 'Tabac', 'Vigne', 'Vin']\n", + "domaines_regroupes['Anatomie'] = ['Anatomie', 'Economie animale']\n", + "domaines_regroupes['Antiquité'] = ['Antiquité', 'Iconologie', 'Mythologie']\n", + "domaines_regroupes['Architecture'] = ['Architecture', 'Carreleur', 'Carrier', 'Coupe des pierres', 'Couvreur', 'Décoration', 'Maçonnerie']\n", + "domaines_regroupes['Arts et métiers'] = ['Arts et métiers', 'Arts mécaniques', 'Manufacture']\n", + "domaines_regroupes['Beaux-arts'] = ['Beaux-arts', 'Dessin', 'Gravue', 'Peinture', 'Sculpture']\n", + "domaines_regroupes['Belles-lettres - Poésie'] = ['Belles-lettres', 'Eloquence', 'Littérature', 'Poésie', 'Rhétorique']\n", + "domaines_regroupes['Blason'] = ['Blason']\n", + "domaines_regroupes['Caractères'] = ['Caractères', 'Ecriture']\n", + "domaines_regroupes['Chasse'] = ['Chasse', 'Fauconnerie', 'Oisellerie', 'Vénerie']\n", + "domaines_regroupes['Chimie'] = ['Alchimie', 'Chimie', 'Docimasie']\n", + "domaines_regroupes['Commerce'] = ['Commerce', 'Marchand', 'Voiturier']\n", + "domaines_regroupes['Droit - Jurisprudence'] = ['Chancellerie', 'Corporation', 'Douane', 'Droit', 'Eaux et Forêts', 'Finance', 'Jurisprudence', 'Palais']\n", + "domaines_regroupes['Economie domestique'] = ['Cuisine','Economie domestique']\n", + "#domaines_regroupes['Géographie'] = ['Géographie', 'Géographie Histoire naturelle', 
'Géographie ancienne', 'Géographie des Arabes', 'Géographie du moyen âge',\n", + "# 'Géographie ecclésiastique', 'Géographie historique', 'Géographie maritime ancienne', 'Géographie des Romains', 'Géographie morderne',\n", + "# 'Géographie naturelle', 'Géographie physique', 'Géographie sacrée', 'Géographie sainte', 'Géographie transcendante', 'Géographie transcendantee']\n", + "domaines_regroupes['Géographie'] = ['Géographie', 'Topographie']\n", + "domaines_regroupes['Grammaire'] = ['Grammaire', 'Langues', 'Synonymes']\n", + "domaines_regroupes['Histoire'] = ['Calendrier','Chevalerie','Chronologie','Coutumes','Généalogie','Histoire','Inscriptions','Inventions', 'Voyage']\n", + "domaines_regroupes['Histoire naturelle'] = ['Botanique','Conchyliologie','Fossiles','Histoire naturelle', 'Ichtyologie','Insectologie','Ophiologie','Ornithologie','Zoologie']\n", + "domaines_regroupes['Jeu'] = ['Jeu']\n", + "domaines_regroupes['Maréchage - Manège'] = ['Maréchage', 'Manège']\n", + "domaines_regroupes['Marine'] = ['Galère','Marine', 'Navigation', 'Rivière']\n", + "domaines_regroupes['Mathématiques'] = ['Algèbre','Analyse des hasards', 'Arithmétique', 'Arpentage','Géométrie', 'Mathématiques', 'Trigonométrie']\n", + "domaines_regroupes['Médailles'] = ['Médailles','Numismatique']\n", + "domaines_regroupes['Médecine - Chirurgie'] = ['Chirurgie', 'Diète', 'Gymnastique', 'Maladie', 'Matière médicale', 'Médecine', 'Pathologie', 'Physiologie', 'Séméiotique', 'Thérapeutique']\n", + "domaines_regroupes['Mesure'] = ['Balancier', 'Jaugeage', 'Mesure', 'Poids']\n", + "domaines_regroupes[\"Métiers\"] = ['Boucherie', 'Boulangerie', 'Brasserie', 'Charcuterie', 'Confiserie', 'Distillation', 'Epicerie', 'Pâtisserie', 'Rôtisserie', 'Vinaigrier']\n", + "domaines_regroupes[\"Métiers\"] += ['Bois', 'Boissellerie', 'Charpenterie', 'Charronnage', 'Coffretier', 'Ebénisterie', 'Formier', 'Layeterie', 'Menuiserie', 'Tonnelier', 'Vannerie']\n", + "domaines_regroupes[\"Métiers\"] += ['Bourrelier', 
'Boyaudier', 'Cardier', 'Chamoiseur', 'Corroierie', 'Cuir', 'Gainier', 'Hongroyeur', 'Maroquinier', 'Mégisserie', 'Parcheminerie', 'Peausserie', 'Pelleterie', 'Sellier', 'Tannerie']\n", + "domaines_regroupes[\"Métiers\"] += ['Aiguilletier-Epinglier', 'Ardoiserie', 'Argent', \"Batteur d'or\", 'Bijouterie', 'Bimblotier', 'Chaînetier', 'Chaudronnerie', 'Ciselure', 'Cloche', 'Cloutier', 'Coutellerie', 'Cuivre', 'Diamantaire', 'Dorure', 'Eperonnier', 'Fer']\n", + "domaines_regroupes[\"Métiers\"] += ['Ferblanterie', 'Fonderie', 'Forge', 'Fourbisseur', 'Glaces', 'Joaillier', 'Lapidaire', 'Lunetier', 'Marbrier', 'Maréchal-grossier', 'Métal', 'Metteur en oeuvre', 'Miroiterie', 'Or', 'Orfèvrerie']\n", + "domaines_regroupes[\"Métiers\"] += ['Pierres', 'Plomberie', \"Potier d'étain\", 'Serrurerie', 'Taillanderie', \"Tireur d'or\", 'Verrerie', 'Vitrerie']\n", + "domaines_regroupes[\"Métiers\"] += ['Cartier', 'Cartonnier', 'Imprimerie', 'Librairie', 'Marbreur de papier', 'Papeterie', 'Reliure']\n", + "domaines_regroupes[\"Métiers\"] += ['Bas au métier', 'Blanchissage des toiles', 'Blondier', 'Bonneterie', 'Bottier', 'Bourserie', 'Boutonnier', 'Broderie', 'Cardeur', 'Ceinturier', 'Chapellerie', 'Cordonnerie','Coton', 'Couture', 'Découpeur', 'Dentelle', 'Draperie']\n", + "domaines_regroupes[\"Métiers\"] += ['Etoffe', 'Fil', 'Friseur', 'Ganterie', 'Gazier', 'Laine', 'Lingerie', 'Mode', 'Ourdissage', 'Passementerie', 'Perruquier', 'Plumasserie', 'Rubanerie', 'Soierie', 'Tailleur', 'Tapisserie', 'Teinturerie', 'Tisserand', 'Toilerie', 'Tonderie de drap']\n", + "domaines_regroupes[\"Métiers\"] += ['Amidonnier', 'Blanchisserie de cire', 'Chandelier', 'Cirerie', 'Corderie', 'Emailleur', 'Eventailliste', 'Faïencier', 'Filassier', 'Fleuriste', 'Horlogerie', 'Marqueterie', 'Métiers peu attestés', 'Parfumeur', 'Paumier', 'Poterie']\n", + "domaines_regroupes[\"Métiers\"] += ['Salpêtrerie', 'Savonnerie', 'Sel', 'Tabatière', 'Tabletier-Cornetier', 'Tourneur', 'Vergetier', 'Vernisseur']\n", + 
"domaines_regroupes['Militaire (Art) - Guerre - Arme'] = ['Armurerie', 'Artificier', 'Artillerie', 'Canon','Escrime','Fortification','Guerre','Milice','Militaire']\n", + "domaines_regroupes['Minéralogie'] = ['Lithologie','Métallurgie','Minéralogie']\n", + "domaines_regroupes['Monnaie'] = ['Monnaie']\n", + "domaines_regroupes['Musique'] = ['Danse', 'Lutherie','Musique','Orgue', 'Voix']\n", + "domaines_regroupes['Pêche'] = ['Pêche']\n", + "domaines_regroupes['Pharmacie'] = ['Drogues', 'Pharmacie']\n", + "domaines_regroupes['Philosophie'] = ['Education', 'Logique', 'Métaphysique', 'Morale', 'Philologie','Philosophie', 'Sciences']\n", + "domaines_regroupes['Physique - [Sciences physico-mathématiques]'] = ['Acoustique', 'Astrologie', 'Astronomie', 'Cosmographie-Cosmologie', 'Gnomonique', 'Hydraulique', 'Mécanique', 'Optique', 'Perspective', 'Physique', 'Science microscopique']\n", + "domaines_regroupes['Politique'] = ['Economie', 'Gouvernement', 'Police', 'Politique']\n", + "domaines_regroupes['Religion'] = ['Critique sacrée', 'Culte', 'Eglise', 'Histoire ecclésiastique', 'Idolâtrie', 'Religion', 'Théologie']\n", + "domaines_regroupes['Spectacle'] = ['Opéra','Spectacle', 'Théâtre']\n", + "domaines_regroupes['Superstition'] = ['Divination', 'Magie', 'Superstition']" + ] + }, + { + "cell_type": "markdown", + "id": "variable-instrument", + "metadata": {}, + "source": [ + "### Récupération correspondance EDdA / ENCCRE" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "south-equation", + "metadata": {}, + "outputs": [], + "source": [ + "df_correspondances = pd.read_csv(\"/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Classification domaines EDdA/correspondances_ARTFL-ENCCRE.csv\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "protecting-incentive", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: 
middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>path</th>\n", + " <th>entreeid</th>\n", + " <th>tome</th>\n", + " <th>article</th>\n", + " <th>adresse</th>\n", + " <th>entree</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>T1/article5</td>\n", + " <td>v1-1-0</td>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>T1/article6</td>\n", + " <td>v1-1-1</td>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>T1/article7</td>\n", + " <td>v1-1-2</td>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>T1/article8</td>\n", + " <td>v1-1-3</td>\n", + " <td>1</td>\n", + " <td>8</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>T1/article9</td>\n", + " <td>v1-1-4</td>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " path entreeid tome article adresse entree\n", + "0 T1/article5 v1-1-0 1 5 1 0\n", + "1 T1/article6 v1-1-1 1 6 1 1\n", + "2 T1/article7 v1-1-2 1 7 1 2\n", + "3 T1/article8 v1-1-3 1 8 1 3\n", + "4 T1/article9 v1-1-4 1 9 1 4" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_correspondances.head()" + ] + }, + { + "cell_type": "markdown", + "id": "continuous-feedback", + "metadata": {}, + "source": [ + "### Test récupération données ENCCRE" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "id": "spread-feature", + "metadata": {}, + "outputs": [], + "source": [ + "import urllib, json\n", + "from urllib.request import urlopen\n", + "\n", + "json_url = urlopen(\"http://enccre.academie-sciences.fr/icefront/api/article/v1-544-0\")\n", + "data = json.loads(json_url.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "facial-syndicate", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'géographie'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['annotations']['constit'][0]['domgen'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "removed-nickel", + "metadata": {}, + "outputs": [], + "source": [ + "def get_key(val):\n", + " for key, value in domaines_regroupes.items():\n", + " for v in value:\n", + " v = v.replace(\" \", \"\")\n", + " if val == v.lower():\n", + " return key\n", + " \n", + " return None\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "nuclear-murder", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Histoire naturelle\n" + ] + } + ], + "source": [ + "print(get_key(\"histoirenaturelle\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "placed-homework", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "extraordinary-settlement", + "metadata": {}, + "source": [ + "### Ajout des colonnes domaines, texte, etc." 
def getDomaineEnccre(
    volume,
    numero,
    classEDDA,
    *,
    txt_dir="/Users/lmoncla/Documents/Data/Corpus/EDDA/articles_all/all_txt/",
    api_url="http://enccre.academie-sciences.fr/icefront/api/article/",
):
    """Collect the ENCCRE domain and the plain text of one article.

    The ENCCRE entry id is looked up in the module-level
    ``df_correspondances`` frame (columns ``tome``, ``article``,
    ``entreeid``).  When an id is found, the article JSON is fetched from
    *api_url* and its first general domain is mapped to a grouped domain with
    ``get_key``.  The article text is then read from
    ``<txt_dir>/volumeVV-N.txt`` (volume zero-padded to two digits).

    Args:
        volume: Volume (tome) number of the article.
        numero: Article number inside the volume.
        classEDDA: Class designation as printed in the article; it is removed
            from the text, with and without surrounding parentheses.
        txt_dir: Directory holding the plain-text articles.
        api_url: Base URL of the ENCCRE article API; the entry id is appended.

    Returns:
        A ``pd.Series`` of six values, in order: entry id, domain, grouped
        domain (``None`` if ``get_key`` finds no group), full text, text
        without the class designation, and first paragraph.  Values that
        cannot be determined are empty strings.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on network or decoding
        failures; these are deliberately not swallowed.
    """
    domaine = ""
    ensemble_domaine = ""
    entreeid = ""

    try:
        same_article = (
            (df_correspondances['tome'] == volume)
            & (df_correspondances['article'] == numero)
        )
        matches = df_correspondances.loc[same_article, 'entreeid']

        if not matches.empty:
            entreeid = matches.iloc[0]

            # Close the HTTP response as soon as the payload is read.
            with urlopen(api_url + entreeid) as response:
                data = json.loads(response.read())

            try:
                # TODO: only the first domain of the first constituent is
                # kept; extend this to collect every domain name.
                domaine = data['annotations']['constit'][0]['domgen'][0]
                ensemble_domaine = get_key(domaine)
            except (KeyError, IndexError):
                # The article has no domain annotation (missing key or empty
                # list): leave the domain fields empty.
                pass
    except KeyError:
        # Correspondence table without the expected columns: keep defaults.
        pass

    prefix = "volume0" if volume < 10 else "volume"
    txt_file = os.path.join(
        txt_dir, prefix + str(volume) + "-" + str(numero) + ".txt"
    )

    try:
        with open(txt_file, "r") as handle:
            txtContent = handle.read()
    except FileNotFoundError:
        return pd.Series([entreeid, domaine, ensemble_domaine, "", "", ""])

    classEDDA = str(classEDDA)

    # Remove the class designation from the text: first the parenthesised
    # form, then any bare occurrence.  The second replace must work on the
    # result of the first, otherwise empty "()" are left behind.
    txtContentWithoutClass = (
        txtContent
        .replace('(' + classEDDA + ')', "")
        .replace(classEDDA, "")
    )

    firstParagraph = txtContentWithoutClass.split('\n \n')[0]

    return pd.Series([
        entreeid,
        domaine,
        ensemble_domaine,
        txtContent,
        txtContentWithoutClass,
        firstParagraph,
    ])
<td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>d'Alembert</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. 
Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 1 Title Page unclassified \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON unclassified \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS unclassified \n", + "3 1 5 A, a & a Grammaire \n", + "4 1 6 A unclassified \n", + "\n", + " classEDdA author \\\n", + "0 unclassified unsigned \n", + "1 unclassified Diderot & d'Alembert \n", + "2 unclassified d'Alembert \n", + "3 ordre Encyclopéd. Entend. Science de l'homme, ... 
Dumarsais5 \n", + "4 unclassified Dumarsais5 \n", + "\n", + " id_enccre domaine_enccre ensemble_domaine_enccre content \\\n", + "0 0 1 2 3 \n", + "1 0 1 2 3 \n", + "2 0 1 2 3 \n", + "3 0 1 2 3 \n", + "4 0 1 2 3 \n", + "\n", + " contentWithoutClass firstParagraph \n", + "0 4 5 \n", + "1 4 5 \n", + "2 4 5 \n", + "3 4 5 \n", + "4 4 5 " + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "christian-advice", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Title Page</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE...</td>\n", + " <td>ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE...</td>\n", + " <td>ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " 
<td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\...</td>\n", + " <td>A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\...</td>\n", + " <td>A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>d'Alembert</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl...</td>\n", + " <td>DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl...</td>\n", + " <td>DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. 
la troisieme personne du présen...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>1</td>\n", + " <td>8</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-3</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A, étoit une lettre numérale parmi les Anciens...</td>\n", + " <td>A, étoit une lettre numérale parmi les Anciens...</td>\n", + " <td>A, étoit une lettre numérale parmi les Anciens...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>A, lettre symbolique</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-4</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A, lettre symbolique, étoit un hiéroglyphe che...</td>\n", + " <td>A, lettre symbolique, étoit un hiéroglyphe che...</td>\n", + " <td>A, lettre symbolique, étoit un hiéroglyphe che...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>1</td>\n", + " 
<td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1</td>\n", + " <td>12</td>\n", + " <td>A, lettre de suffrage</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-7</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A, lettre de suffrage ; les Romains se servoie...</td>\n", + " <td>A, lettre de suffrage ; les Romains se servoie...</td>\n", + " <td>A, lettre de suffrage ; les Romains se servoie...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>1</td>\n", + " <td>13</td>\n", + " <td>A, signe d'absolution</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet5</td>\n", + " <td>v1-1-8</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A, signe d'absolution, chez les Romains dans l...</td>\n", + " <td>A, signe d'absolution, chez les Romains dans l...</td>\n", + " <td>A, signe d'absolution, chez les Romains dans l...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1</td>\n", + " <td>14</td>\n", + " <td>A cognitionibus</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-2-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* A cognitionibus. Scorpus fameux Agitateur du...</td>\n", + " <td>* A cognitionibus. Scorpus fameux Agitateur du...</td>\n", + " <td>* A cognitionibus. 
Scorpus fameux Agitateur du...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>1</td>\n", + " <td>15</td>\n", + " <td>A curâ amicorum</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-3-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* A curâ amicorum. On lit dans quelques inscri...</td>\n", + " <td>* A curâ amicorum. On lit dans quelques inscri...</td>\n", + " <td>* A curâ amicorum. On lit dans quelques inscri...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>1</td>\n", + " <td>16</td>\n", + " <td>A</td>\n", + " <td>Ecrivains modernes</td>\n", + " <td>dans les Ecrivains modernes</td>\n", + " <td>Mallet</td>\n", + " <td>v1-4-0</td>\n", + " <td>caractères</td>\n", + " <td>Caractères</td>\n", + " <td>A, dans les Ecrivains modernes, veut dire auss...</td>\n", + " <td>A, , veut dire aussi\\nl'an, comme A. D. anno D...</td>\n", + " <td>A, , veut dire aussi\\nl'an, comme A. D. anno D...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>1</td>\n", + " <td>17</td>\n", + " <td>A</td>\n", + " <td>Calendrier Julien</td>\n", + " <td>dans le calendrier Julien</td>\n", + " <td>Mallet</td>\n", + " <td>v1-4-1</td>\n", + " <td>calendrier</td>\n", + " <td>Histoire</td>\n", + " <td>A, dans le calendrier Julien, est aussi la pre...</td>\n", + " <td>A, , est aussi la premiere\\ndes sept lettres d...</td>\n", + " <td>A, , est aussi la premiere\\ndes sept lettres d...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>1</td>\n", + " <td>18</td>\n", + " <td>A. D.</td>\n", + " <td>pending</td>\n", + " <td>épistolaire</td>\n", + " <td>Mallet</td>\n", + " <td>v1-4-2</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A. D. épistolaire ; ces deux caracteres dans l...</td>\n", + " <td>A. D. ; ces deux caracteres dans les\\nLettres...</td>\n", + " <td>A. D. 
; ces deux caracteres dans les\\nLettres...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>1</td>\n", + " <td>19</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-4-3</td>\n", + " <td>logique</td>\n", + " <td>Philosophie</td>\n", + " <td>* A désigne une proposition générale affirmati...</td>\n", + " <td>* A désigne une proposition générale affirmati...</td>\n", + " <td>* A désigne une proposition générale affirmati...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1</td>\n", + " <td>20</td>\n", + " <td>A, signe des passions</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-4-4</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* A, signe des passions ; selon certains Auteu...</td>\n", + " <td>* A, signe des passions ; selon certains Auteu...</td>\n", + " <td>* A, signe des passions ; selon certains Auteu...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>1</td>\n", + " <td>21</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td>v1-4-5</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>A, est aussi une abbréviation dont on se sert ...</td>\n", + " <td>A, est aussi une abbréviation dont on se sert ...</td>\n", + " <td>A, est aussi une abbréviation dont on se sert ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>1</td>\n", + " <td>22</td>\n", + " <td>A A A</td>\n", + " <td>Chimie</td>\n", + " <td>Chimistes</td>\n", + " <td>Malouin5</td>\n", + " <td>v1-5-0</td>\n", + " <td>chimie</td>\n", + " <td>Chimie</td>\n", + " <td>A A A, chez les Chimistes, signifie une amalga...</td>\n", + " <td>A A A, chez les , signifie une amalgame,\\nou l...</td>\n", + " <td>A A A, chez les , signifie une amalgame,\\nou l...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>1</td>\n", + " <td>23</td>\n", + " 
<td>A, Ä, ou Ä Ä</td>\n", + " <td>Médecine</td>\n", + " <td>Medecine</td>\n", + " <td>Vandenesse</td>\n", + " <td>v1-6-0</td>\n", + " <td>médecine</td>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>A, Ä, ou Ä Ä; on se sert de cette abbréviation...</td>\n", + " <td>A, Ä, ou Ä Ä; on se sert de cette abbréviation...</td>\n", + " <td>A, Ä, ou Ä Ä; on se sert de cette abbréviation...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>1</td>\n", + " <td>24</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-7-0</td>\n", + " <td>commerce</td>\n", + " <td>Commerce</td>\n", + " <td>A. Les Marchands Négocians, Banquiers, & Teneu...</td>\n", + " <td>A. Les Marchands Négocians, Banquiers, & Teneu...</td>\n", + " <td>A. Les Marchands Négocians, Banquiers, & Teneu...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>1</td>\n", + " <td>25</td>\n", + " <td>A</td>\n", + " <td>pending</td>\n", + " <td>caractere alphabétique</td>\n", + " <td>Diderot</td>\n", + " <td>v1-8-0</td>\n", + " <td>ecriture</td>\n", + " <td>Caractères</td>\n", + " <td>* A, caractere alphabétique. Après avoir donné...</td>\n", + " <td>* A, . Après avoir donné les\\ndifférentes sign...</td>\n", + " <td>* A, . Après avoir donné les\\ndifférentes sign...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>1</td>\n", + " <td>26</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-9-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* A, s. petite riviere de France, qui a sa sou...</td>\n", + " <td>* A, s. petite riviere de France, qui a sa sou...</td>\n", + " <td>* A, s. 
petite riviere de France, qui a sa sou...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " <td>27</td>\n", + " <td>AA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-10-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AA, s. f. riviere de France, qui prend sa so...</td>\n", + " <td>* AA, s. f. riviere de France, qui prend sa so...</td>\n", + " <td>* AA, s. f. riviere de France, qui prend sa so...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>1</td>\n", + " <td>28</td>\n", + " <td>AABAM</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Malouin</td>\n", + " <td>v1-11-0</td>\n", + " <td>alchimie</td>\n", + " <td>Chimie</td>\n", + " <td>AABAM, s. m. Quelques Alchimistes se sont serv...</td>\n", + " <td>AABAM, s. m. Quelques Alchimistes se sont serv...</td>\n", + " <td>AABAM, s. m. Quelques Alchimistes se sont serv...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>1</td>\n", + " <td>29</td>\n", + " <td>AACH ou ACH</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-12-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AACH ou ACH, s. f. petite ville d'Allemagne\\...</td>\n", + " <td>* AACH ou ACH, s. f. petite ville d'Allemagne\\...</td>\n", + " <td>* AACH ou ACH, s. f. petite ville d'Allemagne\\...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>1</td>\n", + " <td>30</td>\n", + " <td>AAHUS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-13-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AAHUS, s. petite ville d'Allemagne dans le c...</td>\n", + " <td>* AAHUS, s. petite ville d'Allemagne dans le c...</td>\n", + " <td>* AAHUS, s. 
petite ville d'Allemagne dans le c...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>1</td>\n", + " <td>31</td>\n", + " <td>AAM</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-14-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* AAM, s. mesure des Liquides, en usage à Amst...</td>\n", + " <td>* AAM, s. mesure des Liquides, en usage à Amst...</td>\n", + " <td>* AAM, s. mesure des Liquides, en usage à Amst...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>1</td>\n", + " <td>32</td>\n", + " <td>AAR</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-15-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AAR, s. grande riviere qui a sa source proch...</td>\n", + " <td>* AAR, s. grande riviere qui a sa source proch...</td>\n", + " <td>* AAR, s. grande riviere qui a sa source proch...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>1</td>\n", + " <td>33</td>\n", + " <td>Aar</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-15-1</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* Aar, s. riviere d'Allemagne qui a sa source ...</td>\n", + " <td>* Aar, s. riviere d'Allemagne qui a sa source ...</td>\n", + " <td>* Aar, s. riviere d'Allemagne qui a sa source ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>1</td>\n", + " <td>34</td>\n", + " <td>AA ou AAS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-16-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AA ou AAS, s. ou Fontaine des Arquebusades. ...</td>\n", + " <td>* AA ou AAS, s. ou Fontaine des Arquebusades. ...</td>\n", + " <td>* AA ou AAS, s. ou Fontaine des Arquebusades. 
...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>1</td>\n", + " <td>35</td>\n", + " <td>AAS ou AASA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-17-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* AAS ou AASA, Fort de Norwege dans le Baillia...</td>\n", + " <td>* AAS ou AASA, Fort de Norwege dans le Baillia...</td>\n", + " <td>* AAS ou AASA, Fort de Norwege dans le Baillia...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>1</td>\n", + " <td>36</td>\n", + " <td>AB</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-18-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>AB, s. m. onzieme mois de l'année civile des H...</td>\n", + " <td>AB, s. m. onzieme mois de l'année civile des H...</td>\n", + " <td>AB, s. m. onzieme mois de l'année civile des H...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>1</td>\n", + " <td>37</td>\n", + " <td>AB</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-19-0</td>\n", + " <td>calendrier</td>\n", + " <td>Histoire</td>\n", + " <td>AB, s. m. en Langue Syriaque est le nom du der...</td>\n", + " <td>AB, s. m. en Langue Syriaque est le nom du der...</td>\n", + " <td>AB, s. m. en Langue Syriaque est le nom du der...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>1</td>\n", + " <td>38</td>\n", + " <td>AB</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-20-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>AB, s.m. en hébreu signifie pere ; d'où les Ch...</td>\n", + " <td>AB, s.m. en hébreu signifie pere ; d'où les Ch...</td>\n", + " <td>AB, s.m. 
en hébreu signifie pere ; d'où les Ch...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>1</td>\n", + " <td>39</td>\n", + " <td>ABA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-21-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>*ABA, s. ville de la Phocide, bâtie par les Ab...</td>\n", + " <td>*ABA, s. ville de la Phocide, bâtie par les Ab...</td>\n", + " <td>*ABA, s. ville de la Phocide, bâtie par les Ab...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>1</td>\n", + " <td>40</td>\n", + " <td>ABACA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-22-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* ABACA, s. Il ne paroît pas qu'on sache bien ...</td>\n", + " <td>* ABACA, s. Il ne paroît pas qu'on sache bien ...</td>\n", + " <td>* ABACA, s. Il ne paroît pas qu'on sache bien ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>39</th>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>ABACH</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-23-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* ABACH, s. petite ville d'Allemagne dans la b...</td>\n", + " <td>* ABACH, s. petite ville d'Allemagne dans la b...</td>\n", + " <td>* ABACH, s. petite ville d'Allemagne dans la b...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>40</th>\n", + " <td>1</td>\n", + " <td>42</td>\n", + " <td>ABACO</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>d'Alembert</td>\n", + " <td>v1-24-0</td>\n", + " <td>arithmétique</td>\n", + " <td>Mathématiques</td>\n", + " <td>ABACO, s. m. Quelques anciens Auteurs se serve...</td>\n", + " <td>ABACO, s. m. Quelques anciens Auteurs se serve...</td>\n", + " <td>ABACO, s. m. 
Quelques anciens Auteurs se serve...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>41</th>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>ABACOA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-25-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>* ABACOA, s. Isle de l'Amérique septentrionale...</td>\n", + " <td>* ABACOA, s. Isle de l'Amérique septentrionale...</td>\n", + " <td>* ABACOA, s. Isle de l'Amérique septentrionale...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>42</th>\n", + " <td>1</td>\n", + " <td>44</td>\n", + " <td>ABACOT</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-26-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* ABACOT, s. m. nom de l'ancienne parure dè\\nt...</td>\n", + " <td>* ABACOT, s. m. nom de l'ancienne parure dè\\nt...</td>\n", + " <td>* ABACOT, s. m. nom de l'ancienne parure dè\\nt...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>1</td>\n", + " <td>45</td>\n", + " <td>ABADA</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-27-0</td>\n", + " <td>histoirenaturelle</td>\n", + " <td>Histoire naturelle</td>\n", + " <td>* ABADA, s. m. c'est, dit-on, un animal qui\\ns...</td>\n", + " <td>* ABADA, s. m. c'est, dit-on, un animal qui\\ns...</td>\n", + " <td>* ABADA, s. m. c'est, dit-on, un animal qui\\ns...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>44</th>\n", + " <td>1</td>\n", + " <td>46</td>\n", + " <td>ABADDON</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-28-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* ABADDON, s. m. vient d'abad, perdre. C'est\\n...</td>\n", + " <td>* ABADDON, s. m. vient d'abad, perdre. C'est\\n...</td>\n", + " <td>* ABADDON, s. m. vient d'abad, perdre. 
C'est\\n...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>45</th>\n", + " <td>1</td>\n", + " <td>47</td>\n", + " <td>ABADIR ou ABADDIR</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-29-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>ABADIR ou ABADDIR, s. m. mot composé \\nde deux...</td>\n", + " <td>ABADIR ou ABADDIR, s. m. mot composé \\nde deux...</td>\n", + " <td>ABADIR ou ABADDIR, s. m. mot composé \\nde deux...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>46</th>\n", + " <td>1</td>\n", + " <td>48</td>\n", + " <td>ABACUZ</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot & Toussaint</td>\n", + " <td>v1-30-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* ABACUZ, s. m. pris adject. ce sont les biens...</td>\n", + " <td>* ABACUZ, s. m. pris adject. ce sont les biens...</td>\n", + " <td>* ABACUZ, s. m. pris adject. ce sont les biens...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>47</th>\n", + " <td>1</td>\n", + " <td>49</td>\n", + " <td>ABAJOUR</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Blondel</td>\n", + " <td>v1-31-0</td>\n", + " <td>architecture</td>\n", + " <td>Architecture</td>\n", + " <td>ABAJOUR, s. m. nom que les Architectes donnent...</td>\n", + " <td>ABAJOUR, s. m. nom que les Architectes donnent...</td>\n", + " <td>ABAJOUR, s. m. nom que les Architectes donnent...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>48</th>\n", + " <td>1</td>\n", + " <td>50</td>\n", + " <td>ABAISIR</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Malouin</td>\n", + " <td>v1-32-0</td>\n", + " <td>alchimie</td>\n", + " <td>Chimie</td>\n", + " <td>ABAISIR, s.m. Quelques Alchimistes se sont ser...</td>\n", + " <td>ABAISIR, s.m. Quelques Alchimistes se sont ser...</td>\n", + " <td>ABAISIR, s.m. 
Quelques Alchimistes se sont ser...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>49</th>\n", + " <td>1</td>\n", + " <td>51</td>\n", + " <td>ABAISSE</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot</td>\n", + " <td>v1-33-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* ABAISSE, s. f. c'est le nom que les Pâtissie...</td>\n", + " <td>* ABAISSE, s. f. c'est le nom que les Pâtissie...</td>\n", + " <td>* ABAISSE, s. f. c'est le nom que les Pâtissie...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 1 Title Page unclassified \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON unclassified \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS unclassified \n", + "3 1 5 A, a & a Grammaire \n", + "4 1 6 A unclassified \n", + "5 1 7 A unclassified \n", + "6 1 8 A unclassified \n", + "7 1 9 A, lettre symbolique unclassified \n", + "8 1 10 A, numismatique ou monétaire unclassified \n", + "9 1 11 A, lapidaire unclassified \n", + "10 1 12 A, lettre de suffrage unclassified \n", + "11 1 13 A, signe d'absolution unclassified \n", + "12 1 14 A cognitionibus unclassified \n", + "13 1 15 A curâ amicorum unclassified \n", + "14 1 16 A Ecrivains modernes \n", + "15 1 17 A Calendrier Julien \n", + "16 1 18 A. D. 
pending \n", + "17 1 19 A unclassified \n", + "18 1 20 A, signe des passions unclassified \n", + "19 1 21 A unclassified \n", + "20 1 22 A A A Chimie \n", + "21 1 23 A, Ä, ou Ä Ä Médecine \n", + "22 1 24 A unclassified \n", + "23 1 25 A pending \n", + "24 1 26 A unclassified \n", + "25 1 27 AA unclassified \n", + "26 1 28 AABAM unclassified \n", + "27 1 29 AACH ou ACH unclassified \n", + "28 1 30 AAHUS unclassified \n", + "29 1 31 AAM unclassified \n", + "30 1 32 AAR unclassified \n", + "31 1 33 Aar unclassified \n", + "32 1 34 AA ou AAS unclassified \n", + "33 1 35 AAS ou AASA unclassified \n", + "34 1 36 AB unclassified \n", + "35 1 37 AB unclassified \n", + "36 1 38 AB unclassified \n", + "37 1 39 ABA unclassified \n", + "38 1 40 ABACA unclassified \n", + "39 1 41 ABACH unclassified \n", + "40 1 42 ABACO unclassified \n", + "41 1 43 ABACOA unclassified \n", + "42 1 44 ABACOT unclassified \n", + "43 1 45 ABADA unclassified \n", + "44 1 46 ABADDON unclassified \n", + "45 1 47 ABADIR ou ABADDIR unclassified \n", + "46 1 48 ABACUZ unclassified \n", + "47 1 49 ABAJOUR unclassified \n", + "48 1 50 ABAISIR unclassified \n", + "49 1 51 ABAISSE unclassified \n", + "\n", + " classEDdA author \\\n", + "0 unclassified unsigned \n", + "1 unclassified Diderot & d'Alembert \n", + "2 unclassified d'Alembert \n", + "3 ordre Encyclopéd. Entend. Science de l'homme, ... 
Dumarsais5 \n", + "4 unclassified Dumarsais5 \n", + "5 unclassified Dumarsais \n", + "6 unclassified Mallet \n", + "7 unclassified Mallet \n", + "8 unclassified Mallet \n", + "9 unclassified Mallet \n", + "10 unclassified Mallet \n", + "11 unclassified Mallet5 \n", + "12 unclassified Diderot \n", + "13 unclassified Diderot \n", + "14 dans les Ecrivains modernes Mallet \n", + "15 dans le calendrier Julien Mallet \n", + "16 épistolaire Mallet \n", + "17 unclassified Diderot \n", + "18 unclassified Diderot \n", + "19 unclassified unsigned \n", + "20 Chimistes Malouin5 \n", + "21 Medecine Vandenesse \n", + "22 unclassified Mallet \n", + "23 caractere alphabétique Diderot \n", + "24 unclassified Diderot \n", + "25 unclassified Diderot \n", + "26 unclassified Malouin \n", + "27 unclassified Diderot \n", + "28 unclassified Diderot \n", + "29 unclassified Diderot \n", + "30 unclassified Diderot \n", + "31 unclassified Diderot \n", + "32 unclassified Diderot \n", + "33 unclassified Diderot \n", + "34 unclassified Mallet \n", + "35 unclassified Mallet \n", + "36 unclassified Mallet \n", + "37 unclassified Diderot \n", + "38 unclassified Diderot \n", + "39 unclassified Diderot \n", + "40 unclassified d'Alembert \n", + "41 unclassified Diderot \n", + "42 unclassified Diderot \n", + "43 unclassified Diderot \n", + "44 unclassified Diderot \n", + "45 unclassified Mallet \n", + "46 unclassified Diderot & Toussaint \n", + "47 unclassified Blondel \n", + "48 unclassified Malouin \n", + "49 unclassified Diderot \n", + "\n", + " id_enccre domaine_enccre ensemble_domaine_enccre \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 v1-1-0 grammaire Grammaire \n", + "4 v1-1-1 grammaire Grammaire \n", + "5 v1-1-2 grammaire Grammaire \n", + "6 v1-1-3 \n", + "7 v1-1-4 \n", + "8 v1-1-5 numismatique Médailles \n", + "9 v1-1-6 inscriptions Histoire \n", + "10 v1-1-7 \n", + "11 v1-1-8 \n", + "12 v1-2-0 \n", + "13 v1-3-0 \n", + "14 v1-4-0 caractères Caractères \n", + "15 v1-4-1 calendrier Histoire \n", + 
"16 v1-4-2 \n", + "17 v1-4-3 logique Philosophie \n", + "18 v1-4-4 \n", + "19 v1-4-5 \n", + "20 v1-5-0 chimie Chimie \n", + "21 v1-6-0 médecine Médecine - Chirurgie \n", + "22 v1-7-0 commerce Commerce \n", + "23 v1-8-0 ecriture Caractères \n", + "24 v1-9-0 géographie Géographie \n", + "25 v1-10-0 géographie Géographie \n", + "26 v1-11-0 alchimie Chimie \n", + "27 v1-12-0 géographie Géographie \n", + "28 v1-13-0 géographie Géographie \n", + "29 v1-14-0 \n", + "30 v1-15-0 géographie Géographie \n", + "31 v1-15-1 géographie Géographie \n", + "32 v1-16-0 géographie Géographie \n", + "33 v1-17-0 géographie Géographie \n", + "34 v1-18-0 \n", + "35 v1-19-0 calendrier Histoire \n", + "36 v1-20-0 \n", + "37 v1-21-0 géographie Géographie \n", + "38 v1-22-0 \n", + "39 v1-23-0 géographie Géographie \n", + "40 v1-24-0 arithmétique Mathématiques \n", + "41 v1-25-0 géographie Géographie \n", + "42 v1-26-0 \n", + "43 v1-27-0 histoirenaturelle Histoire naturelle \n", + "44 v1-28-0 \n", + "45 v1-29-0 \n", + "46 v1-30-0 \n", + "47 v1-31-0 architecture Architecture \n", + "48 v1-32-0 alchimie Chimie \n", + "49 v1-33-0 \n", + "\n", + " content \\\n", + "0 ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE... \n", + "1 A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\... \n", + "2 DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl... \n", + "3 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "4 A, mot, est 1. la troisieme personne du présen... \n", + "5 A, préposition vient du latin à , à dextris, à ... \n", + "6 A, étoit une lettre numérale parmi les Anciens... \n", + "7 A, lettre symbolique, étoit un hiéroglyphe che... \n", + "8 A, numismatique ou monétaire, sur le revers de... \n", + "9 A, lapidaire, dans les anciennes inscriptions ... \n", + "10 A, lettre de suffrage ; les Romains se servoie... \n", + "11 A, signe d'absolution, chez les Romains dans l... \n", + "12 * A cognitionibus. Scorpus fameux Agitateur du... \n", + "13 * A curâ amicorum. On lit dans quelques inscri... 
\n", + "14 A, dans les Ecrivains modernes, veut dire auss... \n", + "15 A, dans le calendrier Julien, est aussi la pre... \n", + "16 A. D. épistolaire ; ces deux caracteres dans l... \n", + "17 * A désigne une proposition générale affirmati... \n", + "18 * A, signe des passions ; selon certains Auteu... \n", + "19 A, est aussi une abbréviation dont on se sert ... \n", + "20 A A A, chez les Chimistes, signifie une amalga... \n", + "21 A, Ä, ou Ä Ä; on se sert de cette abbréviation... \n", + "22 A. Les Marchands Négocians, Banquiers, & Teneu... \n", + "23 * A, caractere alphabétique. Après avoir donné... \n", + "24 * A, s. petite riviere de France, qui a sa sou... \n", + "25 * AA, s. f. riviere de France, qui prend sa so... \n", + "26 AABAM, s. m. Quelques Alchimistes se sont serv... \n", + "27 * AACH ou ACH, s. f. petite ville d'Allemagne\\... \n", + "28 * AAHUS, s. petite ville d'Allemagne dans le c... \n", + "29 * AAM, s. mesure des Liquides, en usage à Amst... \n", + "30 * AAR, s. grande riviere qui a sa source proch... \n", + "31 * Aar, s. riviere d'Allemagne qui a sa source ... \n", + "32 * AA ou AAS, s. ou Fontaine des Arquebusades. ... \n", + "33 * AAS ou AASA, Fort de Norwege dans le Baillia... \n", + "34 AB, s. m. onzieme mois de l'année civile des H... \n", + "35 AB, s. m. en Langue Syriaque est le nom du der... \n", + "36 AB, s.m. en hébreu signifie pere ; d'où les Ch... \n", + "37 *ABA, s. ville de la Phocide, bâtie par les Ab... \n", + "38 * ABACA, s. Il ne paroît pas qu'on sache bien ... \n", + "39 * ABACH, s. petite ville d'Allemagne dans la b... \n", + "40 ABACO, s. m. Quelques anciens Auteurs se serve... \n", + "41 * ABACOA, s. Isle de l'Amérique septentrionale... \n", + "42 * ABACOT, s. m. nom de l'ancienne parure dè\\nt... \n", + "43 * ABADA, s. m. c'est, dit-on, un animal qui\\ns... \n", + "44 * ABADDON, s. m. vient d'abad, perdre. C'est\\n... \n", + "45 ABADIR ou ABADDIR, s. m. mot composé \\nde deux... \n", + "46 * ABACUZ, s. m. pris adject. 
ce sont les biens... \n", + "47 ABAJOUR, s. m. nom que les Architectes donnent... \n", + "48 ABAISIR, s.m. Quelques Alchimistes se sont ser... \n", + "49 * ABAISSE, s. f. c'est le nom que les Pâtissie... \n", + "\n", + " contentWithoutClass \\\n", + "0 ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE... \n", + "1 A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\... \n", + "2 DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl... \n", + "3 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "4 A, mot, est 1. la troisieme personne du présen... \n", + "5 A, préposition vient du latin à , à dextris, à ... \n", + "6 A, étoit une lettre numérale parmi les Anciens... \n", + "7 A, lettre symbolique, étoit un hiéroglyphe che... \n", + "8 A, numismatique ou monétaire, sur le revers de... \n", + "9 A, lapidaire, dans les anciennes inscriptions ... \n", + "10 A, lettre de suffrage ; les Romains se servoie... \n", + "11 A, signe d'absolution, chez les Romains dans l... \n", + "12 * A cognitionibus. Scorpus fameux Agitateur du... \n", + "13 * A curâ amicorum. On lit dans quelques inscri... \n", + "14 A, , veut dire aussi\\nl'an, comme A. D. anno D... \n", + "15 A, , est aussi la premiere\\ndes sept lettres d... \n", + "16 A. D. ; ces deux caracteres dans les\\nLettres... \n", + "17 * A désigne une proposition générale affirmati... \n", + "18 * A, signe des passions ; selon certains Auteu... \n", + "19 A, est aussi une abbréviation dont on se sert ... \n", + "20 A A A, chez les , signifie une amalgame,\\nou l... \n", + "21 A, Ä, ou Ä Ä; on se sert de cette abbréviation... \n", + "22 A. Les Marchands Négocians, Banquiers, & Teneu... \n", + "23 * A, . Après avoir donné les\\ndifférentes sign... \n", + "24 * A, s. petite riviere de France, qui a sa sou... \n", + "25 * AA, s. f. riviere de France, qui prend sa so... \n", + "26 AABAM, s. m. Quelques Alchimistes se sont serv... \n", + "27 * AACH ou ACH, s. f. petite ville d'Allemagne\\... \n", + "28 * AAHUS, s. 
petite ville d'Allemagne dans le c... \n", + "29 * AAM, s. mesure des Liquides, en usage à Amst... \n", + "30 * AAR, s. grande riviere qui a sa source proch... \n", + "31 * Aar, s. riviere d'Allemagne qui a sa source ... \n", + "32 * AA ou AAS, s. ou Fontaine des Arquebusades. ... \n", + "33 * AAS ou AASA, Fort de Norwege dans le Baillia... \n", + "34 AB, s. m. onzieme mois de l'année civile des H... \n", + "35 AB, s. m. en Langue Syriaque est le nom du der... \n", + "36 AB, s.m. en hébreu signifie pere ; d'où les Ch... \n", + "37 *ABA, s. ville de la Phocide, bâtie par les Ab... \n", + "38 * ABACA, s. Il ne paroît pas qu'on sache bien ... \n", + "39 * ABACH, s. petite ville d'Allemagne dans la b... \n", + "40 ABACO, s. m. Quelques anciens Auteurs se serve... \n", + "41 * ABACOA, s. Isle de l'Amérique septentrionale... \n", + "42 * ABACOT, s. m. nom de l'ancienne parure dè\\nt... \n", + "43 * ABADA, s. m. c'est, dit-on, un animal qui\\ns... \n", + "44 * ABADDON, s. m. vient d'abad, perdre. C'est\\n... \n", + "45 ABADIR ou ABADDIR, s. m. mot composé \\nde deux... \n", + "46 * ABACUZ, s. m. pris adject. ce sont les biens... \n", + "47 ABAJOUR, s. m. nom que les Architectes donnent... \n", + "48 ABAISIR, s.m. Quelques Alchimistes se sont ser... \n", + "49 * ABAISSE, s. f. c'est le nom que les Pâtissie... \n", + "\n", + " firstParagraph \n", + "0 ENCYCLOPÉDIE,\\nDICTIONNAIRE RAISONNÉ\\nDES SCIE... \n", + "1 A MONSEIGNEUR\\nLE COMTE D'ARGENSON,\\nMINISTRE\\... \n", + "2 DISCOURS PRÉLIMINAIRE\\nDES EDITEURS.\\nL'Encycl... \n", + "3 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "4 A, mot, est 1. la troisieme personne du présen... \n", + "5 A, préposition vient du latin à , à dextris, à ... \n", + "6 A, étoit une lettre numérale parmi les Anciens... \n", + "7 A, lettre symbolique, étoit un hiéroglyphe che... \n", + "8 A, numismatique ou monétaire, sur le revers de... \n", + "9 A, lapidaire, dans les anciennes inscriptions ... 
\n", + "10 A, lettre de suffrage ; les Romains se servoie... \n", + "11 A, signe d'absolution, chez les Romains dans l... \n", + "12 * A cognitionibus. Scorpus fameux Agitateur du... \n", + "13 * A curâ amicorum. On lit dans quelques inscri... \n", + "14 A, , veut dire aussi\\nl'an, comme A. D. anno D... \n", + "15 A, , est aussi la premiere\\ndes sept lettres d... \n", + "16 A. D. ; ces deux caracteres dans les\\nLettres... \n", + "17 * A désigne une proposition générale affirmati... \n", + "18 * A, signe des passions ; selon certains Auteu... \n", + "19 A, est aussi une abbréviation dont on se sert ... \n", + "20 A A A, chez les , signifie une amalgame,\\nou l... \n", + "21 A, Ä, ou Ä Ä; on se sert de cette abbréviation... \n", + "22 A. Les Marchands Négocians, Banquiers, & Teneu... \n", + "23 * A, . Après avoir donné les\\ndifférentes sign... \n", + "24 * A, s. petite riviere de France, qui a sa sou... \n", + "25 * AA, s. f. riviere de France, qui prend sa so... \n", + "26 AABAM, s. m. Quelques Alchimistes se sont serv... \n", + "27 * AACH ou ACH, s. f. petite ville d'Allemagne\\... \n", + "28 * AAHUS, s. petite ville d'Allemagne dans le c... \n", + "29 * AAM, s. mesure des Liquides, en usage à Amst... \n", + "30 * AAR, s. grande riviere qui a sa source proch... \n", + "31 * Aar, s. riviere d'Allemagne qui a sa source ... \n", + "32 * AA ou AAS, s. ou Fontaine des Arquebusades. ... \n", + "33 * AAS ou AASA, Fort de Norwege dans le Baillia... \n", + "34 AB, s. m. onzieme mois de l'année civile des H... \n", + "35 AB, s. m. en Langue Syriaque est le nom du der... \n", + "36 AB, s.m. en hébreu signifie pere ; d'où les Ch... \n", + "37 *ABA, s. ville de la Phocide, bâtie par les Ab... \n", + "38 * ABACA, s. Il ne paroît pas qu'on sache bien ... \n", + "39 * ABACH, s. petite ville d'Allemagne dans la b... \n", + "40 ABACO, s. m. Quelques anciens Auteurs se serve... \n", + "41 * ABACOA, s. Isle de l'Amérique septentrionale... \n", + "42 * ABACOT, s. m. 
nom de l'ancienne parure dè\\nt... \n", + "43 * ABADA, s. m. c'est, dit-on, un animal qui\\ns... \n", + "44 * ABADDON, s. m. vient d'abad, perdre. C'est\\n... \n", + "45 ABADIR ou ABADDIR, s. m. mot composé \\nde deux... \n", + "46 * ABACUZ, s. m. pris adject. ce sont les biens... \n", + "47 ABAJOUR, s. m. nom que les Architectes donnent... \n", + "48 ABAISIR, s.m. Quelques Alchimistes se sont ser... \n", + "49 * ABAISSE, s. f. c'est le nom que les Pâtissie... " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "\n", + "df['id_enccre'], df['domaine_enccre'], df['ensemble_domaine_enccre'], df['content'], df['contentWithoutClass'], df['firstParagraph'] = df.apply(lambda row: getDomaineEnccre(row.volume, row.numero, row.classEDdA), axis=1).T.values\n", + "\n", + "#df['id_enccre'], df['domaine_enccre'], df['ensemble_domaine_enccre'], df['content'], df['contentWithoutClass'], df['firstParagraph'] = getDomaineEnccre(df.volume, df.numero, df.classEDdA)\n", + "df.head(50)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daily-office", + "metadata": {}, + "outputs": [], + "source": [ + "# nombre d'articles non classés par ENCCRE (à partir de la correspondance automatique)\n", + "len(df.loc[(df['domaine_enccre']==\"\")])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suited-methodology", + "metadata": {}, + "outputs": [], + "source": [ + "# nombre d'article non classés par ARTFL\n", + "len(df.loc[(df['normClass']==\"unclassified\")])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "special-investigation", + "metadata": {}, + "outputs": [], + "source": [ + "# nombre de classe ENCCRE\n", + "\n", + "classes_enccre = df.groupby(['domaine_enccre']).count()\n", + "classes_enccre.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "legendary-independence", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "312" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(classes_enccre)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "theoretical-marathon", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "lonely-efficiency", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Title Page</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>\\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ...</td>\n", + " <td>\\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ...</td>\n", + " <td>\\nM. DCC. 
L I.\\nAVEC APPROBATION ET PRIVILEGE ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>\\nDIDEROT & D'ALEMBERT.\\n</td>\n", + " <td>\\nDIDEROT & D'ALEMBERT.\\n</td>\n", + " <td>\\nDIDEROT & D'ALEMBERT.\\n</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>d'Alembert</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>\\nVoilà dans son ordre naturel, & sans démembr...</td>\n", + " <td>\\nVoilà dans son ordre naturel, & sans démembr...</td>\n", + " <td>\\nVoilà dans son ordre naturel, & sans démembr...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire;</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire;</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. 
A, comme mot, est aussi une préposition, ...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 1 Title Page unclassified \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON unclassified \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS unclassified \n", + "3 1 5 A, a & a Grammaire \n", + "4 1 6 A unclassified \n", + "\n", + " classEDdA author \\\n", + "0 unclassified unsigned \n", + "1 unclassified Diderot & d'Alembert \n", + "2 unclassified d'Alembert \n", + "3 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 \n", + "4 unclassified Dumarsais5 \n", + "\n", + " id_enccre domaine_enccre ensemble_domaine_enccre \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 v1-1-0 grammaire; Grammaire \n", + "4 v1-1-1 grammaire; Grammaire \n", + "\n", + " content \\\n", + "0 \\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ... \n", + "1 \\nDIDEROT & D'ALEMBERT.\\n \n", + "2 \\nVoilà dans son ordre naturel, & sans démembr... \n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... \n", + "\n", + " contentWithoutClass \\\n", + "0 \\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ... \n", + "1 \\nDIDEROT & D'ALEMBERT.\\n \n", + "2 \\nVoilà dans son ordre naturel, & sans démembr... \n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... \n", + "\n", + " firstParagraph \n", + "0 \\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ... \n", + "1 \\nDIDEROT & D'ALEMBERT.\\n \n", + "2 \\nVoilà dans son ordre naturel, & sans démembr... \n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... 
" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "skilled-channel", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "least-practice", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "circular-service", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "possible-sleeping", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "streaming-savings", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "fourth-involvement", + "metadata": {}, + "outputs": [], + "source": [ + "# enregistrement du dataframe dans un fichier tsv\n", + "df.to_csv('EDdA_dataframe_withContent.tsv',sep='\\t',index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "framed-sodium", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('EDdA_dataframe_withContent.tsv', sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "comparable-envelope", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "tutorial-savannah", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "74190" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "minus-waterproof", + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace= True)\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "scenic-sugar", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "61673" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "unavailable-indiana", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. 
On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " </tr>\n", + " 
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "3 1 5 A, a & a Grammaire \n", + "4 1 6 A unclassified \n", + "5 1 7 A unclassified \n", + "8 1 10 A, numismatique ou monétaire unclassified \n", + "9 1 11 A, lapidaire unclassified \n", + "\n", + " classEDdA author id_enccre \\\n", + "3 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", + "4 unclassified Dumarsais5 v1-1-1 \n", + "5 unclassified Dumarsais v1-1-2 \n", + "8 unclassified Mallet v1-1-5 \n", + "9 unclassified Mallet v1-1-6 \n", + "\n", + " domaine_enccre ensemble_domaine_enccre \\\n", + "3 grammaire Grammaire \n", + "4 grammaire Grammaire \n", + "5 grammaire Grammaire \n", + "8 numismatique Médailles \n", + "9 inscriptions Histoire \n", + "\n", + " content \\\n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... \n", + "5 \\nEn terme de Grammaire, & sur-tout de Grammai... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " contentWithoutClass \\\n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... \n", + "5 \\nEn terme de Grammaire, & sur-tout de Grammai... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " firstParagraph \n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... \n", + "5 \\nEn terme de Grammaire, & sur-tout de Grammai... \n", + "8 \\nA, numismatique ou monétaire, sur le revers ... \n", + "9 \\nA, lapidaire, dans les anciennes inscription... 
" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ahead-pendant", + "metadata": {}, + "outputs": [], + "source": [ + "def addNbWord(content):\n", + " return len(content.split(' '))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "hearing-backup", + "metadata": {}, + "outputs": [], + "source": [ + "df['nb_word'] = df.apply(lambda row: addNbWord(row.content), axis=1).T.values\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suffering-athletics", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[(df['nb_word']>=15)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "needed-behavior", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mature-norfolk", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "green-afternoon", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "suffering-puppy", + "metadata": {}, + "outputs": [], + "source": [ + "df_unclassified = df.loc[(df['normClass']==\"unclassified\")]\n", + "df_classified = df.loc[(df['normClass']!=\"unclassified\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "disturbed-constitution", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12685\n", + "61505\n" + ] + } + ], + "source": [ + "print(len(df_unclassified))\n", + "print(len(df_classified))" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "fatty-bouquet", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "pharmaceutical-presence", + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "seasonal-suspect", + "metadata": {}, + "outputs": [], + "source": [ + "# enregistrement du dataframe dans un fichier tsv\n", + "df.to_csv('EDdA_dataframe_withContent.tsv',sep='\\t',index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "id": "opposed-binding", + "metadata": {}, + "outputs": [], + "source": [ + "######\n", + "df = pd.read_csv('EDdA_dataframe_withContent.tsv', sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "id": "banner-beijing", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>\\n3. 
On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " <td>38</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>\\n2. A, comme mot, est aussi une préposition, ...</td>\n", + " <td>18</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " <td>\\nEn terme de Grammaire, & sur-tout de Grammai...</td>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>\\nA, numismatique ou monétaire, sur le revers ...</td>\n", + " <td>112</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " 
<td>\\nA, lapidaire, dans les anciennes inscription...</td>\n", + " <td>80</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 5 A, a & a Grammaire \n", + "1 1 6 A unclassified \n", + "2 1 7 A unclassified \n", + "3 1 10 A, numismatique ou monétaire unclassified \n", + "4 1 11 A, lapidaire unclassified \n", + "\n", + " classEDdA author id_enccre \\\n", + "0 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", + "1 unclassified Dumarsais5 v1-1-1 \n", + "2 unclassified Dumarsais v1-1-2 \n", + "3 unclassified Mallet v1-1-5 \n", + "4 unclassified Mallet v1-1-6 \n", + "\n", + " domaine_enccre ensemble_domaine_enccre \\\n", + "0 grammaire Grammaire \n", + "1 grammaire Grammaire \n", + "2 grammaire Grammaire \n", + "3 numismatique Médailles \n", + "4 inscriptions Histoire \n", + "\n", + " content \\\n", + "0 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "1 \\n2. A, comme mot, est aussi une préposition, ... \n", + "2 \\nEn terme de Grammaire, & sur-tout de Grammai... \n", + "3 \\nA, numismatique ou monétaire, sur le revers ... \n", + "4 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " contentWithoutClass \\\n", + "0 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "1 \\n2. A, comme mot, est aussi une préposition, ... \n", + "2 \\nEn terme de Grammaire, & sur-tout de Grammai... \n", + "3 \\nA, numismatique ou monétaire, sur le revers ... \n", + "4 \\nA, lapidaire, dans les anciennes inscription... \n", + "\n", + " firstParagraph nb_word \n", + "0 \\n3. On dit de quelqu'un qui n'a rien fait, ri... 38 \n", + "1 \\n2. A, comme mot, est aussi une préposition, ... 18 \n", + "2 \\nEn terme de Grammaire, & sur-tout de Grammai... 24 \n", + "3 \\nA, numismatique ou monétaire, sur le revers ... 112 \n", + "4 \\nA, lapidaire, dans les anciennes inscription... 
80 " + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "innocent-stability", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 183, + "id": "classical-receipt", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7837" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.loc[(df['nb_word']<=15)])" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "id": "featured-tennis", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1695" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.loc[(df['nb_word']<=15) & (df['ensemble_domaine_enccre']==\"Géographie\")])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "neither-idaho", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "expanded-tunnel", + "metadata": {}, + "outputs": [], + "source": [ + "df_old = pd.read_csv('EDdA_dataframe_withContent_old.tsv', sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "valid-manor", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>author</th>\n", + " <th>normClass_artfl</th>\n", + " 
<th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Title Page</td>\n", + " <td>unsigned</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>A MONSEIGNEUR LE COMTE D'ARGENSON</td>\n", + " <td>Diderot & d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\nDIDEROT & D'ALEMBERT.\\n</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>DISCOURS PRÉLIMINAIRE DES EDITEURS</td>\n", + " <td>d'Alembert</td>\n", + " <td>unclassified</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>\\nVoilà dans son ordre naturel, & sans démembr...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Dumarsais5</td>\n", + " <td>Grammaire</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire;</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n3. On dit de quelqu'un qui n'a rien fait, ri...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>Dumarsais5</td>\n", + " <td>unclassified</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire;</td>\n", + " <td>Grammaire</td>\n", + " <td>\\n2. 
A, comme mot, est aussi une préposition, ...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head author \\\n", + "0 1 1 Title Page unsigned \n", + "1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert \n", + "2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert \n", + "3 1 5 A, a & a Dumarsais5 \n", + "4 1 6 A Dumarsais5 \n", + "\n", + " normClass_artfl id_enccre domaine_enccre ensemble_domaine_enccre \\\n", + "0 unclassified NaN NaN NaN \n", + "1 unclassified NaN NaN NaN \n", + "2 unclassified NaN NaN NaN \n", + "3 Grammaire v1-1-0 grammaire; Grammaire \n", + "4 unclassified v1-1-1 grammaire; Grammaire \n", + "\n", + " content \n", + "0 \\nM. DCC. L I.\\nAVEC APPROBATION ET PRIVILEGE ... \n", + "1 \\nDIDEROT & D'ALEMBERT.\\n \n", + "2 \\nVoilà dans son ordre naturel, & sans démembr... \n", + "3 \\n3. On dit de quelqu'un qui n'a rien fait, ri... \n", + "4 \\n2. A, comme mot, est aussi une préposition, ... " + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_old.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "focused-bulgarian", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3654\n" + ] + } + ], + "source": [ + "def countDomaine(domaine):\n", + " return str(domaine).count(';')\n", + "\n", + "df_old['nb_domaine'] = df_old.apply(lambda row: countDomaine(row.ensemble_domaine_enccre), axis=1).T.values\n", + "\n", + "print(len(df_old.loc[(df_old['nb_domaine']>0)]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "informative-chess", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "covered-spine", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "endless-cathedral", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "(66056, 13)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "corrected-batman", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.loc[(df['nb_word']>=15)]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "documentary-prince", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(66056, 13)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "opened-november", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "\n", + "train_x, validation_x, train_y, validation_y = train_test_split(df, df[\"ensemble_domaine_enccre\"], test_size=0.2, random_state=42, stratify = df[\"ensemble_domaine_enccre\"] )\n", + "\n", + "train, test_x, train_labels, test_y = train_test_split(train_x, train_x[\"ensemble_domaine_enccre\"], test_size=0.3, random_state=42, stratify = train_x[\"ensemble_domaine_enccre\"] )\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "noticed-evanescence", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(36990, 13)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "welcome-homework", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(13212, 13)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "returning-george", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "(15854, 13)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "thorough-senator", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>46001</th>\n", + " <td>11</td>\n", + " <td>2973</td>\n", + " <td>ORNIS</td>\n", + " <td>Commerce</td>\n", + " <td>Comm.</td>\n", + " <td>unsigned</td>\n", + " <td>v11-1767-0</td>\n", + " <td>commerce</td>\n", + " <td>Commerce</td>\n", + " <td>ORNIS, s. m. toile des Indes, (Comm.) sortes d...</td>\n", + " <td>ORNIS, s. m. toile des Indes, () sortes de\\nto...</td>\n", + " <td>ORNIS, s. m. toile des Indes, () sortes de\\nto...</td>\n", + " <td>45</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15442</th>\n", + " <td>3</td>\n", + " <td>3525</td>\n", + " <td>COMPRENDRE</td>\n", + " <td>Philosophie</td>\n", + " <td>terme de Philosophie,</td>\n", + " <td>Diderot</td>\n", + " <td>v3-1722-0</td>\n", + " <td></td>\n", + " <td></td>\n", + " <td>* COMPRENDRE, v. act. 
terme de Philosophie,\\nc...</td>\n", + " <td>* COMPRENDRE, v. act. \\nc'est appercevoir la l...</td>\n", + " <td>* COMPRENDRE, v. act. \\nc'est appercevoir la l...</td>\n", + " <td>92</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2558</th>\n", + " <td>1</td>\n", + " <td>2560</td>\n", + " <td>ANCRE</td>\n", + " <td>Marine</td>\n", + " <td>Marine</td>\n", + " <td>d'Alembert & Diderot</td>\n", + " <td>v1-1865-0</td>\n", + " <td>marine</td>\n", + " <td>Marine</td>\n", + " <td>ANCRE, s. f. (Marine.) est un instrument de fe...</td>\n", + " <td>ANCRE, s. f. (.) est un instrument de fer\\nABC...</td>\n", + " <td>ANCRE, s. f. (.) est un instrument de fer\\nABC...</td>\n", + " <td>3327</td>\n", + " </tr>\n", + " <tr>\n", + " <th>70433</th>\n", + " <td>16</td>\n", + " <td>4241</td>\n", + " <td>VAKEBARO</td>\n", + " <td>Géographie moderne</td>\n", + " <td>Géog. mod.</td>\n", + " <td>unsigned</td>\n", + " <td>v16-2587-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>VAKEBARO, (Géog. mod.) vallée du royaume\\nd'Es...</td>\n", + " <td>VAKEBARO, () vallée du royaume\\nd'Espagne dans...</td>\n", + " <td>VAKEBARO, () vallée du royaume\\nd'Espagne dans...</td>\n", + " <td>34</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34129</th>\n", + " <td>8</td>\n", + " <td>3281</td>\n", + " <td>INSPECTEUR</td>\n", + " <td>Histoire ancienne</td>\n", + " <td>Hist. anc.</td>\n", + " <td>unsigned</td>\n", + " <td>v8-2533-0</td>\n", + " <td>histoire</td>\n", + " <td>Histoire</td>\n", + " <td>INSPECTEUR, s. m. inspector ; (Hist. anc.) cel...</td>\n", + " <td>INSPECTEUR, s. m. inspector ; () celui \\nà qui...</td>\n", + " <td>INSPECTEUR, s. m. inspector ; () celui \\nà qui...</td>\n", + " <td>102</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass classEDdA \\\n", + "46001 11 2973 ORNIS Commerce Comm. 
\n", + "15442 3 3525 COMPRENDRE Philosophie terme de Philosophie, \n", + "2558 1 2560 ANCRE Marine Marine \n", + "70433 16 4241 VAKEBARO Géographie moderne Géog. mod. \n", + "34129 8 3281 INSPECTEUR Histoire ancienne Hist. anc. \n", + "\n", + " author id_enccre domaine_enccre \\\n", + "46001 unsigned v11-1767-0 commerce \n", + "15442 Diderot v3-1722-0 \n", + "2558 d'Alembert & Diderot v1-1865-0 marine \n", + "70433 unsigned v16-2587-0 géographie \n", + "34129 unsigned v8-2533-0 histoire \n", + "\n", + " ensemble_domaine_enccre \\\n", + "46001 Commerce \n", + "15442 \n", + "2558 Marine \n", + "70433 Géographie \n", + "34129 Histoire \n", + "\n", + " content \\\n", + "46001 ORNIS, s. m. toile des Indes, (Comm.) sortes d... \n", + "15442 * COMPRENDRE, v. act. terme de Philosophie,\\nc... \n", + "2558 ANCRE, s. f. (Marine.) est un instrument de fe... \n", + "70433 VAKEBARO, (Géog. mod.) vallée du royaume\\nd'Es... \n", + "34129 INSPECTEUR, s. m. inspector ; (Hist. anc.) cel... \n", + "\n", + " contentWithoutClass \\\n", + "46001 ORNIS, s. m. toile des Indes, () sortes de\\nto... \n", + "15442 * COMPRENDRE, v. act. \\nc'est appercevoir la l... \n", + "2558 ANCRE, s. f. (.) est un instrument de fer\\nABC... \n", + "70433 VAKEBARO, () vallée du royaume\\nd'Espagne dans... \n", + "34129 INSPECTEUR, s. m. inspector ; () celui \\nà qui... \n", + "\n", + " firstParagraph nb_word \n", + "46001 ORNIS, s. m. toile des Indes, () sortes de\\nto... 45 \n", + "15442 * COMPRENDRE, v. act. \\nc'est appercevoir la l... 92 \n", + "2558 ANCRE, s. f. (.) est un instrument de fer\\nABC... 3327 \n", + "70433 VAKEBARO, () vallée du royaume\\nd'Espagne dans... 34 \n", + "34129 INSPECTEUR, s. m. inspector ; () celui \\nà qui... 
102 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_x.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "hearing-moses", + "metadata": {}, + "outputs": [], + "source": [ + "train.to_csv('training_set.tsv',sep='\\t',index=False) \n", + "validation_x.to_csv('validation_set.tsv',sep='\\t',index=False) \n", + "test_x.to_csv('test_set.tsv',sep='\\t',index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "exterior-praise", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>counts</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td></td>\n", + " <td>10053</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>1077</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Anatomie</td>\n", + " <td>1021</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Antiquité</td>\n", + " <td>1336</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Architecture</td>\n", + " <td>1357</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Arts et métiers</td>\n", + " <td>550</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Beaux-arts</td>\n", + " <td>427</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Belles-lettres - Poésie</td>\n", + " <td>1026</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Blason</td>\n", + " 
<td>526</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Caractères</td>\n", + " <td>113</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Chasse</td>\n", + " <td>516</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Chimie</td>\n", + " <td>478</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Commerce</td>\n", + " <td>1823</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>6052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Economie domestique</td>\n", + " <td>131</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Grammaire</td>\n", + " <td>2397</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Géographie</td>\n", + " <td>11959</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Histoire</td>\n", + " <td>3025</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Histoire naturelle</td>\n", + " <td>4707</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Jeu</td>\n", + " <td>279</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Marine</td>\n", + " <td>1893</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>Maréchage - Manège</td>\n", + " <td>494</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>Mathématiques</td>\n", + " <td>681</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Mesure</td>\n", + " <td>179</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Militaire (Art) - Guerre - Arme</td>\n", + " <td>1265</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>Minéralogie</td>\n", + " <td>109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Monnaie</td>\n", + " <td>309</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Musique</td>\n", + " <td>681</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>Médailles</td>\n", + " <td>116</td>\n", + " </tr>\n", + " <tr>\n", + 
" <th>29</th>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>2227</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>Métiers</td>\n", + " <td>5083</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>Pharmacie</td>\n", + " <td>311</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>Philosophie</td>\n", + " <td>478</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>Physique - [Sciences physico-mathématiques]</td>\n", + " <td>1286</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>Politique</td>\n", + " <td>114</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>Pêche</td>\n", + " <td>199</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>Religion</td>\n", + " <td>1623</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>Spectacle</td>\n", + " <td>47</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>Superstition</td>\n", + " <td>108</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ensemble_domaine_enccre counts\n", + "0 10053\n", + "1 Agriculture - Economie rustique 1077\n", + "2 Anatomie 1021\n", + "3 Antiquité 1336\n", + "4 Architecture 1357\n", + "5 Arts et métiers 550\n", + "6 Beaux-arts 427\n", + "7 Belles-lettres - Poésie 1026\n", + "8 Blason 526\n", + "9 Caractères 113\n", + "10 Chasse 516\n", + "11 Chimie 478\n", + "12 Commerce 1823\n", + "13 Droit - Jurisprudence 6052\n", + "14 Economie domestique 131\n", + "15 Grammaire 2397\n", + "16 Géographie 11959\n", + "17 Histoire 3025\n", + "18 Histoire naturelle 4707\n", + "19 Jeu 279\n", + "20 Marine 1893\n", + "21 Maréchage - Manège 494\n", + "22 Mathématiques 681\n", + "23 Mesure 179\n", + "24 Militaire (Art) - Guerre - Arme 1265\n", + "25 Minéralogie 109\n", + "26 Monnaie 309\n", + "27 Musique 681\n", + "28 Médailles 116\n", + "29 Médecine - Chirurgie 2227\n", + "30 Métiers 5083\n", + "31 Pharmacie 311\n", + "32 Philosophie 478\n", + "33 Physique - 
[Sciences physico-mathématiques] 1286\n", + "34 Politique 114\n", + "35 Pêche 199\n", + "36 Religion 1623\n", + "37 Spectacle 47\n", + "38 Superstition 108" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "unable-agenda", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>counts</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td></td>\n", + " <td>5629</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>603</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Anatomie</td>\n", + " <td>572</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Antiquité</td>\n", + " <td>748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Architecture</td>\n", + " <td>760</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Arts et métiers</td>\n", + " <td>308</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Beaux-arts</td>\n", + " <td>239</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Belles-lettres - Poésie</td>\n", + " <td>575</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Blason</td>\n", + " <td>295</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Caractères</td>\n", + " <td>63</td>\n", + " </tr>\n", 
+ " <tr>\n", + " <th>10</th>\n", + " <td>Chasse</td>\n", + " <td>289</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Chimie</td>\n", + " <td>267</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Commerce</td>\n", + " <td>1021</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>3389</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Economie domestique</td>\n", + " <td>74</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Grammaire</td>\n", + " <td>1343</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Géographie</td>\n", + " <td>6697</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Histoire</td>\n", + " <td>1694</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Histoire naturelle</td>\n", + " <td>2636</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Jeu</td>\n", + " <td>156</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Marine</td>\n", + " <td>1060</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>Maréchage - Manège</td>\n", + " <td>277</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>Mathématiques</td>\n", + " <td>381</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Mesure</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Militaire (Art) - Guerre - Arme</td>\n", + " <td>708</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>Minéralogie</td>\n", + " <td>61</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Monnaie</td>\n", + " <td>173</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Musique</td>\n", + " <td>382</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>Médailles</td>\n", + " <td>65</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>1247</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " 
<td>Métiers</td>\n", + " <td>2846</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>Pharmacie</td>\n", + " <td>174</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>Philosophie</td>\n", + " <td>267</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>Physique - [Sciences physico-mathématiques]</td>\n", + " <td>720</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>Politique</td>\n", + " <td>64</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>Pêche</td>\n", + " <td>111</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>Religion</td>\n", + " <td>909</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>Spectacle</td>\n", + " <td>27</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>Superstition</td>\n", + " <td>60</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ensemble_domaine_enccre counts\n", + "0 5629\n", + "1 Agriculture - Economie rustique 603\n", + "2 Anatomie 572\n", + "3 Antiquité 748\n", + "4 Architecture 760\n", + "5 Arts et métiers 308\n", + "6 Beaux-arts 239\n", + "7 Belles-lettres - Poésie 575\n", + "8 Blason 295\n", + "9 Caractères 63\n", + "10 Chasse 289\n", + "11 Chimie 267\n", + "12 Commerce 1021\n", + "13 Droit - Jurisprudence 3389\n", + "14 Economie domestique 74\n", + "15 Grammaire 1343\n", + "16 Géographie 6697\n", + "17 Histoire 1694\n", + "18 Histoire naturelle 2636\n", + "19 Jeu 156\n", + "20 Marine 1060\n", + "21 Maréchage - Manège 277\n", + "22 Mathématiques 381\n", + "23 Mesure 100\n", + "24 Militaire (Art) - Guerre - Arme 708\n", + "25 Minéralogie 61\n", + "26 Monnaie 173\n", + "27 Musique 382\n", + "28 Médailles 65\n", + "29 Médecine - Chirurgie 1247\n", + "30 Métiers 2846\n", + "31 Pharmacie 174\n", + "32 Philosophie 267\n", + "33 Physique - [Sciences physico-mathématiques] 720\n", + "34 Politique 64\n", + "35 Pêche 111\n", + "36 Religion 909\n", + "37 Spectacle 27\n", + "38 Superstition 
60" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "id": "potential-friday", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>counts</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>212</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Anatomie</td>\n", + " <td>187</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Antiquité</td>\n", + " <td>263</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Architecture</td>\n", + " <td>265</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Arts et métiers</td>\n", + " <td>108</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Beaux-arts</td>\n", + " <td>84</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Belles-lettres - Poésie</td>\n", + " <td>195</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Blason</td>\n", + " <td>87</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Caractères</td>\n", + " <td>22</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Chasse</td>\n", + " <td>102</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Chimie</td>\n", + " <td>94</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " 
<td>Commerce</td>\n", + " <td>361</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>1181</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Economie domestique</td>\n", + " <td>26</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Grammaire</td>\n", + " <td>466</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Géographie</td>\n", + " <td>2368</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Histoire</td>\n", + " <td>592</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Histoire naturelle</td>\n", + " <td>931</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Jeu</td>\n", + " <td>54</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Marine</td>\n", + " <td>363</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Maréchage - Manège</td>\n", + " <td>97</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>Mathématiques</td>\n", + " <td>126</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>Mesure</td>\n", + " <td>35</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Militaire (Art) - Guerre - Arme</td>\n", + " <td>247</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Minéralogie</td>\n", + " <td>21</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>Monnaie</td>\n", + " <td>61</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Musique</td>\n", + " <td>133</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Médailles</td>\n", + " <td>23</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>428</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>Métiers</td>\n", + " <td>1006</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>Pharmacie</td>\n", + " <td>59</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>Philosophie</td>\n", + " <td>93</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>Physique - [Sciences physico-mathématiques]</td>\n", + " <td>247</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>Politique</td>\n", + " <td>22</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>Pêche</td>\n", + " <td>39</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>Religion</td>\n", + " <td>319</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>Spectacle</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>Superstition</td>\n", + " <td>21</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ensemble_domaine_enccre counts\n", + "0 Agriculture - Economie rustique 212\n", + "1 Anatomie 187\n", + "2 Antiquité 263\n", + "3 Architecture 265\n", + "4 Arts et métiers 108\n", + "5 Beaux-arts 84\n", + "6 Belles-lettres - Poésie 195\n", + "7 Blason 87\n", + "8 Caractères 22\n", + "9 Chasse 102\n", + "10 Chimie 94\n", + "11 Commerce 361\n", + "12 Droit - Jurisprudence 1181\n", + "13 Economie domestique 26\n", + "14 Grammaire 466\n", + "15 Géographie 2368\n", + "16 Histoire 592\n", + "17 Histoire naturelle 931\n", + "18 Jeu 54\n", + "19 Marine 363\n", + "20 Maréchage - Manège 97\n", + "21 Mathématiques 126\n", + "22 Mesure 35\n", + "23 Militaire (Art) - Guerre - Arme 247\n", + "24 Minéralogie 21\n", + "25 Monnaie 61\n", + "26 Musique 133\n", + "27 Médailles 23\n", + "28 Médecine - Chirurgie 428\n", + "29 Métiers 1006\n", + "30 Pharmacie 59\n", + "31 Philosophie 93\n", + "32 Physique - [Sciences physico-mathématiques] 247\n", + "33 Politique 22\n", + "34 Pêche 39\n", + "35 Religion 319\n", + "36 Spectacle 9\n", + "37 Superstition 21" + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_x.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": 
"fatty-pharmacy", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>counts</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>254</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Anatomie</td>\n", + " <td>224</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Antiquité</td>\n", + " <td>316</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Architecture</td>\n", + " <td>318</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Arts et métiers</td>\n", + " <td>129</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Beaux-arts</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Belles-lettres - Poésie</td>\n", + " <td>235</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Blason</td>\n", + " <td>105</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Caractères</td>\n", + " <td>27</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Chasse</td>\n", + " <td>122</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Chimie</td>\n", + " <td>112</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Commerce</td>\n", + " <td>433</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>1417</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Economie domestique</td>\n", + " <td>31</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>14</th>\n", + " <td>Grammaire</td>\n", + " <td>560</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Géographie</td>\n", + " <td>2842</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Histoire</td>\n", + " <td>711</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Histoire naturelle</td>\n", + " <td>1118</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Jeu</td>\n", + " <td>65</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Marine</td>\n", + " <td>435</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Maréchage - Manège</td>\n", + " <td>116</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>Mathématiques</td>\n", + " <td>151</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>Mesure</td>\n", + " <td>42</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Militaire (Art) - Guerre - Arme</td>\n", + " <td>296</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Minéralogie</td>\n", + " <td>26</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>Monnaie</td>\n", + " <td>73</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Musique</td>\n", + " <td>160</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Médailles</td>\n", + " <td>28</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>513</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>Métiers</td>\n", + " <td>1207</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>Pharmacie</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>Philosophie</td>\n", + " <td>112</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>Physique - [Sciences physico-mathématiques]</td>\n", + " <td>296</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>Politique</td>\n", + " <td>26</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", 
+ " <td>Pêche</td>\n", + " <td>47</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>Religion</td>\n", + " <td>383</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>Spectacle</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>Superstition</td>\n", + " <td>25</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ensemble_domaine_enccre counts\n", + "0 Agriculture - Economie rustique 254\n", + "1 Anatomie 224\n", + "2 Antiquité 316\n", + "3 Architecture 318\n", + "4 Arts et métiers 129\n", + "5 Beaux-arts 100\n", + "6 Belles-lettres - Poésie 235\n", + "7 Blason 105\n", + "8 Caractères 27\n", + "9 Chasse 122\n", + "10 Chimie 112\n", + "11 Commerce 433\n", + "12 Droit - Jurisprudence 1417\n", + "13 Economie domestique 31\n", + "14 Grammaire 560\n", + "15 Géographie 2842\n", + "16 Histoire 711\n", + "17 Histoire naturelle 1118\n", + "18 Jeu 65\n", + "19 Marine 435\n", + "20 Maréchage - Manège 116\n", + "21 Mathématiques 151\n", + "22 Mesure 42\n", + "23 Militaire (Art) - Guerre - Arme 296\n", + "24 Minéralogie 26\n", + "25 Monnaie 73\n", + "26 Musique 160\n", + "27 Médailles 28\n", + "28 Médecine - Chirurgie 513\n", + "29 Métiers 1207\n", + "30 Pharmacie 71\n", + "31 Philosophie 112\n", + "32 Physique - [Sciences physico-mathématiques] 296\n", + "33 Politique 26\n", + "34 Pêche 47\n", + "35 Religion 383\n", + "36 Spectacle 11\n", + "37 Superstition 25" + ] + }, + "execution_count": 209, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_x.groupby(['ensemble_domaine_enccre']).size().reset_index(name='counts')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "indonesian-reach", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "divine-winner", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
# Adds a column listing every ENCCRE domain set attached to an article.

def getDomaineEnccre2(volume, numero):
    """Return all ENCCRE domain sets of an article as a '|'-separated string.

    The article is looked up in the notebook-level ``df_correspondances``
    DataFrame (columns ``tome``, ``article`` and ``entreeid`` are read), then
    its JSON record is fetched from the ENCCRE web API and each entry of
    ``annotations.constit[0].domgen`` is passed to ``get_key``.

    Args:
        volume: Value compared against the ``tome`` column.
        numero: Value compared against the ``article`` column.

    Returns:
        str: The non-None ``get_key`` results joined with ``'|'``, in API
        order. An empty string when the article has no correspondence row or
        when the expected keys are missing from the lookup or the API record.

    Note:
        Network failures raised by ``urlopen`` are not caught and propagate
        to the caller, as before.
        NOTE(review): ``get_key`` is defined elsewhere in the notebook; it
        presumably maps a raw ENCCRE domain to its normalised domain-set
        label and returns None for unknown domains - confirm.
    """
    domaines = []

    try:
        rows = df_correspondances.loc[
            (df_correspondances['tome'] == volume)
            & (df_correspondances['article'] == numero)
        ]
        if rows.empty:
            # No ENCCRE entry is known for this (volume, numero) pair.
            return ""
        entreeid = rows['entreeid'].iloc[0]

        # The context manager closes the HTTP response even when reading or
        # JSON decoding fails; the connection used to be left open.
        with urlopen("http://enccre.academie-sciences.fr/icefront/api/article/" + entreeid) as response:
            data = json.loads(response.read())

        for dom in data['annotations']['constit'][0]['domgen']:
            # Call get_key once per domain and reuse the result.
            val = get_key(dom)
            if val is not None:
                domaines.append(val)

    except (KeyError, IndexError):
        # Missing column/key or empty 'constit' list: keep whatever was
        # collected so far (possibly nothing).
        pass

    return '|'.join(domaines)
la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>238</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>1980</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>112</td>\n", + " <td>Médailles</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>80</td>\n", + " <td>Histoire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume 
numero head normClass \\\n", + "0 1 5 A, a & a Grammaire \n", + "1 1 6 A unclassified \n", + "2 1 7 A unclassified \n", + "3 1 10 A, numismatique ou monétaire unclassified \n", + "4 1 11 A, lapidaire unclassified \n", + "\n", + " classEDdA author id_enccre \\\n", + "0 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", + "1 unclassified Dumarsais5 v1-1-1 \n", + "2 unclassified Dumarsais v1-1-2 \n", + "3 unclassified Mallet v1-1-5 \n", + "4 unclassified Mallet v1-1-6 \n", + "\n", + " domaine_enccre ensemble_domaine_enccre \\\n", + "0 grammaire Grammaire \n", + "1 grammaire Grammaire \n", + "2 grammaire Grammaire \n", + "3 numismatique Médailles \n", + "4 inscriptions Histoire \n", + "\n", + " content \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " contentWithoutClass \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " firstParagraph nb_word classification \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... 711 Grammaire \n", + "1 A, mot, est 1. la troisieme personne du présen... 238 Grammaire \n", + "2 A, préposition vient du latin à , à dextris, à ... 1980 Grammaire \n", + "3 A, numismatique ou monétaire, sur le revers de... 112 Médailles \n", + "4 A, lapidaire, dans les anciennes inscriptions ... 
80 Histoire \n", + "\n", + " class_is_true \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "filepath = '/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Classification domaines EDdA/results_classification/result_classification_sgdtfidf_21.11.24.csv'\n", + "df = pd.read_csv(filepath)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "enormous-longer", + "metadata": {}, + "outputs": [], + "source": [ + " df['ensembles_domaine_enccre'] = df.apply(lambda row: getDomaineEnccre2(row.volume, row.numero), axis=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "incorporated-commons", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " <th>classification</th>\n", + " <th>class_is_true</th>\n", + " <th>ensembles_domaine_enccre</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. 
Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>711</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>238</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>1980</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers 
de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>112</td>\n", + " <td>Médailles</td>\n", + " <td>True</td>\n", + " <td>Médailles|Monnaie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>80</td>\n", + " <td>Histoire</td>\n", + " <td>True</td>\n", + " <td>Histoire</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head normClass \\\n", + "0 1 5 A, a & a Grammaire \n", + "1 1 6 A unclassified \n", + "2 1 7 A unclassified \n", + "3 1 10 A, numismatique ou monétaire unclassified \n", + "4 1 11 A, lapidaire unclassified \n", + "\n", + " classEDdA author id_enccre \\\n", + "0 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", + "1 unclassified Dumarsais5 v1-1-1 \n", + "2 unclassified Dumarsais v1-1-2 \n", + "3 unclassified Mallet v1-1-5 \n", + "4 unclassified Mallet v1-1-6 \n", + "\n", + " domaine_enccre ensemble_domaine_enccre \\\n", + "0 grammaire Grammaire \n", + "1 grammaire Grammaire \n", + "2 grammaire Grammaire \n", + "3 numismatique Médailles \n", + "4 inscriptions Histoire \n", + "\n", + " content \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " contentWithoutClass \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... 
\n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " firstParagraph nb_word classification \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... 711 Grammaire \n", + "1 A, mot, est 1. la troisieme personne du présen... 238 Grammaire \n", + "2 A, préposition vient du latin à , à dextris, à ... 1980 Grammaire \n", + "3 A, numismatique ou monétaire, sur le revers de... 112 Médailles \n", + "4 A, lapidaire, dans les anciennes inscriptions ... 80 Histoire \n", + "\n", + " class_is_true ensembles_domaine_enccre \n", + "0 True Grammaire \n", + "1 True Grammaire \n", + "2 True Grammaire \n", + "3 True Médailles|Monnaie \n", + "4 True Histoire " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "pleasant-throat", + "metadata": {}, + "outputs": [], + "source": [ + "# enregistrement du dataframe dans un fichier tsv\n", + "df.to_csv('/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Classification domaines EDdA/results_classification/result_classification_sgdtfidf_21.11.25.csv',index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "small-shore", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " 
<th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " <th>classification</th>\n", + " <th>class_is_true</th>\n", + " <th>ensembles_domaine_enccre</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>711</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. 
la troisieme personne du présen...</td>\n", + " <td>238</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>1980</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " <td>Grammaire</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>112</td>\n", + " <td>Médailles</td>\n", + " <td>True</td>\n", + " <td>Médailles|Monnaie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>80</td>\n", + " <td>Histoire</td>\n", + " <td>True</td>\n", + " <td>Histoire</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero head 
normClass \\\n", + "0 1 5 A, a & a Grammaire \n", + "1 1 6 A unclassified \n", + "2 1 7 A unclassified \n", + "3 1 10 A, numismatique ou monétaire unclassified \n", + "4 1 11 A, lapidaire unclassified \n", + "\n", + " classEDdA author id_enccre \\\n", + "0 ordre Encyclopéd. Entend. Science de l'homme, ... Dumarsais5 v1-1-0 \n", + "1 unclassified Dumarsais5 v1-1-1 \n", + "2 unclassified Dumarsais v1-1-2 \n", + "3 unclassified Mallet v1-1-5 \n", + "4 unclassified Mallet v1-1-6 \n", + "\n", + " domaine_enccre ensemble_domaine_enccre \\\n", + "0 grammaire Grammaire \n", + "1 grammaire Grammaire \n", + "2 grammaire Grammaire \n", + "3 numismatique Médailles \n", + "4 inscriptions Histoire \n", + "\n", + " content \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " contentWithoutClass \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... \n", + "1 A, mot, est 1. la troisieme personne du présen... \n", + "2 A, préposition vient du latin à , à dextris, à ... \n", + "3 A, numismatique ou monétaire, sur le revers de... \n", + "4 A, lapidaire, dans les anciennes inscriptions ... \n", + "\n", + " firstParagraph nb_word classification \\\n", + "0 A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie... 711 Grammaire \n", + "1 A, mot, est 1. la troisieme personne du présen... 238 Grammaire \n", + "2 A, préposition vient du latin à , à dextris, à ... 1980 Grammaire \n", + "3 A, numismatique ou monétaire, sur le revers de... 112 Médailles \n", + "4 A, lapidaire, dans les anciennes inscriptions ... 
80 Histoire \n", + "\n", + " class_is_true ensembles_domaine_enccre \n", + "0 True Grammaire \n", + "1 True Grammaire \n", + "2 True Grammaire \n", + "3 True Médailles|Monnaie \n", + "4 True Histoire " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acute-basketball", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "verified-compression", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/EDdA_Classification_BertFineTuning.ipynb b/notebooks/EDdA_Classification_BertFineTuning.ipynb new file mode 100644 index 0000000..dc0830e --- /dev/null +++ b/notebooks/EDdA_Classification_BertFineTuning.ipynb @@ -0,0 +1,4421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "4YCMlsNwOWs0" + }, + "source": [ + "# BERT fine-tuning for EDdA classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pz9VDIXUON97" + }, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ouU5usvXg4PA" + }, + "outputs": [], + "source": [ + "train_path = 'training_set.tsv'\n", + "validation_path = 'validation_set.tsv'\n", + "test_path = 'test_set.tsv'\n", + "\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "\n", + "minOfInstancePerClass = 0\n", + "maxOfInstancePerClass = 10000\n", + "\n", + 
"#model_chosen = \"bert\"\n", + "model_chosen = \"camembert\"\n", + "\n", + "batch_size = 8 # 16 or 32 recommended\n", + "max_len = 512" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6xdYI9moOQSv" + }, + "source": [ + "## Setup colab environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WF0qFN_g3ekz", + "outputId": "445ffd96-843b-4ff1-a24d-c110964a63e4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your runtime has 27.3 gigabytes of available RAM\n", + "\n", + "You are using a high-RAM runtime!\n" + ] + } + ], + "source": [ + "from psutil import virtual_memory\n", + "ram_gb = virtual_memory().total / 1e9\n", + "print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))\n", + "\n", + "if ram_gb < 20:\n", + " print('Not using a high-RAM runtime')\n", + "else:\n", + " print('You are using a high-RAM runtime!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vL0S-s9Uofvn", + "outputId": "415b7bf1-d3fd-42b6-ee03-13601c953a4f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8hzEGHl7gmzk" + }, + "source": [ + "## Setup GPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dPOU-Efhf4ui", + "outputId": "fc873e0c-1254-4928-c8e9-e3eb093acc64" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 1 GPU(s) available.\n", + "We will use the GPU: Tesla P100-PCIE-16GB\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# If there's 
a GPU available...\n", + "if torch.cuda.is_available(): \n", + "\n", + " # Tell PyTorch to use the GPU. \n", + " device = torch.device(\"cuda\")\n", + "\n", + " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", + "\n", + " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", + "\n", + "# If not...\n", + "else:\n", + " print('No GPU available, using the CPU instead.')\n", + " device = torch.device(\"cpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Jr-S9yYIgGkA" + }, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pwmZ5bBvgGNh", + "outputId": "e92404c6-af38-4bd8-8c99-20ec6b545b3f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting transformers==4.10.3\n", + " Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)\n", + "\u001b[K |████████████████████████████████| 2.8 MB 5.0 MB/s \n", + "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", + " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", + "\u001b[K |████████████████████████████████| 3.3 MB 38.8 MB/s \n", + "\u001b[?25hCollecting pyyaml>=5.1\n", + " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", + "\u001b[K |████████████████████████████████| 596 kB 58.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2019.12.20)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.62.3)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2.23.0)\n", + "Collecting huggingface-hub>=0.0.12\n", + " 
Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 486 kB/s \n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (3.4.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (1.19.5)\n", + "Collecting sacremoses\n", + " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", + "\u001b[K |████████████████████████████████| 895 kB 43.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (21.3)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.8.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers==4.10.3) (3.10.0.2)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.10.3) (3.0.6)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.10.3) (3.6.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (3.0.4)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from 
sacremoses->transformers==4.10.3) (7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.15.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.1.0)\n", + "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", + " Attempting uninstall: pyyaml\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + "Successfully installed huggingface-hub-0.2.1 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.10.3\n", + "Collecting sentencepiece\n", + " Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", + "\u001b[K |████████████████████████████████| 1.2 MB 5.1 MB/s \n", + "\u001b[?25hInstalling collected packages: sentencepiece\n", + "Successfully installed sentencepiece-0.1.96\n" + ] + } + ], + "source": [ + "!pip install transformers==4.10.3\n", + "!pip install sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wSqbrupGMc1M" + }, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SkErnwgMMbRj" + }, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import numpy as np\n", + "import csv\n", + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import *\n", + "\n", + "from transformers import BertTokenizer, CamembertTokenizer, BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification\n", + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", + "from transformers import get_linear_schedule_with_warmup\n", + "\n", + "import time\n", + "import datetime\n", + "\n", + "import random\n", + "\n", + 
"import matplotlib.pyplot as plt\n", + "from sklearn.metrics import plot_confusion_matrix\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import classification_report\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "12SA-qPFgsVo" + }, + "source": [ + "## Utils functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WkIVcabUgxIl" + }, + "outputs": [], + "source": [ + "def create_dict(df, classColumnName):\n", + " return dict(df[classColumnName].value_counts())\n", + "\n", + "\n", + "def remove_weak_classes(df, classColumnName, threshold):\n", + " dictOfClassInstances = create_dict(df,classColumnName)\n", + " dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }\n", + " keys = [*dictionary]\n", + " df_tmp = df[~ df[classColumnName].isin(keys)]\n", + " df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)\n", + " return df\n", + "\n", + "\n", + "def resample_classes(df, classColumnName, numberOfInstances):\n", + " #random numberOfInstances elements\n", + " replace = False # with replacement\n", + " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", + " return df.groupby(classColumnName, as_index=False).apply(fn)\n", + " \n", + "\n", + "# Function to calculate the accuracy of our predictions vs labels\n", + "def flat_accuracy(preds, labels):\n", + " pred_flat = np.argmax(preds, axis=1).flatten()\n", + " labels_flat = labels.flatten()\n", + " return np.sum(pred_flat == labels_flat) / len(labels_flat) \n", + "\n", + "\n", + "def format_time(elapsed):\n", + " '''\n", + " Takes a time in seconds and returns a string hh:mm:ss\n", + " '''\n", + " # Round to the nearest second.\n", + " elapsed_rounded = int(round((elapsed)))\n", + "\n", + " # Format as hh:mm:ss\n", + " return str(datetime.timedelta(seconds=elapsed_rounded))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "c5QKcXulhNJ-" + }, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jdCdUVOTZrqh" + }, + "outputs": [], + "source": [ + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9d1IxD_bLEvp" + }, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5u1acjunhoxe" + }, + "outputs": [], + "source": [ + "df_train = pd.read_csv(train_path, sep=\"\\t\")\n", + "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n", + "\n", + "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n", + "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n", + "\n", + "#df_train = remove_weak_classes(df, columnClass, minOfInstancePerClass)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zj3JDoJNfx1f", + "outputId": "59262e3f-5fe0-49f5-bb55-8586653498ab" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(30650, 13)\n", + "(10947, 13)\n" + ] + } + ], + "source": [ + "print(df_train.shape)\n", + "print(df_validation.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zrjZvs2dhzAy" + }, + "outputs": [], + "source": [ + "y_train = df_train[columnClass]\n", + "y_validation = df_validation[columnClass]\n", + "numberOfClasses = y_train.nunique()\n", + "\n", + "encoder = preprocessing.LabelEncoder()\n", + "\n", + "y_train = encoder.fit_transform(y_train)\n", + "y_validation = 
encoder.fit_transform(y_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u9AxxaA_h1CM" + }, + "outputs": [], + "source": [ + "#train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xt_PhH_6h1_3" + }, + "outputs": [], + "source": [ + "sentences_train = df_train[columnText].values\n", + "labels_train = y_train.tolist()\n", + "\n", + "sentences_validation = df_validation[columnText].values\n", + "labels_validation = y_validation.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Dq_KF5WAsbpC", + "outputId": "ba91b953-abcb-4bed-a5c5-9e429e68239a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([\"\\nESTAMPEUR, s. m. en , est une\\nsorte de pilon de bois, surmonté d'un manche d'environ \\ndeux piés & demi. On s'en sert pour estamper\\nles formes où l'on veut faire des vergeoises. Voyez\\nVergeoise & Estamper.\\n\",\n", + " \"\\nOn doit ébourgeonner les vignes, alors ce mot doit\\ns'entendre autrement que pour les arbres fruitiers:\\non ébourgeonne les vignes. non-seulement quand on\\nsupprime les bourgeons surnuméraires, mais encore\\nquand on arrête par-en-haut les bourgeons. Il en est\\nde même quand on détache en cassant les faux bourgeons \\nqui poussent d'ordinaire à chaque noeud à \\ncôté des yeux, à commencer par le bas. (K)\\n\",\n", + " \"\\nBois mort en pié, s'il est pourri sur pié, sans\\nsubstance, & bon seulement à brûler.\\n\",\n", + " ...,\n", + " \"\\nIl y a une hydatoscopie naturelle & permise ; elle\\nconsiste à prévoir & à prédire les orages & les tempêtes \\nsur certains signes qu'on remarque dans la mer,\\ndans l'air, & dans les nuages. Voyez Tems & Ouragans. Dict. de Trévoux.\\n\",\n", + " \"\\nMÉTÉOROMANCIE, s.f. 
() divination par\\nles météores ; & comme les météores ignés sont ceux\\nqui jettent le plus de crainte parmi les hommes, la\\nmétéoromancie désigne proprement la divination par\\nle tonnerre & les éclairs. Cette espece de divination\\npassa des Toscans aux Romains, sons rien perdre de\\nce qu'elle avoit de frivole. Seneque nous apprend\\nque deux auteurs graves, & qui avoient exercé des\\n\\nmagistratures, écrivoient à Rome sur cette matiere.\\nIl semble même que l'un d'eux l'épuisa entierement,\\ncar il donnoit une liste exacte des différentes especes\\nde tonnerres. Il circonstancioit & leurs noms & les\\nprognostics qui s'en pouvoient tirer ; le tout avec un\\nair de confiance plus surprenant encore que les choses\\nqu'il rapportoit. On eût dit, tant cette matiere météorologique lui étoit familiere, qu'il comptoit les tableaux \\nde sa galerie, ou qu'il faisoit la description\\ndes fleurs de son jardin. La plus ancienne maladie,\\nla plus invétérée, la plus incurable du genre humain,\\nc'est l'envie de connoître ce qui doit arriver.\\nNi le voile obscur qui nous cache notre destinée, ni\\nl'expérience journaliere, ni une infinité de tentatives \\nmalheureuses, n'ont pû guerir les hommes. Hé!\\nse dépréviennent-ils jamais d'une erreur agréablement \\nreçue? Nous sommes sur ce point aussi crédules\\nque nos ancêtres ; nous prêtons comme eux l'oreille\\nà toutes les impostures flatteuses. Pour avoir trompé\\ncent fois, elles n'ont point perdu le droit funeste de\\ntromper encore. (D. J.)\\n\",\n", + " \"\\nPENTACLE, s. m. () c'est le nom que la\\nmagie des exorcismes donne à un sceau imprimé ou\\nsur du parchemin vierge fait de peau de bouc, ou\\nsur quelque métal, or, argent, cuivre, étain, plomb,\\n&c. On ne peut faire aucune opération magique pour\\nexorciser les esprits, sans avoir ce sceau qui contient\\nles noms de Dieu. 
Le pentacle se fait en renfermant\\nun triangle dans deux cercles : on lit dans ce triangle \\nces trois mots ; formatio, reformatio, transformatio. A côté du triangle est le mot agla, qui est très puissant \\npour arrêter la malice des esprits. Il faut que\\nla peau sur laquelle on applique le sceau soit exorcisée \\n& bénite. On exorcise aussi l'encre & la plume,\\ndont on se sert pour écrire les mots dont on vient de\\nparler. Après cela on encense le pentacle ; on l'enferme \\ntrois jours & trois nuits dans un vase bien net ;\\nenfin, on le met dans un linge ou dans un livre que\\nl'on parfume & que l'on exorcise. Voilà les fadaises\\nqu'on lit dans le livre intitulé Encheiridion Leonis papae, ouvrage misérable, qui n'a servi qu'à gâter davantage \\nles esprits crédules & portés à la superstitition.\\n(D. J.)\\n\"],\n", + " dtype=object)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentences_train" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gs4Agx_5h43M" + }, + "source": [ + "# Model\n", + "## Tokenisation & Input Formatting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YZ5PhEYZiCEA" + }, + "outputs": [], + "source": [ + "if model_chosen == \"bert\":\n", + " tokeniser_bert = 'bert-base-multilingual-cased'\n", + " model_bert = \"bert-base-multilingual-cased\"\n", + "elif model_chosen == \"camembert\":\n", + " tokeniser_bert = 'camembert-base'\n", + " model_bert = 'camembert-base'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 131, + "referenced_widgets": [ + "06c6e7721b68449a9f3619ffdf18dfeb", + "5ec6a851b16c4339b51acb6129935f13", + "fd39a852133144e2b4aed474b204451f", + "0143df420df444e9aac5c8b39c342021", + "c61b6474b55948cb91a598e6b9aa10d2", + "a0d9ceaa8d3a4876ae65d877687bcf50", + "aa6ea92757df47eda1e41603cb109e79", + 
"41558bfcc0464711916c2d96337bef66", + "fdf05cea504c42f793f9c06e58ef995b", + "044fc1f96f8347ddb4a79d31edf32174", + "cf0d3320e06546789b5d5a2021dbc3ad", + "fba1d1d5c83b40659295a3457d74cb4e", + "f7224a1b831d459594852eece9f05543", + "185ae5ef7be646b797467086ad7d3a82", + "3ceaa994a3814d3c85e2051e37397342", + "e674e279b13b41fda3df3a6c89f5fcb1", + "3203783f58e54b0e856ab84503bf0d3c", + "0214f74b229a4232a9edf3cab751b90d", + "152afcb9245c416fae0fde257fa25e2e", + "fb3a174c597b47c7a527517004ba5f54", + "75073a0f673345728871dfb0346e7c1b", + "db8c94b4ed724f859d1ae8c153b01110", + "6a29c1c28ceb415f91ec55512da981c5", + "5879fadf430646f6af41b1a9b14864ff", + "340241453dab4db88043d372aaa88c2e", + "27e18e1fa3884c0fb0339764e0397990", + "2af1124092684f8bafab311cbe9bf22c", + "95a3332ba4634d1c930a7021eacce230", + "d53488432f8544de863210d9e8ee4e48", + "4422e64029184ba4ba30eecfdf2b4306", + "1d97e83c703f4071b9176ba7bf57cddf", + "17bf94188b844f649642d9c6e6a20373", + "d3aaecd7a6e34cc8918a689ac6299746" + ] + }, + "id": "C4bigx_3ibuN", + "outputId": "b8cef3f8-7a6c-47d1-9d37-7b3b6d08f00b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading CamemBERT tokenizer...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "06c6e7721b68449a9f3619ffdf18dfeb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/811k [00:00<?, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fba1d1d5c83b40659295a3457d74cb4e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/1.40M [00:00<?, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6a29c1c28ceb415f91ec55512da981c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/508 
[00:00<?, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Load the BERT tokenizer.\n", + "if model_chosen == \"bert\":\n", + " print('Loading BERT tokenizer...')\n", + " tokenizer = BertTokenizer.from_pretrained(tokeniser_bert)\n", + "elif model_chosen == \"camembert\":\n", + " print('Loading CamemBERT tokenizer...')\n", + " tokenizer = CamembertTokenizer.from_pretrained(tokeniser_bert)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5hNod5X9jDZN", + "outputId": "93b6e633-afb7-4bcc-be00-44388f801d64" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (1263 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ], + "source": [ + " # Tokenize all of the sentences and map the tokens to their word IDs.\n", + "input_ids_train = []\n", + "\n", + "# For every sentence...\n", + "for sent in sentences_train:\n", + " # `encode` will:\n", + " # (1) Tokenize the sentence.\n", + " # (2) Prepend the `[CLS]` token to the start.\n", + " # (3) Append the `[SEP]` token to the end.\n", + " # (4) Map tokens to their IDs.\n", + " encoded_sent_train = tokenizer.encode(\n", + " str(sent), # Sentence to encode.\n", + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", + "\n", + " # This function also supports truncation and conversion\n", + " # to pytorch tensors, but I need to do padding, so I\n", + " # can't use these features.\n", + " #max_length = 128, # Truncate all sentences.\n", + " #return_tensors = 'pt', # Return pytorch tensors.\n", + " )\n", + " \n", + " # Add the encoded sentence to the list.\n", + " input_ids_train.append(encoded_sent_train)\n", + "\n", + "input_ids_validation = []\n", + "for sent in sentences_validation:\n", + " # `encode` will:\n", + " # 
(1) Tokenize the sentence.\n", + " # (2) Prepend the `[CLS]` token to the start.\n", + " # (3) Append the `[SEP]` token to the end.\n", + " # (4) Map tokens to their IDs.\n", + " encoded_sent_validation = tokenizer.encode(\n", + " str(sent), # Sentence to encode.\n", + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", + "\n", + " # This function also supports truncation and conversion\n", + " # to pytorch tensors, but I need to do padding, so I\n", + " # can't use these features.\n", + " #max_length = 128, # Truncate all sentences.\n", + " #return_tensors = 'pt', # Return pytorch tensors.\n", + " )\n", + " \n", + " # Add the encoded sentence to the list.\n", + " input_ids_validation.append(encoded_sent_validation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W9EWv5JvjGH3", + "outputId": "32cd417d-9a40-4086-d900-b81982407667" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max sentence length train: 2253\n", + "Max sentence length validation: 3067\n" + ] + } + ], + "source": [ + "print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))\n", + "print('Max sentence length validation: ', max([len(sen) for sen in input_ids_validation])) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xh1TQJyvjOx5" + }, + "outputs": [], + "source": [ + "\n", + "padded_train = []\n", + "for i in input_ids_train:\n", + "\n", + " if len(i) > max_len:\n", + " padded_train.extend([i[:max_len]])\n", + " else:\n", + " padded_train.extend([i + [0] * (max_len - len(i))])\n", + "\n", + "\n", + "padded_train = input_ids_train = np.array(padded_train)\n", + "\n", + "\n", + "padded_validation = []\n", + "for i in input_ids_validation:\n", + "\n", + " if len(i) > max_len:\n", + " padded_validation.extend([i[:max_len]])\n", + " else:\n", + " padded_validation.extend([i + [0] * (max_len - 
len(i))])\n", + "\n", + "\n", + "padded_validation = input_ids_train = np.array(padded_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZiwY6gn0jUkD" + }, + "outputs": [], + "source": [ + " # Create attention masks\n", + "attention_masks_train = []\n", + "\n", + "# For each sentence...\n", + "for sent in padded_train:\n", + " \n", + " # Create the attention mask.\n", + " # - If a token ID is 0, then it's padding, set the mask to 0.\n", + " # - If a token ID is > 0, then it's a real token, set the mask to 1.\n", + " att_mask = [int(token_id > 0) for token_id in sent]\n", + " \n", + " # Store the attention mask for this sentence.\n", + " attention_masks_train.append(att_mask)\n", + "\n", + "\n", + "attention_masks_validation = []\n", + "\n", + "# For each sentence...\n", + "for sent in padded_validation:\n", + " \n", + " # Create the attention mask.\n", + " # - If a token ID is 0, then it's padding, set the mask to 0.\n", + " # - If a token ID is > 0, then it's a real token, set the mask to 1.\n", + " att_mask = [int(token_id > 0) for token_id in sent]\n", + " \n", + " # Store the attention mask for this sentence.\n", + " attention_masks_validation.append(att_mask)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oBTR5AfAjXJe" + }, + "outputs": [], + "source": [ + "# Use 70% for training and 30% for validation.\n", + "#train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, \n", + "# random_state=2018, test_size=0.3, stratify = labels)\n", + "# Do the same for the masks.\n", + "#train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,\n", + "# random_state=2018, test_size=0.3, stratify = labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b9Mw5kq3jhTb" + }, + "outputs": [], + "source": [ + "# Convert all inputs and labels into torch tensors, the required datatype 
\n", + "# for my model.\n", + "train_inputs = torch.tensor(padded_train)\n", + "validation_inputs = torch.tensor(padded_validation)\n", + "\n", + "train_labels = torch.tensor(labels_train)\n", + "validation_labels = torch.tensor(labels_validation)\n", + "\n", + "train_masks = torch.tensor(attention_masks_train)\n", + "validation_masks = torch.tensor(attention_masks_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UfFWzbENjnkw" + }, + "outputs": [], + "source": [ + "# The DataLoader needs to know the batch size for training, so I specify it here.\n", + "# For fine-tuning BERT on a specific task, the authors recommend a batch size of\n", + "# 16 or 32.\n", + "\n", + "# Create the DataLoader for training set.\n", + "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", + "train_sampler = RandomSampler(train_data)\n", + "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n", + "\n", + "# Create the DataLoader for validation set.\n", + "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n", + "validation_sampler = SequentialSampler(validation_data)\n", + "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x45JNGqhkUn2" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "4873cc6c9e1d493c9a67d6536e4367a6", + "12aa3280d3284c07ac12e2fe842b40b0", + "1bcdb04d16dd4f9e9d86938e1d2def02", + "b5f86071b23c40bf9c96f74c613c2729", + "27a20a17123744948e0c1dbf49b51b27", + "f470af786c1c4d049de4f0a7f373379f", + "00bd66a81aad4cd7a10df4a67b52b14e", + "a5efb634a95c42a7abfaaf61e1c2c928", + "600e627de1f0403595f701381dc3b164", + "f3b7527bd4d04c81936d8392decee3ac", + 
"885f91c34b9c422889df8b556aad8ec0" + ] + }, + "id": "C7M2Er1ajsTf", + "outputId": "2c3f467d-ab09-4f8f-d464-a4e738333587" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4873cc6c9e1d493c9a67d6536e4367a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/445M [00:00<?, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n", + "- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "data": { + "text/plain": [ + "CamembertForSequenceClassification(\n", + " (roberta): RobertaModel(\n", + " (embeddings): RobertaEmbeddings(\n", + " (word_embeddings): Embedding(32005, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): RobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (1): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (2): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (3): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (4): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (5): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (6): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (7): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (8): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (9): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (10): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, 
bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (11): RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (classifier): RobertaClassificationHead(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (out_proj): Linear(in_features=768, out_features=38, bias=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load BertForSequenceClassification, the pretrained BERT model with a single \n", + "# linear classification layer on top.\n", + "\n", + "#model = CamembertForSequenceClassification.from_pretrained(\n", + "if model_chosen == \"bert\":\n", + " model = BertForSequenceClassification.from_pretrained(\n", + " model_bert, # Use the 
12-layer BERT model, with an uncased vocab.\n", + " num_labels = numberOfClasses, # The number of output labels--2 for binary classification.\n", + " # You can increase this for multi-class tasks. \n", + " output_attentions = False, # Whether the model returns attentions weights.\n", + " output_hidden_states = False, # Whether the model returns all hidden-states.\n", + " )\n", + "elif model_chosen == \"camembert\":\n", + " model = CamembertForSequenceClassification.from_pretrained(\n", + " model_bert, # Use the 12-layer BERT model, with an uncased vocab.\n", + " num_labels = numberOfClasses, # The number of output labels--2 for binary classification.\n", + " # You can increase this for multi-class tasks. \n", + " output_attentions = False, # Whether the model returns attentions weights.\n", + " output_hidden_states = False, # Whether the model returns all hidden-states.\n", + " )\n", + "\n", + "# Tell pytorch to run this model on the GPU.\n", + "model.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xd_cG-8pj4Iw" + }, + "outputs": [], + "source": [ + "#Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n", + "# I believe the 'W' stands for 'Weight Decay fix\"\n", + "optimizer = AdamW(model.parameters(),\n", + " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", + " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "65G-uHuLj4_6" + }, + "outputs": [], + "source": [ + "# Number of training epochs (authors recommend between 2 and 4)\n", + "epochs = 4\n", + "\n", + "# Total number of training steps is number of batches * number of epochs.\n", + "total_steps = len(train_dataloader) * epochs\n", + "\n", + "# Create the learning rate scheduler.\n", + "scheduler = get_linear_schedule_with_warmup(optimizer, \n", + " num_warmup_steps = 0, # Default value in run_glue.py\n", + " 
num_training_steps = total_steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/" + }, + "id": "SbHBbYpwkKaA", + "outputId": "49f7f5f4-716d-44c2-e299-505086a89061" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======== Epoch 1 / 4 ========\n", + "Training...\n", + " Batch 40 of 2,642. Elapsed: 0:00:18.\n", + " Batch 80 of 2,642. Elapsed: 0:00:36.\n", + " Batch 120 of 2,642. Elapsed: 0:00:55.\n", + " Batch 160 of 2,642. Elapsed: 0:01:13.\n", + " Batch 200 of 2,642. Elapsed: 0:01:31.\n", + " Batch 240 of 2,642. Elapsed: 0:01:49.\n", + " Batch 280 of 2,642. Elapsed: 0:02:08.\n", + " Batch 320 of 2,642. Elapsed: 0:02:26.\n", + " Batch 360 of 2,642. Elapsed: 0:02:44.\n", + " Batch 400 of 2,642. Elapsed: 0:03:02.\n", + " Batch 440 of 2,642. Elapsed: 0:03:20.\n", + " Batch 480 of 2,642. Elapsed: 0:03:39.\n", + " Batch 520 of 2,642. Elapsed: 0:03:57.\n", + " Batch 560 of 2,642. Elapsed: 0:04:15.\n", + " Batch 600 of 2,642. Elapsed: 0:04:33.\n", + " Batch 640 of 2,642. Elapsed: 0:04:51.\n", + " Batch 680 of 2,642. Elapsed: 0:05:10.\n", + " Batch 720 of 2,642. Elapsed: 0:05:28.\n", + " Batch 760 of 2,642. Elapsed: 0:05:46.\n", + " Batch 800 of 2,642. Elapsed: 0:06:04.\n", + " Batch 840 of 2,642. Elapsed: 0:06:22.\n", + " Batch 880 of 2,642. Elapsed: 0:06:41.\n", + " Batch 920 of 2,642. Elapsed: 0:06:59.\n", + " Batch 960 of 2,642. Elapsed: 0:07:17.\n", + " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", + " Batch 1,040 of 2,642. Elapsed: 0:07:54.\n", + " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", + " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", + " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", + " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", + " Batch 1,240 of 2,642. Elapsed: 0:09:25.\n", + " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", + " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", + " Batch 1,360 of 2,642. 
Elapsed: 0:10:19.\n", + " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", + " Batch 1,440 of 2,642. Elapsed: 0:10:56.\n", + " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", + " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", + " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", + " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", + " Batch 1,640 of 2,642. Elapsed: 0:12:27.\n", + " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", + " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", + " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", + " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", + " Batch 1,840 of 2,642. Elapsed: 0:13:58.\n", + " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", + " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", + " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", + " Batch 2,000 of 2,642. Elapsed: 0:15:11.\n", + " Batch 2,040 of 2,642. Elapsed: 0:15:29.\n", + " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", + " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", + " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", + " Batch 2,200 of 2,642. Elapsed: 0:16:42.\n", + " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", + " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", + " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", + " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", + " Batch 2,400 of 2,642. Elapsed: 0:18:13.\n", + " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", + " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", + " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", + " Batch 2,560 of 2,642. Elapsed: 0:19:26.\n", + " Batch 2,600 of 2,642. Elapsed: 0:19:44.\n", + " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", + "\n", + " Average training loss: 2.04\n", + " Training epoch took: 0:20:03\n", + "\n", + "Running Validation...\n", + " Accuracy: 0.75\n", + " Validation took: 0:03:09\n", + "\n", + "======== Epoch 2 / 4 ========\n", + "Training...\n", + " Batch 40 of 2,642. Elapsed: 0:00:18.\n", + " Batch 80 of 2,642. Elapsed: 0:00:36.\n", + " Batch 120 of 2,642. Elapsed: 0:00:55.\n", + " Batch 160 of 2,642. 
Elapsed: 0:01:13.\n", + " Batch 200 of 2,642. Elapsed: 0:01:31.\n", + " Batch 240 of 2,642. Elapsed: 0:01:49.\n", + " Batch 280 of 2,642. Elapsed: 0:02:07.\n", + " Batch 320 of 2,642. Elapsed: 0:02:26.\n", + " Batch 360 of 2,642. Elapsed: 0:02:44.\n", + " Batch 400 of 2,642. Elapsed: 0:03:02.\n", + " Batch 440 of 2,642. Elapsed: 0:03:20.\n", + " Batch 480 of 2,642. Elapsed: 0:03:38.\n", + " Batch 520 of 2,642. Elapsed: 0:03:57.\n", + " Batch 560 of 2,642. Elapsed: 0:04:15.\n", + " Batch 600 of 2,642. Elapsed: 0:04:33.\n", + " Batch 640 of 2,642. Elapsed: 0:04:51.\n", + " Batch 680 of 2,642. Elapsed: 0:05:10.\n", + " Batch 720 of 2,642. Elapsed: 0:05:28.\n", + " Batch 760 of 2,642. Elapsed: 0:05:46.\n", + " Batch 800 of 2,642. Elapsed: 0:06:04.\n", + " Batch 840 of 2,642. Elapsed: 0:06:22.\n", + " Batch 880 of 2,642. Elapsed: 0:06:41.\n", + " Batch 920 of 2,642. Elapsed: 0:06:59.\n", + " Batch 960 of 2,642. Elapsed: 0:07:17.\n", + " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", + " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", + " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", + " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", + " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", + " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", + " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", + " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", + " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", + " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", + " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", + " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", + " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", + " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", + " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", + " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", + " Batch 1,640 of 2,642. Elapsed: 0:12:27.\n", + " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", + " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", + " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", + " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", + " Batch 1,840 of 2,642. 
Elapsed: 0:13:58.\n", + " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", + " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", + " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", + " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", + " Batch 2,040 of 2,642. Elapsed: 0:15:29.\n", + " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", + " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", + " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", + " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", + " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", + " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", + " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", + " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", + " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", + " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", + " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", + " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", + " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", + " Batch 2,600 of 2,642. Elapsed: 0:19:44.\n", + " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", + "\n", + " Average training loss: 1.03\n", + " Training epoch took: 0:20:02\n", + "\n", + "Running Validation...\n", + " Accuracy: 0.79\n", + " Validation took: 0:03:09\n", + "\n", + "======== Epoch 3 / 4 ========\n", + "Training...\n", + " Batch 40 of 2,642. Elapsed: 0:00:18.\n", + " Batch 80 of 2,642. Elapsed: 0:00:36.\n", + " Batch 120 of 2,642. Elapsed: 0:00:55.\n", + " Batch 160 of 2,642. Elapsed: 0:01:13.\n", + " Batch 200 of 2,642. Elapsed: 0:01:31.\n", + " Batch 240 of 2,642. Elapsed: 0:01:49.\n", + " Batch 280 of 2,642. Elapsed: 0:02:07.\n", + " Batch 320 of 2,642. Elapsed: 0:02:26.\n", + " Batch 360 of 2,642. Elapsed: 0:02:44.\n", + " Batch 400 of 2,642. Elapsed: 0:03:02.\n", + " Batch 440 of 2,642. Elapsed: 0:03:20.\n", + " Batch 480 of 2,642. Elapsed: 0:03:38.\n", + " Batch 520 of 2,642. Elapsed: 0:03:57.\n", + " Batch 560 of 2,642. Elapsed: 0:04:15.\n", + " Batch 600 of 2,642. Elapsed: 0:04:33.\n", + " Batch 640 of 2,642. Elapsed: 0:04:51.\n", + " Batch 680 of 2,642. 
Elapsed: 0:05:09.\n", + " Batch 720 of 2,642. Elapsed: 0:05:28.\n", + " Batch 760 of 2,642. Elapsed: 0:05:46.\n", + " Batch 800 of 2,642. Elapsed: 0:06:04.\n", + " Batch 840 of 2,642. Elapsed: 0:06:22.\n", + " Batch 880 of 2,642. Elapsed: 0:06:41.\n", + " Batch 920 of 2,642. Elapsed: 0:06:59.\n", + " Batch 960 of 2,642. Elapsed: 0:07:17.\n", + " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", + " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", + " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", + " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", + " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", + " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", + " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", + " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", + " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", + " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", + " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", + " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", + " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", + " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", + " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", + " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", + " Batch 1,640 of 2,642. Elapsed: 0:12:26.\n", + " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", + " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", + " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", + " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", + " Batch 1,840 of 2,642. Elapsed: 0:13:57.\n", + " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", + " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", + " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", + " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", + " Batch 2,040 of 2,642. Elapsed: 0:15:28.\n", + " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", + " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", + " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", + " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", + " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", + " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", + " Batch 2,320 of 2,642. 
Elapsed: 0:17:36.\n", + " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", + " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", + " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", + " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", + " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", + " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", + " Batch 2,600 of 2,642. Elapsed: 0:19:43.\n", + " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", + "\n", + " Average training loss: 0.75\n", + " Training epoch took: 0:20:02\n", + "\n", + "Running Validation...\n", + " Accuracy: 0.79\n", + " Validation took: 0:03:09\n", + "\n", + "======== Epoch 4 / 4 ========\n", + "Training...\n", + " Batch 40 of 2,642. Elapsed: 0:00:18.\n", + " Batch 80 of 2,642. Elapsed: 0:00:36.\n", + " Batch 120 of 2,642. Elapsed: 0:00:55.\n", + " Batch 160 of 2,642. Elapsed: 0:01:13.\n", + " Batch 200 of 2,642. Elapsed: 0:01:31.\n", + " Batch 240 of 2,642. Elapsed: 0:01:49.\n", + " Batch 280 of 2,642. Elapsed: 0:02:07.\n", + " Batch 320 of 2,642. Elapsed: 0:02:26.\n", + " Batch 360 of 2,642. Elapsed: 0:02:44.\n", + " Batch 400 of 2,642. Elapsed: 0:03:02.\n", + " Batch 440 of 2,642. Elapsed: 0:03:20.\n", + " Batch 480 of 2,642. Elapsed: 0:03:39.\n", + " Batch 520 of 2,642. Elapsed: 0:03:57.\n", + " Batch 560 of 2,642. Elapsed: 0:04:15.\n", + " Batch 600 of 2,642. Elapsed: 0:04:33.\n", + " Batch 640 of 2,642. Elapsed: 0:04:51.\n", + " Batch 680 of 2,642. Elapsed: 0:05:10.\n", + " Batch 720 of 2,642. Elapsed: 0:05:28.\n", + " Batch 760 of 2,642. Elapsed: 0:05:46.\n", + " Batch 800 of 2,642. Elapsed: 0:06:04.\n", + " Batch 840 of 2,642. Elapsed: 0:06:22.\n", + " Batch 880 of 2,642. Elapsed: 0:06:41.\n", + " Batch 920 of 2,642. Elapsed: 0:06:59.\n", + " Batch 960 of 2,642. Elapsed: 0:07:17.\n", + " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", + " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", + " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", + " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", + " Batch 1,160 of 2,642. 
Elapsed: 0:08:48.\n", + " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", + " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", + " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", + " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", + " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", + " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", + " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", + " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", + " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", + " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", + " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", + " Batch 1,640 of 2,642. Elapsed: 0:12:26.\n", + " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", + " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", + " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", + " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", + " Batch 1,840 of 2,642. Elapsed: 0:13:57.\n", + " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", + " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", + " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", + " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", + " Batch 2,040 of 2,642. Elapsed: 0:15:28.\n", + " Batch 2,080 of 2,642. Elapsed: 0:15:46.\n", + " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", + " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", + " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", + " Batch 2,240 of 2,642. Elapsed: 0:16:59.\n", + " Batch 2,280 of 2,642. Elapsed: 0:17:17.\n", + " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", + " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", + " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", + " Batch 2,440 of 2,642. Elapsed: 0:18:30.\n", + " Batch 2,480 of 2,642. Elapsed: 0:18:48.\n", + " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", + " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", + " Batch 2,600 of 2,642. Elapsed: 0:19:43.\n", + " Batch 2,640 of 2,642. 
Elapsed: 0:20:01.\n", + "\n", + " Average training loss: 0.60\n", + " Training epoch took: 0:20:02\n", + "\n", + "Running Validation...\n", + " Accuracy: 0.80\n", + " Validation took: 0:03:09\n", + "\n", + "Training complete!\n" + ] + } + ], + "source": [ + "# This training code is based on the `run_glue.py` script here:\n", + "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", + "\n", + "# Set the seed value all over the place to make this reproducible.\n", + "seed_val = 42\n", + "\n", + "random.seed(seed_val)\n", + "np.random.seed(seed_val)\n", + "torch.manual_seed(seed_val)\n", + "torch.cuda.manual_seed_all(seed_val)\n", + "\n", + "# Store the average loss after each epoch so I can plot them.\n", + "loss_values = []\n", + "\n", + "# For each epoch...\n", + "for epoch_i in range(0, epochs):\n", + " \n", + " # ========================================\n", + " # Training\n", + " # ========================================\n", + " \n", + " # Perform one full pass over the training set.\n", + "\n", + " print(\"\")\n", + " print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", + " print('Training...')\n", + "\n", + " # Measure how long the training epoch takes.\n", + " t0 = time.time()\n", + "\n", + " # Reset the total loss for this epoch.\n", + " total_loss = 0\n", + "\n", + " # Put the model into training mode.\n", + " model.train()\n", + "\n", + " # For each batch of training data...\n", + " for step, batch in enumerate(train_dataloader):\n", + "\n", + " # Progress update every 40 batches.\n", + " if step % 40 == 0 and not step == 0:\n", + " # Calculate elapsed time in minutes.\n", + " elapsed = format_time(time.time() - t0)\n", + " \n", + " # Report progress.\n", + " print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", + "\n", + " # Unpack this training batch from the dataloader. 
\n", + " #\n", + " # As I unpack the batch, I'll also copy each tensor to the GPU using the \n", + " # `to` method.\n", + " #\n", + " # `batch` contains three pytorch tensors:\n", + " # [0]: input ids \n", + " # [1]: attention masks\n", + " # [2]: labels \n", + " b_input_ids = batch[0].to(device)\n", + " b_input_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + "\n", + " # Always clear any previously calculated gradients before performing a\n", + " # backward pass. PyTorch doesn't do this automatically because \n", + " # accumulating the gradients is \"convenient while training RNNs\". \n", + " # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n", + " model.zero_grad() \n", + "\n", + " # Perform a forward pass (evaluate the model on this training batch).\n", + " # This will return the loss (rather than the model output) because I\n", + " # have provided the `labels`.\n", + " # The documentation for this `model` function is here: \n", + " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", + " outputs = model(b_input_ids, \n", + " token_type_ids=None, \n", + " attention_mask=b_input_mask, \n", + " labels=b_labels)\n", + " \n", + " # The call to `model` always returns a tuple, so I need to pull the \n", + " # loss value out of the tuple.\n", + " loss = outputs[0]\n", + "\n", + " # Accumulate the training loss over all of the batches so that I can\n", + " # calculate the average loss at the end. 
`loss` is a Tensor containing a\n", + " # single value; the `.item()` function just returns the Python value \n", + " # from the tensor.\n", + " total_loss += loss.item()\n", + "\n", + " # Perform a backward pass to calculate the gradients.\n", + " loss.backward()\n", + "\n", + " # Clip the norm of the gradients to 1.0.\n", + " # This is to help prevent the \"exploding gradients\" problem.\n", + " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", + "\n", + " # Update parameters and take a step using the computed gradient.\n", + " # The optimizer dictates the \"update rule\"--how the parameters are\n", + " # modified based on their gradients, the learning rate, etc.\n", + " optimizer.step()\n", + "\n", + " # Update the learning rate.\n", + " scheduler.step()\n", + "\n", + " # Calculate the average loss over the training data.\n", + " avg_train_loss = total_loss / len(train_dataloader) \n", + " \n", + " # Store the loss value for plotting the learning curve.\n", + " loss_values.append(avg_train_loss)\n", + "\n", + " print(\"\")\n", + " print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n", + " print(\" Training epoch took: {:}\".format(format_time(time.time() - t0)))\n", + " \n", + " # ========================================\n", + " # Validation\n", + " # ========================================\n", + " # After the completion of each training epoch, measure the performance on\n", + " # the validation set.\n", + "\n", + " print(\"\")\n", + " print(\"Running Validation...\")\n", + "\n", + " t0 = time.time()\n", + "\n", + " # Put the model in evaluation mode--the dropout layers behave differently\n", + " # during evaluation.\n", + " model.eval()\n", + "\n", + " # Tracking variables \n", + " eval_loss, eval_accuracy = 0, 0\n", + " nb_eval_steps, nb_eval_examples = 0, 0\n", + "\n", + " # Evaluate data for one epoch\n", + " for batch in validation_dataloader:\n", + " \n", + " # Add batch to GPU\n", + " batch = tuple(t.to(device) for t in 
batch)\n", + " \n", + " # Unpack the inputs from dataloader\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " \n", + " # Telling the model not to compute or store gradients, saving memory and\n", + " # speeding up validation\n", + " with torch.no_grad(): \n", + "\n", + " # Forward pass, calculate logit predictions.\n", + " # This will return the logits rather than the loss because we have\n", + " # not provided labels.\n", + " # token_type_ids is the same as the \"segment ids\", which \n", + " # differentiates sentence 1 and 2 in 2-sentence tasks.\n", + " # The documentation for this `model` function is here: \n", + " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", + " outputs = model(b_input_ids, \n", + " token_type_ids=None, \n", + " attention_mask=b_input_mask)\n", + " \n", + " # Get the \"logits\" output by the model. The \"logits\" are the output\n", + " # values prior to applying an activation function like the softmax.\n", + " logits = outputs[0]\n", + "\n", + " # Move logits and labels to CPU\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + " \n", + " # Calculate the accuracy for this batch of test sentences.\n", + " tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n", + " \n", + " # Accumulate the total accuracy.\n", + " eval_accuracy += tmp_eval_accuracy\n", + "\n", + " # Track the number of batches\n", + " nb_eval_steps += 1\n", + "\n", + " # Report the final accuracy for this validation run.\n", + " print(\" Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n", + " print(\" Validation took: {:}\".format(format_time(time.time() - t0)))\n", + "\n", + "print(\"\")\n", + "print(\"Training complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uEe7lPtVKpIY" + }, + "source": [ + "## Saving model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + 
"background_save": true + }, + "id": "AYCSVm_wKnuM" + }, + "outputs": [], + "source": [ + "model_path = \"drive/MyDrive/Classification-EDdA/model_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".pt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true + }, + "id": "qmsxrOqjCsGo" + }, + "outputs": [], + "source": [ + "torch.save(model, model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pM9bSsckCndR" + }, + "source": [ + "## Loading model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cEycmiS8Cnjw" + }, + "outputs": [], + "source": [ + "#model = torch.load(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VJwyfmakkQyj" + }, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K9qdtYexIIvk" + }, + "outputs": [], + "source": [ + "def evaluate_bert(data, labels, model, batch_size):\n", + " # Tokenize all of the sentences and map the tokens to thier word IDs.\n", + " input_ids = []\n", + " # For every sentence...\n", + " for sent in data:\n", + " # `encode` will:\n", + " # (1) Tokenize the sentence.\n", + " # (2) Prepend the `[CLS]` token to the start.\n", + " # (3) Append the `[SEP]` token to the end.\n", + " # (4) Map tokens to their IDs.\n", + " encoded_sent = tokenizer.encode(\n", + " str(sent), # Sentence to encode.\n", + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", + " )\n", + " \n", + " input_ids.append(encoded_sent)\n", + "\n", + " # Pad our input tokens\n", + " padded = []\n", + " for i in input_ids:\n", + "\n", + " if len(i) > max_len:\n", + " padded.extend([i[:max_len]])\n", + " else:\n", + " padded.extend([i + [0] * (max_len - len(i))])\n", + " input_ids = np.array(padded)\n", + "\n", + " # Create attention masks\n", + " attention_masks = []\n", + "\n", + " # Create a mask of 1s for each token followed by 0s for 
padding\n", + " for seq in input_ids:\n", + " seq_mask = [float(i>0) for i in seq]\n", + " attention_masks.append(seq_mask) \n", + "\n", + " # Convert to tensors.\n", + " prediction_inputs = torch.tensor(input_ids)\n", + " prediction_masks = torch.tensor(attention_masks)\n", + " prediction_labels = torch.tensor(labels)\n", + "\n", + " # Create the DataLoader.\n", + " prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n", + " prediction_sampler = SequentialSampler(prediction_data)\n", + " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n", + "\n", + " print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))\n", + "\n", + " # Put model in evaluation mode\n", + " model.eval()\n", + "\n", + " # Tracking variables \n", + " predictions , true_labels = [], []\n", + "\n", + " # Predict \n", + " for batch in prediction_dataloader:\n", + " # Add batch to GPU\n", + " batch = tuple(t.to(device) for t in batch)\n", + " \n", + " # Unpack the inputs from the dataloader\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " \n", + " # Telling the model not to compute or store gradients, saving memory and \n", + " # speeding up prediction\n", + " with torch.no_grad():\n", + " # Forward pass, calculate logit predictions\n", + " outputs = model(b_input_ids, token_type_ids=None, \n", + " attention_mask=b_input_mask)\n", + "\n", + " logits = outputs[0]\n", + " #print(logits)\n", + "\n", + " # Move logits and labels to CPU\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + " #print(logits)\n", + " \n", + " # Store predictions and true labels\n", + " predictions.append(logits)\n", + " true_labels.append(label_ids)\n", + "\n", + " print(' DONE.')\n", + "\n", + "\n", + " pred_labels = []\n", + "\n", + " # Convert the logits of each test batch into predicted class labels\n", + " print('Calculating the matrics for each batch...')\n", + "\n", 
+ " for i in range(len(true_labels)):\n", + " \n", + " # The predictions for this batch are an ndarray with one column of logits per \n", + " # class. Pick the label with the highest value and turn this\n", + " # into a flat array of predicted class ids.\n", + " pred_labels_i = np.argmax(predictions[i], axis=1).flatten()\n", + " pred_labels.append(pred_labels_i)\n", + "\n", + "\n", + " pred_labels_ = [item for sublist in pred_labels for item in sublist]\n", + " true_labels_ = [item for sublist in true_labels for item in sublist]\n", + "\n", + " return pred_labels_, true_labels_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AJ0suC8iMs8a" + }, + "outputs": [], + "source": [ + "dataset_name = [\"validation\", \"test\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dPjV_5g8DDQy" + }, + "outputs": [], + "source": [ + "for dataset in dataset_name:\n", + " df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n", + " data_eval = df_eval[columnText].values\n", + "\n", + " y = df_eval[columnClass]\n", + " encoder = preprocessing.LabelEncoder()\n", + " y = encoder.fit_transform(y)\n", + " labels = y.tolist()\n", + "\n", + " pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n", + "\n", + "\n", + " report = classification_report(true_labels_, pred_labels_, output_dict = True)\n", + " \n", + " classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + " classesName = encoder.classes_\n", + "\n", + " precision = []\n", + " recall = []\n", + " f1 = []\n", + " support = []\n", + " dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + " for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + " accuracy = report['accuracy']\n", + " 
weighted_avg = report['weighted avg']\n", + " cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n", + " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + " TP = np.diag(cnf_matrix)\n", + " TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + " dff['className'] = classesName\n", + " dff['precision'] = precision\n", + " dff['recall'] = recall\n", + " dff['f1-score'] = f1\n", + " dff['support'] = support\n", + " dff['FP'] = FP\n", + " dff['FN'] = FN\n", + " dff['TP'] = TP\n", + " dff['TN'] = TN\n", + "\n", + " print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n", + "\n", + " print(weighted_avg)\n", + " print(accuracy)\n", + " print(dff)\n", + "\n", + " dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cVdM4eT6I8g2" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HzxyFO3knanV" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KDRPPw4Wnap7" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DX81R2dcnasF" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wgfqJFVeJMK1" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GqEf5_41JMNZ" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x_n57EvhJMQh" + }, + "outputs": [], + "source": [ + "model_path = \"drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "id": "R3_9tA9MI8ju" + }, + "outputs": [], + "source": [ + "model = torch.load(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_fzgS5USJeAF", + "outputId": "be4a5506-76ed-4eef-bb3c-fe2bb77c6e4d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-09-30 19:38:22-- https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv\n", + "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", + "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 356197 (348K) [text/tab-separated-values]\n", + "Saving to: ‘LGE_withContent.tsv’\n", + "\n", + "LGE_withContent.tsv 100%[===================>] 347.85K 567KB/s in 0.6s \n", + "\n", + "2021-09-30 19:38:24 (567 KB/s) - ‘LGE_withContent.tsv’ saved [356197/356197]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8WEJjQC7I8mP" + }, + "outputs": [], + "source": [ + "df_LGE = pd.read_csv(\"LGE_withContent.tsv\", sep=\"\\t\")\n", + "data_LGE = df_LGE[\"content\"].values\n", + "\n", + "\n", + "#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "9qJDTU-6vzkk", + "outputId": "1b279f0e-7715-4d23-f524-08e8ba327f6c" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + 
" }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>tome</th>\n", + " <th>rank</th>\n", + " <th>domain</th>\n", + " <th>remark</th>\n", + " <th>content</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>abrabeses-0</td>\n", + " <td>1</td>\n", + " <td>623</td>\n", + " <td>geography</td>\n", + " <td>NaN</td>\n", + " <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>accius-0</td>\n", + " <td>1</td>\n", + " <td>1076</td>\n", + " <td>biography</td>\n", + " <td>NaN</td>\n", + " <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>achenbach-2</td>\n", + " <td>1</td>\n", + " <td>1357</td>\n", + " <td>biography</td>\n", + " <td>NaN</td>\n", + " <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>acireale-0</td>\n", + " <td>1</td>\n", + " <td>1513</td>\n", + " <td>geography</td>\n", + " <td>NaN</td>\n", + " <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>actée-0</td>\n", + " <td>1</td>\n", + " <td>1731</td>\n", + " <td>botany</td>\n", + " <td>NaN</td>\n", + " <td>ACTÉE(ActÅ“a L.). Genre de plantes de la famill...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id tome ... remark content\n", + "0 abrabeses-0 1 ... NaN ABRABESES. Village d’Espagne de la prov. de Za...\n", + "1 accius-0 1 ... NaN ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...\n", + "2 achenbach-2 1 ... NaN ACHENBACH(Henri), administrateur prussien, né ...\n", + "3 acireale-0 1 ... NaN ACIREALE. Yille de Sicile, de la province et d...\n", + "4 actée-0 1 ... 
def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):
    """Tokenize raw texts and wrap them in a sequential DataLoader for inference.

    Args:
        chosen_model: Name of the pretrained tokenizer to load; either
            'bert-base-multilingual-cased' or 'camembert-base'.
        sentences_to_predict: Iterable of strings to classify.
        batch_size: Number of sequences per batch.
        max_len: Fixed sequence length; longer sequences are cut, shorter
            ones are right-padded.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask)`` batches in the
        same order as ``sentences_to_predict``.

    Raises:
        ValueError: If ``chosen_model`` is not one of the supported names.
    """
    if chosen_model == 'bert-base-multilingual-cased':
        print('Loading Bert Tokenizer...')
        tokenizer = BertTokenizer.from_pretrained(chosen_model)
    elif chosen_model == 'camembert-base':
        print('Loading Camembert Tokenizer...')
        tokenizer = CamembertTokenizer.from_pretrained(chosen_model)
    else:
        # Fail early instead of hitting an unbound `tokenizer` further down.
        raise ValueError(
            "Unsupported model %r: expected 'bert-base-multilingual-cased' "
            "or 'camembert-base'" % (chosen_model,)
        )

    input_ids_test = []
    attention_masks = []
    for sent in sentences_to_predict:
        # `encode` tokenizes, adds the special start/end tokens and maps the
        # tokens to vocabulary ids.
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)

        # NOTE(review): plain slicing drops the closing special token of
        # over-long texts; kept as-is on the assumption that training used
        # the same truncation -- confirm before switching to tokenizer-side
        # truncation.
        encoded_sent = encoded_sent[:max_len]
        n_pad = max_len - len(encoded_sent)

        # Padded positions are filled with id 0 and excluded through the
        # attention mask. The mask is derived from the real length rather
        # than from `id > 0`, so it does not rely on 0 being the pad id.
        input_ids_test.append(encoded_sent + [0] * n_pad)
        attention_masks.append([1.0] * len(encoded_sent) + [0.0] * n_pad)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids_test, dtype=torch.long)
    prediction_masks = torch.tensor(attention_masks, dtype=torch.float)

    # Sequential sampling keeps predictions aligned with the input order.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    return prediction_dataloader


def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):
    """Predict one class id per sequence with a fine-tuned classifier.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=mask)`` whose first output holds the logits.
        sentences_to_predict_dataloader: DataLoader yielding
            ``(input_ids, attention_mask)`` batches.

    Returns:
        A flat list with the arg-max class id of every sequence, in
        DataLoader order. Ids still need to be mapped back to class names by
        the caller.
    """
    # Use the GPU when there is one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Batches are moved to `device` below, so the model must live there too.
    model.to(device)

    # Put model in evaluation mode.
    model.eval()

    pred_labels_ = []
    for batch in sentences_to_predict_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        # No gradients are needed for inference; this saves memory and time.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()

        # One row of logits per sequence: keep the index of the highest
        # score as the predicted class id.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rFFGhaCvQHfh" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qgJ-O4rcQHiI", + "outputId": "bfe93dd6-4d89-4d5c-be0d-45e1c98c6b14" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LabelEncoder()" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Il faudrait enregistrer l'encoder, \n", + "# sinon on est obligé de le refaire à partir du jeu d'entrainement pour récupérer le noms des classes.\n", + "encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QuST9wJoQHnS" + }, + "outputs": [], + "source": [ + "p2 = list(encoder.inverse_transform(p))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6ek7suq9QHqE", + "outputId": "6636983a-7eba-48c8-d884-f8fb437294dc" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Chimie',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Mathématiques',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Musique',\n", + " 'Commerce',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Physique - [Sciences physico-mathématiques]',\n", + " 'Histoire naturelle',\n", + " 'Chimie',\n", + " 'Histoire',\n", + " 'Physique - [Sciences physico-mathématiques]',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Histoire naturelle',\n", + " 'Médecine - Chirurgie',\n", + " 'Géographie',\n", + 
" 'Architecture',\n", + " 'Histoire naturelle',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Arts et métiers',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Marine',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Architecture',\n", + " 'Histoire naturelle',\n", + " 'Beaux-arts',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Médecine - Chirurgie',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Chimie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Religion',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Agriculture - Economie rustique',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Jeu',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Histoire',\n", + " 'Histoire naturelle',\n", + " 'Commerce',\n", + " 'Histoire',\n", + " 'Militaire (Art) - Guerre - Arme',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Religion',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Agriculture - Economie rustique',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Métiers',\n", + " 'Belles-lettres - Poésie',\n", + " 'Beaux-arts',\n", + " 'Religion',\n", + " 'Architecture',\n", + " 'Architecture',\n", + " 'Architecture',\n", + " 'Géographie',\n", + " 'Chimie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 
'Beaux-arts',\n", + " 'Histoire naturelle',\n", + " 'Militaire (Art) - Guerre - Arme',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Médecine - Chirurgie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Minéralogie',\n", + " 'Belles-lettres - Poésie',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Médecine - Chirurgie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Grammaire',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Mathématiques',\n", + " 'Géographie',\n", + " 'Médecine - Chirurgie',\n", + " 'Blason',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Histoire naturelle',\n", + " 'Militaire (Art) - Guerre - Arme',\n", + " 'Géographie',\n", + " 'Antiquité',\n", + " 'Agriculture - Economie rustique',\n", + " 'Chimie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Belles-lettres - Poésie',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Métiers',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Arts et métiers',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Musique',\n", + " 'Médecine - Chirurgie',\n", + " 'Religion',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Droit - Jurisprudence',\n", + " 'Histoire',\n", + " 'Médecine - Chirurgie',\n", + " 'Histoire',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Chimie',\n", 
+ " 'Antiquité',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Beaux-arts',\n", + " 'Histoire',\n", + " 'Géographie',\n", + " 'Histoire naturelle',\n", + " 'Antiquité',\n", + " 'Grammaire',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Architecture',\n", + " 'Commerce',\n", + " 'Antiquité',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Médecine - Chirurgie',\n", + " 'Histoire naturelle',\n", + " 'Histoire',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Anatomie',\n", + " 'Commerce',\n", + " 'Beaux-arts',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Histoire naturelle',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Commerce',\n", + " 'Architecture',\n", + " 'Commerce',\n", + " 'Antiquité',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Médecine - Chirurgie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Antiquité',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Histoire',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Antiquité',\n", + " 
'Géographie',\n", + " 'Religion',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Philosophie',\n", + " 'Géographie',\n", + " 'Chimie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Géographie',\n", + " 'Beaux-arts',\n", + " 'Commerce',\n", + " 'Commerce',\n", + " 'Géographie',\n", + " 'Géographie']" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XvdDj5PBQHtk" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t39Xs0j7QHXJ" + }, + "outputs": [], + "source": [ + "df_LGE['class_bert'] = p2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "-VZ7geRmQHaD", + "outputId": "350a4122-5b1f-43e2-e372-2f628f665c4a" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>tome</th>\n", + " <th>rank</th>\n", + " <th>domain</th>\n", + " <th>remark</th>\n", + " <th>content</th>\n", + " <th>class_bert</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>abrabeses-0</td>\n", + " <td>1</td>\n", + " <td>623</td>\n", + " <td>geography</td>\n", + " <td>NaN</td>\n", + " <td>ABRABESES. Village d’Espagne de la prov. 
de Za...</td>\n", + " <td>Géographie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>accius-0</td>\n", + " <td>1</td>\n", + " <td>1076</td>\n", + " <td>biography</td>\n", + " <td>NaN</td>\n", + " <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n", + " <td>Géographie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>achenbach-2</td>\n", + " <td>1</td>\n", + " <td>1357</td>\n", + " <td>biography</td>\n", + " <td>NaN</td>\n", + " <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n", + " <td>Géographie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>acireale-0</td>\n", + " <td>1</td>\n", + " <td>1513</td>\n", + " <td>geography</td>\n", + " <td>NaN</td>\n", + " <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n", + " <td>Géographie</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>actée-0</td>\n", + " <td>1</td>\n", + " <td>1731</td>\n", + " <td>botany</td>\n", + " <td>NaN</td>\n", + " <td>ACTÉE(ActÅ“a L.). Genre de plantes de la famill...</td>\n", + " <td>Histoire naturelle</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id ... class_bert\n", + "0 abrabeses-0 ... Géographie\n", + "1 accius-0 ... Géographie\n", + "2 achenbach-2 ... Géographie\n", + "3 acireale-0 ... Géographie\n", + "4 actée-0 ... 
Histoire naturelle\n", + "\n", + "[5 rows x 7 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_LGE.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3xkzdkrKQHwA" + }, + "outputs": [], + "source": [ + "df_LGE.to_csv(\"drive/MyDrive/Classification-EDdA/classification_LGE.tsv\", sep=\"\\t\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "machine_shape": "hm", + "name": "EDdA-Classification_BertFineTuning.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00bd66a81aad4cd7a10df4a67b52b14e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0143df420df444e9aac5c8b39c342021": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_044fc1f96f8347ddb4a79d31edf32174", + "placeholder": "​", + "style": "IPY_MODEL_cf0d3320e06546789b5d5a2021dbc3ad", + "value": " 811k/811k [00:00<00:00, 932kB/s]" + } + }, + "0214f74b229a4232a9edf3cab751b90d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "044fc1f96f8347ddb4a79d31edf32174": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "06c6e7721b68449a9f3619ffdf18dfeb": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5ec6a851b16c4339b51acb6129935f13", + "IPY_MODEL_fd39a852133144e2b4aed474b204451f", + "IPY_MODEL_0143df420df444e9aac5c8b39c342021" + ], + "layout": "IPY_MODEL_c61b6474b55948cb91a598e6b9aa10d2" + } + }, + "12aa3280d3284c07ac12e2fe842b40b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f470af786c1c4d049de4f0a7f373379f", + "placeholder": "​", + "style": "IPY_MODEL_00bd66a81aad4cd7a10df4a67b52b14e", + "value": "Downloading: 100%" + } + }, + "152afcb9245c416fae0fde257fa25e2e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17bf94188b844f649642d9c6e6a20373": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "185ae5ef7be646b797467086ad7d3a82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_152afcb9245c416fae0fde257fa25e2e", + "max": 1395301, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fb3a174c597b47c7a527517004ba5f54", + "value": 1395301 + } + }, + "1bcdb04d16dd4f9e9d86938e1d2def02": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a5efb634a95c42a7abfaaf61e1c2c928", + "max": 445032417, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_600e627de1f0403595f701381dc3b164", + "value": 445032417 + } + }, + "1d97e83c703f4071b9176ba7bf57cddf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "27a20a17123744948e0c1dbf49b51b27": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27e18e1fa3884c0fb0339764e0397990": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_17bf94188b844f649642d9c6e6a20373", + "placeholder": "​", + "style": "IPY_MODEL_d3aaecd7a6e34cc8918a689ac6299746", + "value": " 508/508 [00:00<00:00, 15.9kB/s]" + } + }, + "2af1124092684f8bafab311cbe9bf22c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3203783f58e54b0e856ab84503bf0d3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "340241453dab4db88043d372aaa88c2e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4422e64029184ba4ba30eecfdf2b4306", + "max": 508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1d97e83c703f4071b9176ba7bf57cddf", + "value": 508 + } + }, + "3ceaa994a3814d3c85e2051e37397342": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_75073a0f673345728871dfb0346e7c1b", + "placeholder": "​", + "style": "IPY_MODEL_db8c94b4ed724f859d1ae8c153b01110", + "value": " 1.40M/1.40M [00:00<00:00, 2.81MB/s]" + } + }, + "41558bfcc0464711916c2d96337bef66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4422e64029184ba4ba30eecfdf2b4306": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "4873cc6c9e1d493c9a67d6536e4367a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_12aa3280d3284c07ac12e2fe842b40b0", + "IPY_MODEL_1bcdb04d16dd4f9e9d86938e1d2def02", + "IPY_MODEL_b5f86071b23c40bf9c96f74c613c2729" + ], + "layout": "IPY_MODEL_27a20a17123744948e0c1dbf49b51b27" + } + }, + "5879fadf430646f6af41b1a9b14864ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_95a3332ba4634d1c930a7021eacce230", + "placeholder": "​", + "style": "IPY_MODEL_d53488432f8544de863210d9e8ee4e48", + "value": "Downloading: 100%" + } + }, + "5ec6a851b16c4339b51acb6129935f13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0d9ceaa8d3a4876ae65d877687bcf50", + "placeholder": "​", + "style": 
"IPY_MODEL_aa6ea92757df47eda1e41603cb109e79", + "value": "Downloading: 100%" + } + }, + "600e627de1f0403595f701381dc3b164": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6a29c1c28ceb415f91ec55512da981c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5879fadf430646f6af41b1a9b14864ff", + "IPY_MODEL_340241453dab4db88043d372aaa88c2e", + "IPY_MODEL_27e18e1fa3884c0fb0339764e0397990" + ], + "layout": "IPY_MODEL_2af1124092684f8bafab311cbe9bf22c" + } + }, + "75073a0f673345728871dfb0346e7c1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "885f91c34b9c422889df8b556aad8ec0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "95a3332ba4634d1c930a7021eacce230": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a0d9ceaa8d3a4876ae65d877687bcf50": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5efb634a95c42a7abfaaf61e1c2c928": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa6ea92757df47eda1e41603cb109e79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b5f86071b23c40bf9c96f74c613c2729": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f3b7527bd4d04c81936d8392decee3ac", + "placeholder": "​", + "style": "IPY_MODEL_885f91c34b9c422889df8b556aad8ec0", + "value": " 445M/445M [00:12<00:00, 41.9MB/s]" + } + }, + "c61b6474b55948cb91a598e6b9aa10d2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf0d3320e06546789b5d5a2021dbc3ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d3aaecd7a6e34cc8918a689ac6299746": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "description_width": "" + } + }, + "d53488432f8544de863210d9e8ee4e48": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "db8c94b4ed724f859d1ae8c153b01110": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e674e279b13b41fda3df3a6c89f5fcb1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3b7527bd4d04c81936d8392decee3ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f470af786c1c4d049de4f0a7f373379f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7224a1b831d459594852eece9f05543": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3203783f58e54b0e856ab84503bf0d3c", + "placeholder": "​", + "style": "IPY_MODEL_0214f74b229a4232a9edf3cab751b90d", + "value": "Downloading: 100%" + } + }, + "fb3a174c597b47c7a527517004ba5f54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fba1d1d5c83b40659295a3457d74cb4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f7224a1b831d459594852eece9f05543", + "IPY_MODEL_185ae5ef7be646b797467086ad7d3a82", + "IPY_MODEL_3ceaa994a3814d3c85e2051e37397342" + ], + "layout": "IPY_MODEL_e674e279b13b41fda3df3a6c89f5fcb1" + } + }, + "fd39a852133144e2b4aed474b204451f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41558bfcc0464711916c2d96337bef66", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fdf05cea504c42f793f9c06e58ef995b", + "value": 810912 + } + }, + "fdf05cea504c42f793f9c06e58ef995b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/EDdA_Classification_ClassicModels.ipynb b/notebooks/EDdA_Classification_ClassicModels.ipynb new file mode 100644 index 0000000..fcb2ba0 --- /dev/null +++ 
b/notebooks/EDdA_Classification_ClassicModels.ipynb @@ -0,0 +1,861 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "EDdA-Classification_ClassicModels.ipynb", + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "aXLlx8vXQlJw" + }, + "source": [ + "# Train supervised models for EDdA classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3kYI_pq3Q1BT" + }, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "D_uwiuJq3pAM" + }, + "source": [ + "train_path = 'training_set.tsv'\n", + "validation_path = 'validation_set.tsv'\n", + "test_path = 'test_set.tsv'\n", + "\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "\n", + "minOfInstancePerClass = 0\n", + "maxOfInstancePerClass = 10000\n", + "\n", + "\n", + "classifier_list = [\"bayes\"]\n", + "vectorizer_list = [\"bagofwords\", \"tf_idf\"]\n", + "\n", + "#classifier_list = [\"lr\", \"rfc\", \"sgd\", \"svm\"]\n", + "#vectorizer_list = [\"bagofwords\", \"tf_idf\", \"doc2vec\"]\n", + "\n", + "vectorization_max_df= 1.0\n", + "vectorization_min_df= 4\n", + "vectorization_numberOfFeatures= None\n", + "\n", + "doc2vec_vec_size = 700\n", + "max_epochs = 10\n", + "doc2vec_min_count = 12\n", + "doc2vec_dm = 0\n", + "doc2vec_workers = 8" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P_L0rDhZQ6Fn" + }, + "source": [ + "## Setup colab environment" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FsAR4CsB3aUc", + "outputId": "a5e4efde-a5c9-45f9-ef1c-9223b4d52ac6" + }, + "source": [ + "from psutil import virtual_memory\n", + "ram_gb = 
virtual_memory().total / 1e9\n", + "print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))\n", + "\n", + "if ram_gb < 20:\n", + " print('Not using a high-RAM runtime')\n", + "else:\n", + " print('You are using a high-RAM runtime!')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Your runtime has 27.3 gigabytes of available RAM\n", + "\n", + "You are using a high-RAM runtime!\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "h5MwRwL53aYY", + "outputId": "bc4c4c16-fb20-404a-e044-550fc4ca907d" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4z78CLYi75kV" + }, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bcptSr6o3ac7", + "outputId": "19713482-dfeb-4be3-e63c-35b4253cb9e5" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.svm import SVC\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.model_selection import GridSearchCV\n", + "import pickle\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from nltk.stem.snowball import SnowballStemmer\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize 
import word_tokenize\n", + "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", + "from nltk.tokenize import word_tokenize\n", + "import spacy\n", + "import os\n", + "import nltk\n", + "import string\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dwSVXDtWZB5H", + "outputId": "44e2aa14-726f-43af-aa6a-1b7899e1025b" + }, + "source": [ + "!python -m spacy download fr_core_news_sm" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting fr_core_news_sm==2.2.5\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)\n", + "\u001b[K |████████████████████████████████| 14.7 MB 5.5 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from fr_core_news_sm==2.2.5) (2.2.4)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.8.2)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (57.4.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.1.3)\n", + "Requirement 
already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.6)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.23.0)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.0)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.4.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.0.6)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (7.4.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.62.3)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.19.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.6)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.8.2)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.10.0.2)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from 
def create_dict(df, classColumnName):
    """Count the rows of each class.

    Args:
        df: DataFrame with one row per article.
        classColumnName: name of the column holding the class label.

    Returns:
        dict mapping each class label to its number of rows, as computed by
        ``Series.value_counts`` (missing labels are not counted).
    """
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    """Keep only the rows whose class has at least *threshold* instances.

    The previous implementation concatenated the rows to discard with the
    full frame and called ``drop_duplicates(keep=False)``; that also removed
    every row of a kept class that happened to be duplicated in ``df``.
    Filtering with a boolean mask keeps those rows.

    Args:
        df: DataFrame with one row per article.
        classColumnName: name of the column holding the class label.
        threshold: minimum number of rows a class needs to be kept.

    Returns:
        The filtered DataFrame, in the original row order and with the
        original index labels.
    """
    class_counts = df[classColumnName].value_counts()
    kept_classes = class_counts[class_counts >= threshold].index
    return df[df[classColumnName].isin(kept_classes)]


def resample_classes(df, classColumnName, numberOfInstances, random_state=None):
    """Down-sample every class to at most *numberOfInstances* rows.

    Rows are drawn without replacement. Classes that are already small
    enough are kept entirely (in shuffled order).

    Args:
        df: DataFrame with one row per article.
        classColumnName: name of the column holding the class label.
        numberOfInstances: maximum number of rows to keep per class.
        random_state: optional seed. When ``None`` (default) the global
            NumPy random state is used, as before; pass an integer to get a
            reproducible sample.

    Returns:
        DataFrame indexed by ``(group position, original index label)``,
        groups being ordered by class label.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    sampled_groups = []
    for _, group in df.groupby(classColumnName):
        sample_size = min(len(group), numberOfInstances)
        # Draw row *positions* and use iloc: looking up drawn index labels
        # with .loc returns extra rows when the index contains duplicates.
        positions = rng.choice(len(group), sample_size, replace=False)
        sampled_groups.append(group.iloc[positions])

    if not sampled_groups:
        return df.iloc[0:0]
    return pd.concat(sampled_groups, keys=list(range(len(sampled_groups))))


def _french_stemming_analyzer(vectorizer_class):
    """Build an analyzer that tokenizes, drops French stop words and stems.

    Tokenization is delegated to the default analyzer of *vectorizer_class*.
    """
    stop_words = set(stopwords.words('french'))
    stemmer_fr = SnowballStemmer("french")
    analyzer = vectorizer_class().build_analyzer()

    def stemmed_words_fr(doc):
        return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)

    return stemmed_words_fr


def count_vect(data, max_df= 1.0 , min_df= 1, numberOfFeatures= None ):
    """Return an unfitted bag-of-words vectorizer with French stemming.

    ``stop_words='french'`` is no longer passed: it is not a value
    scikit-learn accepts, and stop words are already removed by the custom
    analyzer.

    Args:
        data: unused; kept so existing calls keep working.
        max_df: forwarded to ``CountVectorizer``.
        min_df: forwarded to ``CountVectorizer``.
        numberOfFeatures: forwarded as ``max_features``.
    """
    return CountVectorizer(analyzer=_french_stemming_analyzer(CountVectorizer),
                           max_df=max_df, min_df=min_df, max_features=numberOfFeatures)


def tf_idf(data, max_df= 1.0 , min_df= 1, numberOfFeatures = None):
    """Return an unfitted TF-IDF vectorizer with French stemming.

    Same contract as ``count_vect`` but built on ``TfidfVectorizer``.

    Args:
        data: unused; kept so existing calls keep working.
        max_df: forwarded to ``TfidfVectorizer``.
        min_df: forwarded to ``TfidfVectorizer``.
        numberOfFeatures: forwarded as ``max_features``.
    """
    return TfidfVectorizer(analyzer=_french_stemming_analyzer(TfidfVectorizer),
                           max_df=max_df, min_df=min_df, max_features=numberOfFeatures)


def tokenize_fr_text(sentence, nlp_model=None, stop_words=None):
    """Tokenize *sentence* and return the lower-cased informative tokens.

    A token is dropped when it is shorter than two characters, when its
    lower-cased form is a stop word, or when it consists only of
    punctuation characters.

    Args:
        sentence: text to tokenize.
        nlp_model: callable returning an iterable of objects with a ``text``
            attribute (e.g. a spaCy pipeline). Defaults to the notebook-level
            ``nlp`` object.
        stop_words: container of lower-case stop words. Defaults to the
            notebook-level ``stopWords`` set.

    Returns:
        list of lower-cased token strings.
    """
    if nlp_model is None:
        nlp_model = nlp
    if stop_words is None:
        stop_words = stopWords
    punctuation = set(string.punctuation)

    tokens = []
    for token in nlp_model(sentence):
        if len(token.text) < 2:
            continue
        lowered = token.text.lower()
        # Compare the lower-cased form so that capitalized stop words
        # (e.g. at the start of a sentence) are filtered too.
        if lowered in stop_words:
            continue
        if all(char in punctuation for char in lowered):
            continue
        tokens.append(lowered)
    return tokens


def doc2vec(tagged_tr, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers):
    """Train a gensim Doc2Vec model on already tagged documents.

    Args:
        tagged_tr: list of ``TaggedDocument`` used to build the vocabulary
            and to train the model.
        max_epochs: number of training epochs.
        doc2vec_vec_size: forwarded as ``vector_size``.
        doc2vec_min_count: forwarded as ``min_count``.
        doc2vec_dm: forwarded as ``dm``.
        doc2vec_workers: forwarded as ``workers``.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    model = Doc2Vec(vector_size=doc2vec_vec_size, min_count=doc2vec_min_count,
                    dm=doc2vec_dm, workers=doc2vec_workers)
    model.build_vocab(tagged_tr)
    model.train(tagged_tr, total_examples=model.corpus_count, epochs=max_epochs)
    return model
# Folder on the mounted Google Drive that holds the pickled vectorizers and
# classifiers as well as the CSV reports.
model_dir = "drive/MyDrive/Classification-EDdA/"


def is_supported_pair(classifier_name, vectorizer_name):
    """Tell whether a classifier can be trained on a vectorizer's output.

    MultinomialNB only accepts non-negative features, so it cannot be fitted
    on Doc2Vec embeddings; that pair is skipped in training and evaluation.
    """
    return not (classifier_name == "bayes" and vectorizer_name == "doc2vec")


# ------------------------------------------------------------ Vectorization

data_train = df_train[columnText].tolist()
vectorizer_dic = {}

nlp = spacy.load("fr_core_news_sm")
stop_words = set(stopwords.words('french'))
# tokenize_fr_text() reads this notebook-level name. It used to be created
# only while training a new Doc2Vec model, so evaluating with a cached model
# failed with a NameError.
stopWords = stop_words
# Stays None when the Doc2Vec model is loaded from the cache.
tagged_tr = None

stemmer = SnowballStemmer('french').stem


def stem_tokenize(text):
    """Tokenize *text* with NLTK, drop French stop words and stem the rest.

    NOTE(review): the vectorizers lower-case and strip accents before this
    function runs, so accented stop words no longer match, and
    ``word_tokenize`` uses its default language. Both are left unchanged
    because the pickled vectorizers call this function by name; changing it
    would silently alter already trained models.
    """
    return [stemmer(token) for token in word_tokenize(text) if token not in stop_words]


for vectorizer_name in vectorizer_list:

    vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) + ".pkl"

    if os.path.isfile(model_dir + vec_file_name):
        # Load an existing vectorizer. These pickles are produced by this
        # notebook; never unpickle a file from an untrusted source.
        with open(model_dir + vec_file_name, 'rb') as file:
            vectorizer = pickle.load(file)
    else:
        # token_pattern is no longer passed: scikit-learn ignores it (with a
        # warning) as soon as a custom tokenizer is given.
        if vectorizer_name == "bagofwords":
            vectorizer = CountVectorizer(analyzer="word", lowercase=True,
                                         strip_accents='unicode', tokenizer=stem_tokenize)
            vectorizer.fit(data_train)
        elif vectorizer_name == "tf_idf":
            vectorizer = TfidfVectorizer(analyzer='word', lowercase=True,
                                         strip_accents='unicode', tokenizer=stem_tokenize)
            vectorizer.fit(data_train)
        elif vectorizer_name == "doc2vec":
            tagged_tr = [TaggedDocument(words=tokenize_fr_text(_d), tags=[str(i)])
                         for i, _d in enumerate(data_train)]
            vectorizer = doc2vec(tagged_tr, max_epochs, doc2vec_vec_size,
                                 doc2vec_min_count, doc2vec_dm, doc2vec_workers)
        else:
            # Without this branch the vectorizer of the previous iteration
            # was saved again under the unknown name.
            raise ValueError("unknown vectorizer: " + str(vectorizer_name))

        # Save the new vectorizer.
        with open(model_dir + vec_file_name, 'wb') as file:
            pickle.dump(vectorizer, file)

    vectorizer_dic[vectorizer_name] = vectorizer


# ------------------------------------------------------ Training classifier

# Estimator class and hyper-parameter grid (None: no grid search) per name.
# NOTE(review): the "log" loss of SGDClassifier was renamed "log_loss" in
# later scikit-learn releases; kept as is for the version used here.
classifier_specs = {
    "bayes": (MultinomialNB, None),
    "lr": (LogisticRegression, {"C": np.logspace(-3, 3, 7)}),
    "sgd": (SGDClassifier, {"loss": ["log", "modified_huber"]}),
    "svm": (SVC, {'kernel': ['linear', 'rbf']}),
    "rfc": (RandomForestClassifier, {'max_features': ['sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8]}),
}

classifier_dic = {}
grid_param = {}

for classifier_name in classifier_list:
    if classifier_name not in classifier_specs:
        raise ValueError("unknown classifier: " + str(classifier_name))
    estimator_class, param_grid = classifier_specs[classifier_name]
    classifier_dic[classifier_name] = estimator_class()
    if param_grid is not None:
        grid_param[classifier_name] = param_grid


y_train = df_train[columnClass]

# Vectorize the training set once per vectorizer instead of once per
# (classifier, vectorizer) pair.
train_vectors = {}
for vec_name, vectorizer in vectorizer_dic.items():
    if vec_name != 'doc2vec':
        train_vectors[vec_name] = vectorizer.transform(data_train)
    elif tagged_tr is not None:
        # Model trained in this session: its stored document vectors are
        # aligned with data_train.
        train_vectors[vec_name] = np.array([vectorizer.docvecs[str(i)] for i in range(len(tagged_tr))])
    else:
        # Cached model: its stored vectors belong to the (random) sample it
        # was trained on, so infer vectors for the current sample instead.
        train_vectors[vec_name] = np.array([vectorizer.infer_vector(tokenize_fr_text(doc))
                                            for doc in data_train])

for clf_name, clf in classifier_dic.items():
    if clf_name in grid_param:
        clf = GridSearchCV(clf, grid_param[clf_name], refit = True, verbose = 3, n_jobs=-1)

    for vec_name in vectorizer_dic:
        if not is_supported_pair(clf_name, vec_name):
            print("skipping " + clf_name + " with " + vec_name)
            continue

        clf.fit(train_vectors[vec_name], y_train)

        clf_file_name = clf_name + '_' + vec_name + '_s' + str(maxOfInstancePerClass) + ".pkl"

        # Save the fitted classifier.
        with open(model_dir + clf_file_name, 'wb') as file:
            pickle.dump(clf, file)


# --------------------------------------------------------------- Evaluation

dataset_name = ["validation", "test"]

for dataset in dataset_name:
    df_eval = pd.read_csv(dataset+"_set.tsv", sep="\t")
    data_eval = df_eval[columnText].tolist()
    y_true = df_eval[columnClass]

    # Vectorize the evaluation set once per vectorizer.
    eval_vectors = {}
    for vectorizer_name in vectorizer_list:
        vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) + ".pkl"
        with open(model_dir + vec_file_name, 'rb') as file:
            vectorizer = pickle.load(file)

        if vectorizer_name != 'doc2vec':
            eval_vectors[vectorizer_name] = vectorizer.transform(data_eval)
        else:
            eval_vectors[vectorizer_name] = np.array([vectorizer.infer_vector(tokenize_fr_text(doc))
                                                      for doc in data_eval])

    for classifier_name in classifier_list:

        for vectorizer_name in vectorizer_list:
            if not is_supported_pair(classifier_name, vectorizer_name):
                continue

            run_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass)
            with open(model_dir + run_name + ".pkl", 'rb') as file:
                clf = pickle.load(file)

            y_pred = clf.predict(eval_vectors[vectorizer_name])

            # One shared, sorted label list keeps the report rows and the
            # confusion-matrix rows on the same classes (the class column
            # used to follow order of appearance, the matrix sorted order).
            labels = sorted(set(y_true) | set(y_pred))

            # Ground truth first, predictions second: the arguments used to
            # be swapped, which exchanged precision and recall.
            report = classification_report(y_true, y_pred, labels=labels,
                                           output_dict = True, zero_division=0)
            accuracy = report['accuracy']
            weighted_avg = report['weighted avg']

            cnf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
            TP = np.diag(cnf_matrix)
            FP = cnf_matrix.sum(axis=0) - TP
            FN = cnf_matrix.sum(axis=1) - TP
            TN = cnf_matrix.sum() - (FP + FN + TP)

            dff = pd.DataFrame({
                'class': labels,
                'precision': [report[c]['precision'] for c in labels],
                'recall': [report[c]['recall'] for c in labels],
                'f1-score': [report[c]['f1-score'] for c in labels],
                'support': [report[c]['support'] for c in labels],
                'FP': FP,
                'FN': FN,
                'TP': TP,
                'TN': TN,
            })

            print(dataset + "_" + run_name)

            print(weighted_avg)
            print(accuracy)
            print(dff)

            dff.to_csv(model_dir + "report_" + dataset + "_" + run_name + ".csv", index=False)
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "validation_bayes_bagofwords_s10000\n", + "{'precision': 0.8377945389222964, 'recall': 0.619530464967571, 'f1-score': 0.6842670335331308, 'support': 10947}\n", + "0.619530464967571\n", + " class precision ... TP TN\n", + "0 Droit - Jurisprudence 0.963590 ... 5 10735\n", + "1 Grammaire 0.321888 ... 46 10760\n", + "2 Histoire naturelle 0.938776 ... 55 10665\n", + "3 Commerce 0.310249 ... 42 10679\n", + "4 Géographie 0.958193 ... 0 10839\n", + "5 Architecture 0.158491 ... 0 10863\n", + "6 Monnaie 0.000000 ... 4 10751\n", + "7 Médecine - Chirurgie 0.735981 ... 3 10860\n", + "8 Métiers 0.917495 ... 0 10925\n", + "9 Militaire (Art) - Guerre - Arme 0.182186 ... 1 10845\n", + "10 Anatomie 0.245989 ... 1 10853\n", + "11 Jeu 0.000000 ... 112 10553\n", + "12 Pharmacie 0.000000 ... 1138 9191\n", + "13 Antiquité 0.209125 ... 0 10921\n", + "14 Belles-lettres - Poésie 0.020513 ... 150 10358\n", + "15 Agriculture - Economie rustique 0.023585 ... 2269 8114\n", + "16 Mathématiques 0.142857 ... 357 9728\n", + "17 Beaux-arts 0.000000 ... 874 9278\n", + "18 Physique - [Sciences physico-mathématiques] 0.364372 ... 0 10893\n", + "19 Marine 0.410468 ... 149 10579\n", + "20 Chasse 0.009804 ... 5 10850\n", + "21 Arts et métiers 0.000000 ... 18 10819\n", + "22 Religion 0.526646 ... 0 10912\n", + "23 Blason 0.034483 ... 45 10699\n", + "24 Pêche 0.025641 ... 0 10926\n", + "25 Histoire 0.603041 ... 0 10886\n", + "26 Maréchage - Manège 0.051546 ... 11 10814\n", + "27 Mesure 0.000000 ... 
0 10924\n", + "28 Economie domestique 0.000000 ... 315 10264\n", + "29 Philosophie 0.000000 ... 923 8722\n", + "30 Superstition 0.000000 ... 0 10888\n", + "31 Chimie 0.010638 ... 0 10854\n", + "32 Médailles 0.000000 ... 90 10659\n", + "33 Musique 0.082707 ... 0 10925\n", + "34 Caractères 0.000000 ... 1 10908\n", + "35 Spectacle 0.000000 ... 168 10570\n", + "36 Minéralogie 0.000000 ... 0 10938\n", + "37 Politique 0.000000 ... 0 10926\n", + "\n", + "[38 rows x 9 columns]\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "validation_bayes_tf_idf_s10000\n", + "{'precision': 0.9361172330822201, 'recall': 0.48853567187357266, 'f1-score': 0.6289575972884817, 'support': 10947}\n", + "0.48853567187357266\n", + " class precision ... TP TN\n", + "0 Droit - Jurisprudence 0.922100 ... 0 10735\n", + "1 Grammaire 0.000000 ... 7 10760\n", + "2 Histoire naturelle 0.888292 ... 0 10684\n", + "3 Commerce 0.036011 ... 1 10682\n", + "4 Géographie 0.995777 ... 
0 10839\n", + "5 Architecture 0.003774 ... 0 10863\n", + "6 Monnaie 0.000000 ... 0 10752\n", + "7 Médecine - Chirurgie 0.221963 ... 0 10860\n", + "8 Métiers 0.903579 ... 0 10925\n", + "9 Militaire (Art) - Guerre - Arme 0.004049 ... 0 10845\n", + "10 Anatomie 0.037433 ... 0 10853\n", + "11 Jeu 0.000000 ... 13 10585\n", + "12 Pharmacie 0.000000 ... 1089 9047\n", + "13 Antiquité 0.000000 ... 0 10921\n", + "14 Belles-lettres - Poésie 0.000000 ... 0 10481\n", + "15 Agriculture - Economie rustique 0.000000 ... 2358 5636\n", + "16 Mathématiques 0.000000 ... 14 10349\n", + "17 Beaux-arts 0.000000 ... 827 9314\n", + "18 Physique - [Sciences physico-mathématiques] 0.004049 ... 0 10893\n", + "19 Marine 0.088154 ... 32 10583\n", + "20 Chasse 0.000000 ... 0 10850\n", + "21 Arts et métiers 0.000000 ... 0 10821\n", + "22 Religion 0.003135 ... 0 10912\n", + "23 Blason 0.000000 ... 1 10700\n", + "24 Pêche 0.000000 ... 0 10926\n", + "25 Histoire 0.023649 ... 0 10886\n", + "26 Maréchage - Manège 0.000000 ... 0 10814\n", + "27 Mesure 0.000000 ... 0 10924\n", + "28 Economie domestique 0.000000 ... 95 10502\n", + "29 Philosophie 0.000000 ... 909 8731\n", + "30 Superstition 0.000000 ... 0 10888\n", + "31 Chimie 0.000000 ... 0 10854\n", + "32 Médailles 0.000000 ... 1 10700\n", + "33 Musique 0.000000 ... 0 10925\n", + "34 Caractères 0.000000 ... 0 10908\n", + "35 Spectacle 0.000000 ... 1 10628\n", + "36 Minéralogie 0.000000 ... 0 10938\n", + "37 Politique 0.000000 ... 0 10926\n", + "\n", + "[38 rows x 9 columns]\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_bayes_bagofwords_s10000\n", + "{'precision': 0.8343333806034451, 'recall': 0.6158940397350994, 'f1-score': 0.6801987597575112, 'support': 13137}\n", + "0.6158940397350994\n", + " class precision ... TP TN\n", + "0 Histoire 0.579466 ... 3 12882\n", + "1 Droit - Jurisprudence 0.953423 ... 44 12913\n", + "2 Géographie 0.953906 ... 58 12804\n", + "3 Métiers 0.922949 ... 48 12815\n", + "4 Architecture 0.150943 ... 0 13008\n", + "5 Médecine - Chirurgie 0.744639 ... 0 13037\n", + "6 Mathématiques 0.225166 ... 2 12900\n", + "7 Grammaire 0.305357 ... 4 13032\n", + "8 Monnaie 0.000000 ... 0 13110\n", + "9 Commerce 0.327945 ... 1 13015\n", + "10 Anatomie 0.196429 ... 2 13025\n", + "11 Physique - [Sciences physico-mathématiques] 0.331081 ... 142 12652\n", + "12 Philosophie 0.000000 ... 1351 11028\n", + "13 Belles-lettres - Poésie 0.008511 ... 0 13106\n", + "14 Militaire (Art) - Guerre - Arme 0.199324 ... 171 12399\n", + "15 Antiquité 0.183544 ... 2711 9779\n", + "16 Maréchage - Manège 0.008621 ... 412 11633\n", + "17 Chasse 0.008197 ... 1054 11199\n", + "18 Agriculture - Economie rustique 0.011811 ... 0 13072\n", + "19 Histoire naturelle 0.942755 ... 
185 12697\n", + "20 Religion 0.535248 ... 1 13021\n", + "21 Mesure 0.000000 ... 34 12983\n", + "22 Musique 0.062500 ... 0 13095\n", + "23 Arts et métiers 0.000000 ... 59 12838\n", + "24 Marine 0.425287 ... 0 13111\n", + "25 Blason 0.038095 ... 0 13064\n", + "26 Chimie 0.017857 ... 10 12976\n", + "27 Economie domestique 0.000000 ... 0 13109\n", + "28 Beaux-arts 0.000000 ... 382 12312\n", + "29 Jeu 0.000000 ... 1114 10375\n", + "30 Pêche 0.000000 ... 0 13066\n", + "31 Politique 0.000000 ... 0 13025\n", + "32 Minéralogie 0.000000 ... 98 12817\n", + "33 Pharmacie 0.000000 ... 0 13111\n", + "34 Superstition 0.000000 ... 0 13090\n", + "35 Caractères 0.000000 ... 205 12686\n", + "36 Médailles 0.000000 ... 0 13126\n", + "37 Spectacle 0.000000 ... 0 13112\n", + "\n", + "[38 rows x 9 columns]\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_bayes_tf_idf_s10000\n", + "{'precision': 0.9374431375624079, 'recall': 0.4883915658065007, 'f1-score': 0.6291194809131295, 'support': 13137}\n", + "0.4883915658065007\n", + " class precision ... TP TN\n", + "0 Histoire 0.018284 ... 0 12883\n", + "1 Droit - Jurisprudence 0.928017 ... 3 12913\n", + "2 Géographie 0.997185 ... 0 12821\n", + "3 Métiers 0.906379 ... 0 12819\n", + "4 Architecture 0.000000 ... 
0 13008\n", + "5 Médecine - Chirurgie 0.230019 ... 0 13037\n", + "6 Mathématiques 0.000000 ... 0 12902\n", + "7 Grammaire 0.000000 ... 0 13032\n", + "8 Monnaie 0.000000 ... 0 13110\n", + "9 Commerce 0.036952 ... 0 13015\n", + "10 Anatomie 0.013393 ... 0 13025\n", + "11 Physique - [Sciences physico-mathématiques] 0.003378 ... 16 12701\n", + "12 Philosophie 0.000000 ... 1315 10852\n", + "13 Belles-lettres - Poésie 0.000000 ... 0 13106\n", + "14 Militaire (Art) - Guerre - Arme 0.003378 ... 0 12577\n", + "15 Antiquité 0.000000 ... 2834 6749\n", + "16 Maréchage - Manège 0.000000 ... 13 12422\n", + "17 Chasse 0.000000 ... 978 11227\n", + "18 Agriculture - Economie rustique 0.000000 ... 0 13072\n", + "19 Histoire naturelle 0.874776 ... 42 12702\n", + "20 Religion 0.002611 ... 0 13021\n", + "21 Mesure 0.000000 ... 0 12986\n", + "22 Musique 0.000000 ... 0 13095\n", + "23 Arts et métiers 0.000000 ... 1 12841\n", + "24 Marine 0.096552 ... 0 13111\n", + "25 Blason 0.000000 ... 0 13064\n", + "26 Chimie 0.000000 ... 0 12977\n", + "27 Economie domestique 0.000000 ... 0 13109\n", + "28 Beaux-arts 0.000000 ... 118 12608\n", + "29 Jeu 0.000000 ... 1094 10439\n", + "30 Pêche 0.000000 ... 0 13066\n", + "31 Politique 0.000000 ... 0 13025\n", + "32 Minéralogie 0.000000 ... 1 12840\n", + "33 Pharmacie 0.000000 ... 0 13111\n", + "34 Superstition 0.000000 ... 0 13090\n", + "35 Caractères 0.000000 ... 1 12754\n", + "36 Médailles 0.000000 ... 0 13126\n", + "37 Spectacle 0.000000 ... 0 13112\n", + "\n", + "[38 rows x 9 columns]\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mMiQo_sR7KWn" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/EDdA_Classification_DeepLearning.ipynb b/notebooks/EDdA_Classification_DeepLearning.ipynb new file mode 100644 index 0000000..d8e9ea6 --- /dev/null +++ b/notebooks/EDdA_Classification_DeepLearning.ipynb @@ -0,0 +1,1351 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "EDdA-Classification_DeepLearning.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "0yFsoHXX8Iyy" + }, + "source": [ + "# Deep learning for EDdA classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EyksTV6277Jv" + }, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G5LT5n9O7SLt" + }, + "source": [ + "train_path = 'training_set.tsv'\n", + "validation_path = 'validation_set.tsv'\n", + "test_path = 'test_set.tsv'\n", + "\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "\n", + "minOfInstancePerClass = 0\n", + "maxOfInstancePerClass = 1500\n", + "\n", + "\n", + "batch_size = 64\n", + "max_len = 512 # \n", + "epochs = 20\n", + "embedding_dim = 300 " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tFlUCDL2778i" + }, + "source": [ + "## Setup colab environment" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sp8d_Uus7SHJ", + "outputId": "82929364-d0a1-4962-fcb4-47224a48e6cf" + }, + "source": [ + 
"from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jQBu-p6hBU-j" + }, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bTIXsF6kBUdh" + }, + "source": [ + "#!pip install zeugma\n", + "#!pip install plot_model" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "56-04SNF8BMx" + }, + "source": [ + "## Import librairies" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HwWkSznz7SEv", + "outputId": "02ecbbf8-556f-4567-b57d-6e13a4ca28ff" + }, + "source": [ + "from nltk.tokenize import word_tokenize\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "\n", + "import keras\n", + "from keras import optimizers\n", + "from keras import backend as K\n", + "from keras import regularizers\n", + "from keras.models import Sequential\n", + "from keras.layers import Dense, Activation, Dropout, Flatten\n", + "from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D\n", + "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n", + "#from keras.utils import plot_model\n", + "from keras.preprocessing import sequence\n", + "from keras.preprocessing.text import Tokenizer\n", + "from keras.callbacks import EarlyStopping\n", + "\n", + "import string\n", + "import tensorflow as tf\n", + "#from zeugma import TextsToSequences\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import preprocessing\n", + "from sklearn.metrics import classification_report\n", + "\n", + "\n", + "\n", + "from tqdm import tqdm\n", + "import requests, zipfile, io\n", 
+ "import os, re, csv, math, codecs" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xrekV6W978l4" + }, + "source": [ + "## Utility functions" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4LJ5blQR7PUe" + }, + "source": [ + "\n", + "def resample_classes(df, classColumnName, numberOfInstances):\n", + " #random numberOfInstances elements\n", + " replace = False # sample without replacement\n", + " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", + " return df.groupby(classColumnName, as_index=False).apply(fn)\n", + " \n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-Rh3JMDh7zYd" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MtLr35eM753e" + }, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FnbNT4NF7zal" + }, + "source": [ + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WNqDms64lfaS" + }, + "source": [ + "# download FastText\n", + "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n", + "r = requests.get(zip_file_url)\n", + "z = 
zipfile.ZipFile(io.BytesIO(r.content))\n", + "z.extractall()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PGMIi0CAmqSd", + "outputId": "09c034fd-f689-43a9-fd75-5923906d89bf" + }, + "source": [ + "print('loading word embeddings...')\n", + "\n", + "embeddings_index = {}\n", + "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", + "\n", + "for line in tqdm(f):\n", + " values = line.rstrip().rsplit(' ')\n", + " word = values[0]\n", + " coefs = np.asarray(values[1:], dtype='float32')\n", + " embeddings_index[word] = coefs\n", + "f.close()\n", + "\n", + "print('found %s word vectors' % len(embeddings_index))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "loading word embeddings...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1999996it [03:40, 9087.22it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "found 1999996 word vectors\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nRLaQUO97zcq" + }, + "source": [ + "df_train = pd.read_csv(train_path, sep=\"\\t\")\n", + "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n", + "\n", + "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n", + "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vGWAgBH87ze8" + }, + "source": [ + "y_train = df_train[columnClass]\n", + "y_validation = df_validation[columnClass]\n", + "numberOfClasses = y_train.nunique()\n", + "\n", + "encoder = preprocessing.LabelEncoder()\n", + "\n", + "y_train = encoder.fit_transform(y_train)\n", + "y_validation = 
encoder.fit_transform(y_validation)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 452 + }, + "id": "7OYjo_uhoqcX", + "outputId": "79c4ff25-0476-4e12-d6ff-a8e073ee3f6c" + }, + "source": [ + "df_validation.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th rowspan=\"5\" valign=\"top\">0</th>\n", + " <th>10449</th>\n", + " <td>14</td>\n", + " <td>2879</td>\n", + " <td>Sabler une allée</td>\n", + " <td>Jardinage</td>\n", + " <td>terme de Jardinier.</td>\n", + " <td>Jaucourt</td>\n", + " <td>v14-1651-1</td>\n", + " <td>jardinage</td>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>\\nSabler une allée, (terme de Jardinier.) 
c'es...</td>\n", + " <td>\\nSabler une allée, () c'est couvrir \\navec ar...</td>\n", + " <td>\\nSabler une allée, () c'est couvrir \\navec ar...</td>\n", + " <td>70</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8134</th>\n", + " <td>17</td>\n", + " <td>1598</td>\n", + " <td>Volée</td>\n", + " <td>Jardinage</td>\n", + " <td>Jardin.</td>\n", + " <td>Jaucourt</td>\n", + " <td>v17-842-3</td>\n", + " <td>jardinage</td>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>\\nVolée, (Jardin.) c'est le nom qu'on donne au...</td>\n", + " <td>\\nVolée, () c'est le nom qu'on donne au travai...</td>\n", + " <td>\\nVolée, () c'est le nom qu'on donne au travai...</td>\n", + " <td>48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5308</th>\n", + " <td>13</td>\n", + " <td>2051</td>\n", + " <td>PRUNELLIER</td>\n", + " <td>Jardinage</td>\n", + " <td>Jardinage.</td>\n", + " <td>unsigned</td>\n", + " <td>v13-1146-0</td>\n", + " <td>jardinage</td>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>\\nPRUNELLIER, s. m. (Jardinage.) arbrisseau ép...</td>\n", + " <td>\\nPRUNELLIER, s. m. () arbrisseau épineux qui ...</td>\n", + " <td>\\nPRUNELLIER, s. m. () arbrisseau épineux qui ...</td>\n", + " <td>275</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10064</th>\n", + " <td>9</td>\n", + " <td>3775</td>\n", + " <td>MACQUE</td>\n", + " <td>Economie rustique</td>\n", + " <td>Econ. rustiq.</td>\n", + " <td>unsigned</td>\n", + " <td>v9-2286-0</td>\n", + " <td>economierustique</td>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>\\nMACQUE, s. f. (Econ. rustiq.) instrument de\\...</td>\n", + " <td>\\nMACQUE, s. f. () instrument de\\nbois dont on...</td>\n", + " <td>\\nMACQUE, s. f. 
() instrument de\\nbois dont on...</td>\n", + " <td>23</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5981</th>\n", + " <td>9</td>\n", + " <td>3262</td>\n", + " <td>LOQUE</td>\n", + " <td>Jardinage</td>\n", + " <td>Jardinage.</td>\n", + " <td>unsigned</td>\n", + " <td>v9-1905-0</td>\n", + " <td>jardinage</td>\n", + " <td>Agriculture - Economie rustique</td>\n", + " <td>\\nLOQUE, s. f. (Jardinage.) terme de jardinage...</td>\n", + " <td>\\nLOQUE, s. f. () terme de jardinage,\\nqui n'e...</td>\n", + " <td>\\nLOQUE, s. f. () terme de jardinage,\\nqui n'e...</td>\n", + " <td>61</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume ... nb_word\n", + "0 10449 14 ... 70\n", + " 8134 17 ... 48\n", + " 5308 13 ... 275\n", + " 10064 9 ... 23\n", + " 5981 9 ... 61\n", + "\n", + "[5 rows x 13 columns]" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HuUVfklf-dSR" + }, + "source": [ + "## Training models" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NTNh6kMTp_eU", + "outputId": "3c1eb88c-7f1d-48f1-92bc-bc671f5e1bc1" + }, + "source": [ + "#https://github.com/emmanuellaanggi/disaster_tweet_sentiment/blob/master/(Medium)_Text_Classification_Disaster_Tweet_.ipynb\n", + "\n", + "raw_docs_train = df_train[columnText].tolist()\n", + "raw_docs_validation = df_validation[columnText].tolist() \n", + "\n", + "\n", + "print(\"pre-processing train data...\")\n", + "\n", + "stop_words = set(stopwords.words('french'))\n", + "\n", + "processed_docs_train = []\n", + "for doc in tqdm(raw_docs_train):\n", + " tokens = word_tokenize(doc, language='french')\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_train.append(\" \".join(filtered))\n", + "#end for\n", + "\n", + "processed_docs_validation = []\n", + "for doc in tqdm(raw_docs_validation):\n", + " tokens = 
word_tokenize(doc)\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_validation.append(\" \".join(filtered))\n", + "#end for\n", + "\n", + "print(\"tokenizing input data...\")\n", + "tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", + "tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", + "word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)\n", + "word_seq_validation = tokenizer.texts_to_sequences(processed_docs_validation)\n", + "word_index = tokenizer.word_index\n", + "print(\"dictionary size: \", len(word_index))\n", + "\n", + "#pad sequences\n", + "word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_len)\n", + "word_seq_validation = sequence.pad_sequences(word_seq_validation, maxlen=max_len)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "pre-processing train data...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n", + "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tokenizing input data...\n", + "dictionary size: 95254\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Wj8RkOhT_e2c", + "outputId": "56152da7-47b7-4b07-84e7-8c499671d53e" + }, + "source": [ + "word_seq_validation" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[ 0, 0, 0, ..., 293, 8, 7],\n", + " [ 0, 0, 0, ..., 112, 8, 7],\n", + " [ 0, 0, 0, ..., 498, 212, 4],\n", + " ...,\n", + " [ 0, 0, 0, ..., 1, 28, 45],\n", + " [ 0, 0, 0, ..., 67, 12, 460],\n", + " [ 0, 0, 0, ..., 188, 213, 37]], dtype=int32)" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + 
"cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wGjQI0YgpQAS", + "outputId": "43a3d902-5a8d-4159-a21e-419b5ee35d7d" + }, + "source": [ + "#embedding matrix\n", + "\n", + "print('preparing embedding matrix...')\n", + "\n", + "words_not_found = []\n", + "nb_words = min(max_len, len(word_index)+1)\n", + "embedding_matrix = np.zeros((nb_words, embedding_dim))\n", + "\n", + "for word, i in word_index.items():\n", + " if i >= nb_words:\n", + " continue\n", + " embedding_vector = embeddings_index.get(word)\n", + " if (embedding_vector is not None) and len(embedding_vector) > 0:\n", + " # words not found in embedding index will be all-zeros.\n", + " embedding_matrix[i] = embedding_vector\n", + " else:\n", + " words_not_found.append(word)\n", + "print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "preparing embedding matrix...\n", + "number of null word embeddings: 70\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hjaeYIZCtGca", + "outputId": "5ab4dd1a-a500-479f-e289-892242c83de8" + }, + "source": [ + "print(\"sample words not found: \", np.random.choice(words_not_found, 10))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sample words not found: ['especes' \"d'argent\" \"d'où\" \"d'argent\" \"qu'elle\" \"qu'elle\" \"c'étoit\"\n", + " 'différens' 'faisoit' 'faisoit']\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4O0gnsX8pNVU", + "outputId": "46feba64-b608-4b53-de15-b586dc24b880" + }, + "source": [ + "from keras.layers import BatchNormalization\n", + "import tensorflow as tf\n", + "\n", + "model = tf.keras.Sequential()\n", + "\n", + 
"model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n", + "model.add(Bidirectional(LSTM(100)))\n", + "model.add(Dense(64,activation='relu'))\n", + "model.add(Dropout(0.2))\n", + "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n", + "model.add(Dense(numberOfClasses,activation='softmax'))\n", + "model.summary()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " embedding (Embedding) (None, 512, 300) 153600 \n", + " \n", + " bidirectional (Bidirectiona (None, 200) 320800 \n", + " l) \n", + " \n", + " dense (Dense) (None, 64) 12864 \n", + " \n", + " dropout (Dropout) (None, 64) 0 \n", + " \n", + " dense_1 (Dense) (None, 38) 2470 \n", + " \n", + "=================================================================\n", + "Total params: 489,734\n", + "Trainable params: 336,134\n", + "Non-trainable params: 153,600\n", + "_________________________________________________________________\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GcfMJl8f-cBA" + }, + "source": [ + "\n", + "#model = NN_withEmbeddings(longueur_dict, embedding_dim, max_len, numberOfClasses)\n", + "\n", + "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", + "#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=True)])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OTQTH5VDuA3I", + "outputId": "b8286232-4938-4591-b483-6b6d1bdc015e" + }, + "source": [ + "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = 
batch_size) \n", + "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/20\n", + "83/83 [==============================] - 530s 6s/step - loss: 3.0575 - accuracy: 0.1886 - val_loss: 2.2493 - val_accuracy: 0.4315\n", + "Epoch 2/20\n", + "83/83 [==============================] - 525s 6s/step - loss: 2.4420 - accuracy: 0.3559 - val_loss: 1.9674 - val_accuracy: 0.4978\n", + "Epoch 3/20\n", + "83/83 [==============================] - 538s 6s/step - loss: 2.1828 - accuracy: 0.4177 - val_loss: 1.8540 - val_accuracy: 0.5212\n", + "Epoch 4/20\n", + "83/83 [==============================] - 515s 6s/step - loss: 2.0359 - accuracy: 0.4555 - val_loss: 1.7155 - val_accuracy: 0.5439\n", + "Epoch 5/20\n", + "83/83 [==============================] - 533s 6s/step - loss: 1.9296 - accuracy: 0.4800 - val_loss: 1.6698 - val_accuracy: 0.5502\n", + "Epoch 6/20\n", + "83/83 [==============================] - 521s 6s/step - loss: 1.8527 - accuracy: 0.4990 - val_loss: 1.6268 - val_accuracy: 0.5634\n", + "Epoch 7/20\n", + "83/83 [==============================] - 517s 6s/step - loss: 1.7960 - accuracy: 0.5127 - val_loss: 1.6098 - val_accuracy: 0.5664\n", + "Epoch 8/20\n", + "83/83 [==============================] - 506s 6s/step - loss: 1.7429 - accuracy: 0.5213 - val_loss: 1.5687 - val_accuracy: 0.5741\n", + "Epoch 9/20\n", + "83/83 [==============================] - 524s 6s/step - loss: 1.6994 - accuracy: 0.5328 - val_loss: 1.5799 - val_accuracy: 0.5761\n", + "Epoch 10/20\n", + "83/83 [==============================] - 531s 6s/step - loss: 1.6568 - accuracy: 0.5426 - val_loss: 1.5366 - val_accuracy: 0.5874\n", + "Epoch 11/20\n", + "83/83 [==============================] - 515s 6s/step - loss: 1.6147 - accuracy: 0.5525 - val_loss: 1.5965 - val_accuracy: 0.5639\n", + "Epoch 12/20\n", + "83/83 
[==============================] - 506s 6s/step - loss: 1.5833 - accuracy: 0.5601 - val_loss: 1.5263 - val_accuracy: 0.5880\n", + "Epoch 13/20\n", + "83/83 [==============================] - 505s 6s/step - loss: 1.5477 - accuracy: 0.5694 - val_loss: 1.5200 - val_accuracy: 0.5889\n", + "Epoch 14/20\n", + "83/83 [==============================] - 498s 6s/step - loss: 1.5119 - accuracy: 0.5776 - val_loss: 1.5272 - val_accuracy: 0.5887\n", + "Epoch 15/20\n", + "83/83 [==============================] - 500s 6s/step - loss: 1.4732 - accuracy: 0.5852 - val_loss: 1.5367 - val_accuracy: 0.5897\n", + "Epoch 16/20\n", + "83/83 [==============================] - 501s 6s/step - loss: 1.4471 - accuracy: 0.5914 - val_loss: 1.5411 - val_accuracy: 0.5832\n", + "Epoch 17/20\n", + "83/83 [==============================] - 501s 6s/step - loss: 1.4036 - accuracy: 0.6039 - val_loss: 1.5438 - val_accuracy: 0.5893\n", + "Epoch 18/20\n", + "83/83 [==============================] - 501s 6s/step - loss: 1.3778 - accuracy: 0.6075 - val_loss: 1.5547 - val_accuracy: 0.5825\n", + "Epoch 19/20\n", + "83/83 [==============================] - 502s 6s/step - loss: 1.3452 - accuracy: 0.6159 - val_loss: 1.5920 - val_accuracy: 0.5753\n", + "Epoch 20/20\n", + "83/83 [==============================] - 501s 6s/step - loss: 1.3247 - accuracy: 0.6223 - val_loss: 1.5850 - val_accuracy: 0.5773\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "<keras.callbacks.History at 0x7f4269526a90>" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uw6YR76p_AF0" + }, + "source": [ + "## Saving models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ykTp9lyRaAma" + }, + "source": [ + "model.save(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5J4xDoqRUSfS" + }, 
+ "source": [ + "# save embeddings\n", + "\n", + "# saving embeddings index \n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HHlEtipG_Cp0" + }, + "source": [ + "## Loading models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fKt8ft1t_Cxx" + }, + "source": [ + "model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zbS4poso-3k7" + }, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G9pjdMdNW_KS" + }, + "source": [ + "predictions = model.predict(word_seq_validation)\n", + "predictions = np.argmax(predictions,axis=1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IHpVJ79IW_M0", + "outputId": "78e2a1aa-d35c-428c-e6c3-0ad332abcdfd" + }, + "source": [ + "report = classification_report(predictions, y_validation, output_dict = True)\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "print(accuracy, weighted_avg)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.5773390217283461 {'precision': 0.5977985581006744, 'recall': 0.5773390217283461, 'f1-score': 0.5808733866443131, 'support': 10079}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9SKjWffUW_PC" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LpgkGq-fW_RN" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4gGNaPY1iuXD" + }, + "source": [ + "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "\n", + "encoder = preprocessing.LabelEncoder()\n", + "y_test = encoder.fit_transform(df_test[columnClass])\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P67p7BUZiuZV", + "outputId": "f958a063-ee95-4157-fcd9-796991615f03" + }, + "source": [ + "raw_docs_test = df_test[columnText].tolist()\n", + "\n", + "print(\"pre-processing test data...\")\n", + "\n", + "stop_words = set(stopwords.words('french'))\n", + "\n", + "processed_docs_test = []\n", + "for doc in tqdm(raw_docs_test):\n", + " tokens = word_tokenize(doc, language='french')\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_test.append(\" \".join(filtered))\n", + "#end for\n", 
+ "\n", + "print(\"tokenizing input data...\")\n", + "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", + "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", + "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", + "\n", + "#pad sequences\n", + "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "pre-processing test data...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 13137/13137 [00:09<00:00, 1317.07it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tokenizing input data...\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "czeIqlD5iudH" + }, + "source": [ + "predictions = model.predict(word_seq_test)\n", + "predictions = np.argmax(predictions,axis=1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Q9eYqi5SW_Ta", + "outputId": "3682a42a-7c07-446e-d913-3d20640fb2bf" + }, + "source": [ + "report = classification_report(predictions, y_test, output_dict = True)\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "print(accuracy, weighted_avg)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.5957220065463956 {'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ra4FOHVniwUI", + "outputId": "cbe576f6-ce14-49ef-9aba-2d26f76cab92" + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "classesName = encoder.classes_\n", + "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + "\n", + "precision = []\n", + "recall = []\n", + "f1 = []\n", + "support = []\n", + "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + "for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "\n", + "cnf_matrix = confusion_matrix(y_test, predictions)\n", + "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + "TP = np.diag(cnf_matrix)\n", + "TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + "dff['className'] = classesName\n", + "dff['precision'] = precision\n", + "dff['recall'] = 
recall\n", + "dff['f1-score'] = f1\n", + "dff['support'] = support\n", + "dff['FP'] = FP\n", + "dff['FN'] = FN\n", + "dff['TP'] = TP\n", + "dff['TN'] = TN\n", + "\n", + "print(\"test_lstm_s\"+str(maxOfInstancePerClass))\n", + "\n", + "print(weighted_avg)\n", + "print(accuracy)\n", + "print(dff)\n", + "\n", + "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_lstm_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_lstm_s1500\n", + "{'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n", + "0.5957220065463956\n", + " className precision ... TP TN\n", + "0 Agriculture - Economie rustique 0.259843 ... 66 12780\n", + "1 Anatomie 0.446429 ... 100 12818\n", + "2 Antiquité 0.525316 ... 166 12425\n", + "3 Architecture 0.518868 ... 165 12597\n", + "4 Arts et métiers 0.007752 ... 1 13002\n", + "5 Beaux-arts 0.020000 ... 2 13016\n", + "6 Belles-lettres - Poésie 0.200000 ... 47 12667\n", + "7 Blason 0.466667 ... 49 12908\n", + "8 Caractères 0.074074 ... 2 13110\n", + "9 Chasse 0.262295 ... 32 12929\n", + "10 Chimie 0.348214 ... 39 12952\n", + "11 Commerce 0.524249 ... 227 12442\n", + "12 Droit - Jurisprudence 0.750176 ... 1063 11473\n", + "13 Economie domestique 0.000000 ... 0 13106\n", + "14 Grammaire 0.587500 ... 329 12094\n", + "15 Géographie 0.830753 ... 2361 10167\n", + "16 Histoire 0.459916 ... 327 11749\n", + "17 Histoire naturelle 0.687835 ... 769 11871\n", + "18 Jeu 0.415385 ... 27 13034\n", + "19 Marine 0.708046 ... 308 12497\n", + "20 Maréchage - Manège 0.784483 ... 91 12991\n", + "21 Mathématiques 0.450331 ... 68 12922\n", + "22 Mesure 0.333333 ... 14 13078\n", + "23 Militaire (Art) - Guerre - Arme 0.510135 ... 151 12719\n", + "24 Minéralogie 0.000000 ... 0 13111\n", + "25 Monnaie 0.041096 ... 3 13057\n", + "26 Musique 0.525000 ... 84 12922\n", + "27 Médailles 0.000000 ... 
0 13109\n", + "28 Médecine - Chirurgie 0.584795 ... 300 12279\n", + "29 Métiers 0.592378 ... 715 11248\n", + "30 Pharmacie 0.014085 ... 1 13065\n", + "31 Philosophie 0.160714 ... 18 12934\n", + "32 Physique - [Sciences physico-mathématiques] 0.533784 ... 158 12690\n", + "33 Politique 0.000000 ... 0 13111\n", + "34 Pêche 0.127660 ... 6 13067\n", + "35 Religion 0.357702 ... 137 12580\n", + "36 Spectacle 0.000000 ... 0 13126\n", + "37 Superstition 0.000000 ... 0 13112\n", + "\n", + "[38 rows x 9 columns]\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x03FC0D-iwWP" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gSVqcywgiwYH" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-T5LfFtwiwaV" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Yjd5c70_iwcY" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2UNjiHYliwes" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vLGTnit_W_V8" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "R-3lBXjDD9wE" + }, + "source": [ + "def predict(data, max_len):\n", + " \n", + " pad_sequ_test, _ = prepare_sequence(data, max_len)\n", + " pred_labels_ = model.predict(pad_sequ_test)\n", + "\n", + " return np.argmax(pred_labels_,axis=1)\n", + "\n", + "\n", + "def eval(data, labels, max_len):\n", + " \n", + " pred_labels_ = predict(data, max_len)\n", + " report = classification_report(pred_labels_, labels, output_dict = True)\n", + "\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", + " \n", + " print(accuracy, 
weighted_avg)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6T3kAvKvExgc", + "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386" + }, + "source": [ + "# evaluation sur le jeu de validation\n", + "eval(df_validation[columnText], y_validation, max_len)" + ], + "execution_count": null, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return np.array(self.texts_to_sequences(texts))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pTDJA03_-8yu", + "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122" + }, + "source": [ + "# evaluation sur le jeu de test\n", + "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", + "\n", + "y_test = df_test[columnClass]\n", + "encoder = preprocessing.LabelEncoder()\n", + "y_test = encoder.fit_transform(y_test)\n", + "\n", + "eval(df_test[columnText], y_test, max_len)\n" + ], + "execution_count": null, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with 
different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return np.array(self.texts_to_sequences(texts))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/EDdA_Classification_DeepLearning_2.ipynb b/notebooks/EDdA_Classification_DeepLearning_2.ipynb new file mode 100644 index 0000000..444fc9a --- /dev/null +++ b/notebooks/EDdA_Classification_DeepLearning_2.ipynb @@ -0,0 +1,1349 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "EDdA-Classification_DeepLearning_2.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "0yFsoHXX8Iyy" + }, + "source": [ + "# Deep learning for EDdA classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EyksTV6277Jv" + }, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G5LT5n9O7SLt" + }, + "source": [ + "train_path = 'training_set.tsv'\n", + "validation_path = 'validation_set.tsv'\n", + "test_path = 'test_set.tsv'\n", + "\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "\n", + "minOfInstancePerClass = 0\n", + "maxOfInstancePerClass = 10000\n", + "\n", + "\n", + "batch_size = 64\n", + "max_len = 512 # \n", + "epochs = 20\n", + "embedding_dim = 300 " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tFlUCDL2778i" + }, + "source": [ + "## Setup colab environment" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"Sp8d_Uus7SHJ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "20e599da-b04f-4ed9-95b0-ce22c094eff0" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jQBu-p6hBU-j" + }, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bTIXsF6kBUdh" + }, + "source": [ + "#!pip install zeugma\n", + "#!pip install plot_model" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "56-04SNF8BMx" + }, + "source": [ + "## Import librairies" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HwWkSznz7SEv", + "outputId": "046fd487-180e-4c50-ae33-d5ccc122ef46" + }, + "source": [ + "from nltk.tokenize import word_tokenize\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "\n", + "import keras\n", + "from keras import optimizers\n", + "from keras import backend as K\n", + "from keras import regularizers\n", + "from keras.models import Sequential\n", + "from keras.layers import Dense, Activation, Dropout, Flatten\n", + "from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D\n", + "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n", + "#from keras.utils import plot_model\n", + "from keras.preprocessing import sequence\n", + "from keras.preprocessing.text import Tokenizer\n", + "from keras.callbacks import EarlyStopping\n", + "\n", + "import string\n", + "import tensorflow as tf\n", + "#from zeugma import TextsToSequences\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import 
preprocessing\n", + "from sklearn.metrics import classification_report\n", + "\n", + "\n", + "\n", + "from tqdm import tqdm\n", + "import requests, zipfile, io\n", + "import os, re, csv, math, codecs" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xrekV6W978l4" + }, + "source": [ + "## Utility functions" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4LJ5blQR7PUe" + }, + "source": [ + "\n", + "def resample_classes(df, classColumnName, numberOfInstances):\n", + " # randomly keep at most numberOfInstances rows per class\n", + " replace = False # sample without replacement\n", + " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", + " return df.groupby(classColumnName, as_index=False).apply(fn)\n", + " \n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-Rh3JMDh7zYd" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MtLr35eM753e" + }, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FnbNT4NF7zal" + }, + "source": [ + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WNqDms64lfaS" + }, + "source": [ + "# download FastText\n", +
"zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n", + "r = requests.get(zip_file_url)\n", + "z = zipfile.ZipFile(io.BytesIO(r.content))\n", + "z.extractall()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PGMIi0CAmqSd", + "outputId": "f7f16180-fc1d-4163-c10b-0e7cae00b701" + }, + "source": [ + "print('loading word embeddings...')\n", + "\n", + "embeddings_index = {}\n", + "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", + "\n", + "for line in tqdm(f):\n", + " values = line.rstrip().rsplit(' ')\n", + " word = values[0]\n", + " coefs = np.asarray(values[1:], dtype='float32')\n", + " embeddings_index[word] = coefs\n", + "f.close()\n", + "\n", + "print('found %s word vectors' % len(embeddings_index))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "loading word embeddings...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1999996it [03:42, 9002.96it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "found 1999996 word vectors\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nRLaQUO97zcq" + }, + "source": [ + "df_train = pd.read_csv(train_path, sep=\"\\t\")\n", + "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n", + "\n", + "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n", + "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vGWAgBH87ze8" + }, + "source": [ + "y_train = df_train[columnClass]\n", + "y_validation = df_validation[columnClass]\n", + "numberOfClasses = y_train.nunique()\n", + "\n", + 
"encoder = preprocessing.LabelEncoder()\n", + "\n", + "y_train = encoder.fit_transform(y_train)\n", + "y_validation = encoder.fit_transform(y_validation)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "7OYjo_uhoqcX", + "outputId": "17cccba3-2878-4cf0-e86c-33a20510f0a4" + }, + "source": [ + "df_validation.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " <th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3</td>\n", + " <td>3723</td>\n", + " <td>Condition de Droit ou légale</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td>v3-1814-8</td>\n", + " <td>jurisprudence</td>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>\\nCondition de Droit ou légale, est celle que\\...</td>\n", + " <td>\\nCondition de Droit ou légale, est celle que\\...</td>\n", + " <td>\\nCondition de Droit ou légale, est celle que\\...</td>\n", + " <td>72</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>10</td>\n", + " 
<td>177</td>\n", + " <td>MANIER</td>\n", + " <td>Grammaire</td>\n", + " <td>Gramm.</td>\n", + " <td>unsigned</td>\n", + " <td>v10-112-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>\\nMANIER, v. act. (Gramm.) c'est ou toucher de...</td>\n", + " <td>\\nMANIER, v. act. () c'est ou toucher de\\nla m...</td>\n", + " <td>\\nMANIER, v. act. () c'est ou toucher de\\nla m...</td>\n", + " <td>109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>7</td>\n", + " <td>1357</td>\n", + " <td>GALAIQUE, galaïcos</td>\n", + " <td>Histoire naturelle</td>\n", + " <td>Hist. nat.</td>\n", + " <td>d'Holbach5</td>\n", + " <td>v7-606-0</td>\n", + " <td>histoirenaturelle</td>\n", + " <td>Histoire naturelle</td>\n", + " <td>\\nGALAIQUE, galaïcos, s. f. (Hist. nat.) nom d...</td>\n", + " <td>\\nGALAIQUE, galaïcos, s. f. () nom donné \\npar...</td>\n", + " <td>\\nGALAIQUE, galaïcos, s. f. () nom donné \\npar...</td>\n", + " <td>33</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>3198</td>\n", + " <td>Commis ambulant</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>unsigned</td>\n", + " <td>v3-1623-2</td>\n", + " <td>commerce</td>\n", + " <td>Commerce</td>\n", + " <td>\\nCommis ambulant, est un commis dont l'emploi...</td>\n", + " <td>\\nCommis ambulant, est un commis dont l'emploi...</td>\n", + " <td>\\nCommis ambulant, est un commis dont l'emploi...</td>\n", + " <td>43</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>17</td>\n", + " <td>3047</td>\n", + " <td>ZURMENTUM</td>\n", + " <td>Géographie ancienne</td>\n", + " <td>Géog. anc.</td>\n", + " <td>Jaucourt</td>\n", + " <td>v17-2047-0</td>\n", + " <td>géographie</td>\n", + " <td>Géographie</td>\n", + " <td>\\nZURMENTUM, (Géog. anc.) ville de l'Afrique\\n...</td>\n", + " <td>\\nZURMENTUM, () ville de l'Afrique\\npropre. Pt...</td>\n", + " <td>\\nZURMENTUM, () ville de l'Afrique\\npropre. 
Pt...</td>\n", + " <td>27</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero ... firstParagraph nb_word\n", + "0 3 3723 ... \\nCondition de Droit ou légale, est celle que\\... 72\n", + "1 10 177 ... \\nMANIER, v. act. () c'est ou toucher de\\nla m... 109\n", + "2 7 1357 ... \\nGALAIQUE, galaïcos, s. f. () nom donné \\npar... 33\n", + "3 3 3198 ... \\nCommis ambulant, est un commis dont l'emploi... 43\n", + "4 17 3047 ... \\nZURMENTUM, () ville de l'Afrique\\npropre. Pt... 27\n", + "\n", + "[5 rows x 13 columns]" + ] + }, + "metadata": {}, + "execution_count": 60 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HuUVfklf-dSR" + }, + "source": [ + "## Training models" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NTNh6kMTp_eU", + "outputId": "1ff499d7-a98e-47f9-815e-cbb13b5f307f" + }, + "source": [ + "#https://github.com/emmanuellaanggi/disaster_tweet_sentiment/blob/master/(Medium)_Text_Classification_Disaster_Tweet_.ipynb\n", + "\n", + "raw_docs_train = df_train[columnText].tolist()\n", + "raw_docs_validation = df_validation[columnText].tolist() \n", + "\n", + "\n", + "print(\"pre-processing train data...\")\n", + "\n", + "stop_words = set(stopwords.words('french'))\n", + "\n", + "processed_docs_train = []\n", + "for doc in tqdm(raw_docs_train):\n", + " tokens = word_tokenize(doc, language='french')\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_train.append(\" \".join(filtered))\n", + "#end for\n", + "\n", + "processed_docs_validation = []\n", + "for doc in tqdm(raw_docs_validation):\n", + " tokens = word_tokenize(doc)\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_validation.append(\" \".join(filtered))\n", + "#end for\n", + "\n", + "print(\"tokenizing input data...\")\n", + "tokenizer = Tokenizer(num_words=max_len, 
lower=True, char_level=False)\n", + "tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", + "word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)\n", + "word_seq_validation = tokenizer.texts_to_sequences(processed_docs_validation)\n", + "word_index = tokenizer.word_index\n", + "print(\"dictionary size: \", len(word_index))\n", + "\n", + "#pad sequences\n", + "word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_len)\n", + "word_seq_validation = sequence.pad_sequences(word_seq_validation, maxlen=max_len)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "pre-processing train data...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 30650/30650 [00:23<00:00, 1324.19it/s]\n", + "100%|██████████| 10947/10947 [00:08<00:00, 1355.66it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tokenizing input data...\n", + "dictionary size: 115205\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Wj8RkOhT_e2c", + "outputId": "7f486466-bf76-4b82-ed32-56c31ae6dc2f" + }, + "source": [ + "word_seq_validation" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[ 0, 0, 0, ..., 9, 64, 116],\n", + " [ 0, 0, 0, ..., 301, 57, 313],\n", + " [ 0, 0, 0, ..., 9, 285, 6],\n", + " ...,\n", + " [ 0, 0, 0, ..., 26, 142, 6],\n", + " [ 0, 0, 0, ..., 333, 198, 2],\n", + " [ 0, 0, 0, ..., 24, 335, 1]], dtype=int32)" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wGjQI0YgpQAS", + "outputId": "b2856dc6-793f-491e-8a90-bd5553f71933" + }, + "source": [ + "#embedding matrix\n", + "\n", + "print('preparing embedding matrix...')\n", + 
"\n", + "words_not_found = []\n", + "nb_words = min(max_len, len(word_index)+1)\n", + "embedding_matrix = np.zeros((nb_words, embedding_dim))\n", + "\n", + "for word, i in word_index.items():\n", + " if i >= nb_words:\n", + " continue\n", + " embedding_vector = embeddings_index.get(word)\n", + " if (embedding_vector is not None) and len(embedding_vector) > 0:\n", + " # words not found in embedding index will be all-zeros.\n", + " embedding_matrix[i] = embedding_vector\n", + " else:\n", + " words_not_found.append(word)\n", + "print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "preparing embedding matrix...\n", + "number of null word embeddings: 73\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hjaeYIZCtGca", + "outputId": "3ce480ec-21fa-4a94-f21d-586fd44c51bf" + }, + "source": [ + "print(\"sample words not found: \", np.random.choice(words_not_found, 10))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sample words not found: ['ptolomée' \"l'amérique\" \"l'une\" \"qu'on\" \"lorsqu'il\" \"aujourd'hui\"\n", + " \"c'étoit\" \"qu'elle\" \"l'une\" 'lieues']\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4O0gnsX8pNVU", + "outputId": "28807df5-3c6f-4b62-fe32-a8ae250ddb7b" + }, + "source": [ + "from keras.layers import BatchNormalization\n", + "import tensorflow as tf\n", + "\n", + "model = tf.keras.Sequential()\n", + "\n", + "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n", + "#model.add(Bidirectional(LSTM(100)))\n", + "model.add(Conv1D(64,5,activation='relu'))\n", + "model.add(MaxPooling1D(pool_size=(max_len - 5 + 1)))\n", + 
"model.add(Flatten())\n", + "model.add(Dense(numberOfClasses,activation='softmax'))\n", + "model.summary()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " embedding_2 (Embedding) (None, 512, 300) 153600 \n", + " \n", + " conv1d_2 (Conv1D) (None, 508, 64) 96064 \n", + " \n", + " max_pooling1d_2 (MaxPooling (None, 1, 64) 0 \n", + " 1D) \n", + " \n", + " flatten_2 (Flatten) (None, 64) 0 \n", + " \n", + " dense_2 (Dense) (None, 38) 2470 \n", + " \n", + "=================================================================\n", + "Total params: 252,134\n", + "Trainable params: 98,534\n", + "Non-trainable params: 153,600\n", + "_________________________________________________________________\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GcfMJl8f-cBA" + }, + "source": [ + "\n", + "#model = NN_withEmbeddings(longueur_dict, embedding_dim, max_len, numberOfClasses)\n", + "\n", + "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", + "#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=True)])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OTQTH5VDuA3I", + "outputId": "f01b4a29-6599-49b0-b1ed-52d241a68b19" + }, + "source": [ + "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n", + "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ 
+ "Epoch 1/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 2.4656 - accuracy: 0.3793 - val_loss: 2.1042 - val_accuracy: 0.4652\n", + "Epoch 2/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 1.9110 - accuracy: 0.5068 - val_loss: 1.8333 - val_accuracy: 0.5262\n", + "Epoch 3/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.6637 - accuracy: 0.5682 - val_loss: 1.6986 - val_accuracy: 0.5556\n", + "Epoch 4/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.5183 - accuracy: 0.6033 - val_loss: 1.6377 - val_accuracy: 0.5657\n", + "Epoch 5/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.4169 - accuracy: 0.6247 - val_loss: 1.5928 - val_accuracy: 0.5782\n", + "Epoch 6/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.3342 - accuracy: 0.6436 - val_loss: 1.5676 - val_accuracy: 0.5847\n", + "Epoch 7/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.2657 - accuracy: 0.6595 - val_loss: 1.5651 - val_accuracy: 0.5860\n", + "Epoch 8/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 1.2061 - accuracy: 0.6747 - val_loss: 1.5505 - val_accuracy: 0.5917\n", + "Epoch 9/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 1.1518 - accuracy: 0.6897 - val_loss: 1.5586 - val_accuracy: 0.5873\n", + "Epoch 10/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 1.1022 - accuracy: 0.7027 - val_loss: 1.5791 - val_accuracy: 0.5850\n", + "Epoch 11/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 1.0543 - accuracy: 0.7150 - val_loss: 1.5675 - val_accuracy: 0.5873\n", + "Epoch 12/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 1.0111 - accuracy: 0.7260 - val_loss: 1.5801 - val_accuracy: 0.5852\n", + "Epoch 13/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 0.9718 - accuracy: 0.7358 
- val_loss: 1.5925 - val_accuracy: 0.5855\n", + "Epoch 14/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 0.9371 - accuracy: 0.7463 - val_loss: 1.5984 - val_accuracy: 0.5864\n", + "Epoch 15/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 0.9032 - accuracy: 0.7556 - val_loss: 1.6136 - val_accuracy: 0.5816\n", + "Epoch 16/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 0.8684 - accuracy: 0.7655 - val_loss: 1.6376 - val_accuracy: 0.5775\n", + "Epoch 17/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 0.8394 - accuracy: 0.7744 - val_loss: 1.6575 - val_accuracy: 0.5781\n", + "Epoch 18/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 0.8105 - accuracy: 0.7831 - val_loss: 1.6596 - val_accuracy: 0.5779\n", + "Epoch 19/20\n", + "120/120 [==============================] - 183s 2s/step - loss: 0.7826 - accuracy: 0.7910 - val_loss: 1.6774 - val_accuracy: 0.5741\n", + "Epoch 20/20\n", + "120/120 [==============================] - 184s 2s/step - loss: 0.7560 - accuracy: 0.7996 - val_loss: 1.6946 - val_accuracy: 0.5727\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "<keras.callbacks.History at 0x7f6ca8a6d890>" + ] + }, + "metadata": {}, + "execution_count": 67 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uw6YR76p_AF0" + }, + "source": [ + "## Saving models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ykTp9lyRaAma" + }, + "source": [ + "model.save(\"drive/MyDrive/Classification-EDdA/cnn_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5J4xDoqRUSfS" + }, + "source": [ + "# save embeddings\n", + "\n", + "# saving embeddings index \n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HHlEtipG_Cp0" + }, + "source": [ + 
"## Loading models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fKt8ft1t_Cxx" + }, + "source": [ + "model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/cnn_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zbS4poso-3k7" + }, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G9pjdMdNW_KS" + }, + "source": [ + "predictions = model.predict(word_seq_validation)\n", + "predictions = np.argmax(predictions,axis=1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IHpVJ79IW_M0", + "outputId": "2e1657b3-04d1-42f1-ea8b-9bbcd4744108" + }, + "source": [ + "report = classification_report(predictions, y_validation, output_dict = True)\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "print(accuracy, weighted_avg)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.5726683109527725 {'precision': 0.6118028288513718, 'recall': 0.5726683109527725, 'f1-score': 0.5870482221489528, 'support': 10947}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9SKjWffUW_PC" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LpgkGq-fW_RN" + }, + "source": [ + "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "\n", + "encoder = preprocessing.LabelEncoder()\n", + "y_test = encoder.fit_transform(df_test[columnClass])\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Q9eYqi5SW_Ta", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "31e45f20-583a-4ca6-eac8-21863f6fef5b" + }, + "source": [ + "raw_docs_test = df_test[columnText].tolist()\n", + "\n", + "print(\"pre-processing test data...\")\n", + "\n", + "stop_words = set(stopwords.words('french'))\n", + "\n", + "processed_docs_test = []\n", + "for doc in tqdm(raw_docs_test):\n", + " tokens = word_tokenize(doc, language='french')\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_test.append(\" \".join(filtered))\n", + "#end for\n", + "\n", + "print(\"tokenizing input data...\")\n", + "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", + "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", + "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", + "\n", + "#pad sequences\n", + "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)" + ], + "execution_count": null, + "outputs": [ + { + 
"output_type": "stream", + "name": "stdout", + "text": [ + "pre-processing test data...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 13137/13137 [00:09<00:00, 1331.48it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tokenizing input data...\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_WjpJN-Bqjeb" + }, + "source": [ + "predictions = model.predict(word_seq_test)\n", + "predictions = np.argmax(predictions,axis=1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zUwjL_dQqjgx", + "outputId": "912642ad-95eb-413a-d074-8d4881a57359" + }, + "source": [ + "report = classification_report(predictions, y_test, output_dict = True)\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "print(accuracy, weighted_avg)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.5698409073608891 {'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ka6DcPe7qqvg", + "outputId": "0c8cfbe6-178d-4208-98ba-4ba688e32939" + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "classesName = encoder.classes_\n", + "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + "\n", + "precision = []\n", + "recall = []\n", + "f1 = []\n", + "support = []\n", + "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + "for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + "accuracy = report['accuracy']\n", + "weighted_avg = report['weighted avg']\n", + "\n", + "\n", + "cnf_matrix = confusion_matrix(y_test, predictions)\n", + "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + "TP = np.diag(cnf_matrix)\n", + "TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + "dff['className'] = classesName\n", + "dff['precision'] = precision\n", + "dff['recall'] = recall\n", + "dff['f1-score'] = f1\n", + "dff['support'] = support\n", + "dff['FP'] = FP\n", + "dff['FN'] = FN\n", + "dff['TP'] = TP\n", + "dff['TN'] = TN\n", + "\n", + "print(\"test_cnn_s\"+str(maxOfInstancePerClass))\n", + "\n", + "print(weighted_avg)\n", + "print(accuracy)\n", + "print(dff)\n", + "\n", + 
"dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_cnn_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_cnn_s10000\n", + "{'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}\n", + "0.5698409073608891\n", + " className precision ... TP TN\n", + "0 Agriculture - Economie rustique 0.216535 ... 55 12636\n", + "1 Anatomie 0.459821 ... 103 12768\n", + "2 Antiquité 0.287975 ... 91 12710\n", + "3 Architecture 0.339623 ... 108 12722\n", + "4 Arts et métiers 0.015504 ... 2 12995\n", + "5 Beaux-arts 0.060000 ... 6 13018\n", + "6 Belles-lettres - Poésie 0.127660 ... 30 12761\n", + "7 Blason 0.228571 ... 24 12993\n", + "8 Caractères 0.037037 ... 1 13110\n", + "9 Chasse 0.221311 ... 27 12962\n", + "10 Chimie 0.160714 ... 18 12991\n", + "11 Commerce 0.443418 ... 192 12490\n", + "12 Droit - Jurisprudence 0.762879 ... 1081 11263\n", + "13 Economie domestique 0.000000 ... 0 13102\n", + "14 Grammaire 0.408929 ... 229 12254\n", + "15 Géographie 0.917312 ... 2607 9910\n", + "16 Histoire 0.405063 ... 288 11777\n", + "17 Histoire naturelle 0.743292 ... 831 11661\n", + "18 Jeu 0.061538 ... 4 13067\n", + "19 Marine 0.590805 ... 257 12549\n", + "20 Maréchage - Manège 0.620690 ... 72 13001\n", + "21 Mathématiques 0.549669 ... 83 12903\n", + "22 Mesure 0.095238 ... 4 13087\n", + "23 Militaire (Art) - Guerre - Arme 0.476351 ... 141 12704\n", + "24 Minéralogie 0.000000 ... 0 13111\n", + "25 Monnaie 0.054795 ... 4 13051\n", + "26 Musique 0.287500 ... 46 12904\n", + "27 Médailles 0.000000 ... 0 13107\n", + "28 Médecine - Chirurgie 0.376218 ... 193 12149\n", + "29 Métiers 0.605634 ... 731 11047\n", + "30 Pharmacie 0.070423 ... 5 13045\n", + "31 Philosophie 0.071429 ... 8 12996\n", + "32 Physique - [Sciences physico-mathématiques] 0.378378 ... 112 12674\n", + "33 Politique 0.000000 ... 
0 13110\n", + "34 Pêche 0.170213 ... 8 13069\n", + "35 Religion 0.326371 ... 125 12488\n", + "36 Spectacle 0.000000 ... 0 13121\n", + "37 Superstition 0.000000 ... 0 13112\n", + "\n", + "[38 rows x 9 columns]\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BqJ1_hUUqqx5" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "bhfuGNwIqrOQ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NkL3MopyqrQk" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "XLHl-pvzqjjI" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lLR_Xvi9qjlo" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8cGcLOFTqjoP" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vLGTnit_W_V8" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "R-3lBXjDD9wE" + }, + "source": [ + "def predict(data, max_len):\n", + " \n", + " pad_sequ_test, _ = prepare_sequence(data, max_len)\n", + " pred_labels_ = model.predict(pad_sequ_test)\n", + "\n", + " return np.argmax(pred_labels_,axis=1)\n", + "\n", + "\n", + "def eval(data, labels, max_len):\n", + " \n", + " pred_labels_ = predict(data, max_len)\n", + " report = classification_report(pred_labels_, labels, output_dict = True)\n", + "\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", + " \n", + " print(accuracy, weighted_avg)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "6T3kAvKvExgc", + "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386" + }, + "source": [ + "# evaluation sur le jeu de validation\n", + "eval(df_validation[columnText], y_validation, max_len)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return np.array(self.texts_to_sequences(texts))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pTDJA03_-8yu", + "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122" + }, + "source": [ + "# evaluation sur le jeu de test\n", + "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", + "\n", + "y_test = df_test[columnClass]\n", + "encoder = preprocessing.LabelEncoder()\n", + "y_test = encoder.fit_transform(y_test)\n", + "\n", + "eval(df_test[columnText], y_test, max_len)\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return np.array(self.texts_to_sequences(texts))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/EDdA_Classification_Generate_ConfusionMatrix.ipynb b/notebooks/EDdA_Classification_Generate_ConfusionMatrix.ipynb new file mode 100644 index 0000000..14a33f7 --- /dev/null +++ b/notebooks/EDdA_Classification_Generate_ConfusionMatrix.ipynb @@ -0,0 +1,1181 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "EDdA-Classification_Generate_ConfusionMatrix.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "F-x2Ei_TdhSs" + }, + "source": [ + "train_path = 'training_set.tsv'\n", + "validation_path = 'validation_set.tsv'\n", + "test_path = 'test_set.tsv'\n", + "\n", + "columnText = 'contentWithoutClass'\n", + "columnClass = 'ensemble_domaine_enccre'\n", + "\n", + "minOfInstancePerClass = 0\n", + "maxOfInstancePerClass = 10000" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "En632UWohZBW" + }, + "source": [ + "## Setup colab environment" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WoNGyMbFdsh1", + "outputId": "c5542219-0412-4e16-9779-122d5f99a1e2" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { 
+ "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1LXBuRs9kOOc", + "outputId": "1f5fe407-4a46-4b96-8124-1a0c334616df" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import plot_confusion_matrix\n", + "\n", + "from nltk.stem.snowball import SnowballStemmer\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FNPXtQ19kbco" + }, + "source": [ + "def resample_classes(df, classColumnName, numberOfInstances):\n", + " #random numberOfInstances elements\n", + " replace = False # with replacement\n", + " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", + " return df.groupby(classColumnName, as_index=False).apply(fn)\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jHyc3VeFhrxs" + }, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8-1HRF3Vhr3y", + "outputId": "bd5f5881-363f-41a9-ade7-33bbd1158adb" + }, + "source": [ + "!wget 
https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2021-11-26 08:17:56-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", + "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", + "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 189925180 (181M) [text/tab-separated-values]\n", + "Saving to: ‘training_set.tsv’\n", + "\n", + "training_set.tsv 100%[===================>] 181.13M 31.9MB/s in 6.3s \n", + "\n", + "2021-11-26 08:18:02 (28.9 MB/s) - ‘training_set.tsv’ saved [189925180/189925180]\n", + "\n", + "--2021-11-26 08:18:03-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", + "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", + "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 67474385 (64M) [text/tab-separated-values]\n", + "Saving to: ‘validation_set.tsv’\n", + "\n", + "validation_set.tsv 100%[===================>] 64.35M 24.4MB/s in 2.6s \n", + "\n", + "2021-11-26 08:18:06 (24.4 MB/s) - ‘validation_set.tsv’ saved [67474385/67474385]\n", + "\n", + "--2021-11-26 08:18:06-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv\n", + "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", + "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 79961640 (76M) [text/tab-separated-values]\n", + "Saving to: ‘test_set.tsv’\n", + "\n", + "test_set.tsv 100%[===================>] 76.26M 25.5MB/s in 3.0s \n", + "\n", + "2021-11-26 08:18:09 (25.5 MB/s) - ‘test_set.tsv’ saved [79961640/79961640]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "P_Psa_NhhyAA" + }, + "source": [ + "\n", + "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", + "#df_test.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace=True)\n", + "\n", + "\n", + "data_eval = df_test[columnText].tolist()\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AfsjFx1L_ddl" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iPQmgaSw_dnw" + }, + "source": [ + "## Test sur l'ensemble du corpus" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I-BT_jRs74tI" + }, + "source": [ + "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/EDdA_dataframe_withContent.tsv" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0NrbzDu66-k3" + }, + "source": [ + "\n", + "## test sortie pour Katie avec la classification de tous les articles\n", + "df = pd.read_csv(\"EDdA_dataframe_withContent.tsv\", sep=\"\\t\")\n", + "df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace=True)\n", + "\n", + "\n", + "data_eval = df[columnText].tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k07oOrFyhPJ-" + }, + "source": [ + "## Load model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"SHCqMPk8iPZS" + }, + "source": [ + "classifier_name = \"sgd\" # sgd | lr | rfc | svm | bayes | bert-base-multilingual | camembert-base\n", + "vectorizer_name = \"tf_idf\" # bagofwords | tf_idf | doc2vec" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oJ2xKgoVSQFC" + }, + "source": [ + "# récupérer les modèles depuis le serveur\n", + "\n", + "\n", + "# récupéréer les modèles depuis Google Drive\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xI_4exathQdd" + }, + "source": [ + "if classifier_name in [\"sgd\", \"lr\", \"rfc\", \"svm\", \"bayes\"]:\n", + "\n", + " stop_words = set(stopwords.words('french'))\n", + " stemmer = SnowballStemmer('french').stem\n", + " def stem_tokenize(text):\n", + " return [stemmer(i) for i in word_tokenize(text) if not i in stop_words]\n", + "\n", + " vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", + " with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n", + " vectorizer = pickle.load(file)\n", + "\n", + " clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", + " with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n", + " clf = pickle.load(file)\n", + "\n", + " if vectorizer_name != 'doc2vec' :\n", + " vec_data = vectorizer.transform(data_eval)\n", + " else : \n", + " tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n", + " vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n", + "\n", + "elif classifier_name in [\"bert-base-multilingual\", \"camembert-base\"]:\n", + "\n", + " clf_file_name = \"drive/MyDrive/Classification-EDdA/model_\"+classifier_name + '_s' + str(maxOfInstancePerClass) +\".pt\"\n", + "\n", + " model = torch.load(clf_file_name)\n", + "\n" + ], + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jJjCGPTFjC78", + "outputId": "099e267e-8f5e-4c85-ef8e-b6bb60104c8d" + }, + "source": [ + "df_test[columnClass]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 Commerce\n", + "1 NaN\n", + "2 Marine\n", + "3 Géographie\n", + "4 Histoire\n", + " ... \n", + "15849 Géographie\n", + "15850 NaN\n", + "15851 Arts et métiers\n", + "15852 Anatomie\n", + "15853 NaN\n", + "Name: ensemble_domaine_enccre, Length: 15854, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 853 + }, + "id": "o2J8mU_djDsm", + "outputId": "aa2784b6-623d-4605-cdfb-93e2b6adb3c1" + }, + "source": [ + "plot_confusion_matrix(clf, vec_data, df_test[columnClass], normalize=\"true\", include_values=False, xticks_rotation=\"vertical\", cmap=plt.cm.Blues)\n", + "name = classifier_name + '_' +vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".png\"\n", + "\n", + "print(name)\n", + "pathSave = \"drive/MyDrive/Classification-EDdA/\" + name\n", + "plt.rcParams[\"figure.figsize\"] = (10,10)\n", + "plt.rcParams[\"font.size\"] = 10\n", + "\n", + "plt.savefig(pathSave, bbox_inches='tight')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. 
Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sgd_tf_idf_s10000.png\n" + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5KnaRf855lsv" + }, + "source": [ + "# ajouter le code pour faire la prediction avec les modèles BERT\n", + "\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "llGjT-xsUvR4" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3dGPXQSLUvUn" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NQyuDQw_JOwB" + }, + "source": [ + "y_pred = clf.predict(vec_data)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zgNKwbp_eYos" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dZGxg_OreYrO" + }, + "source": [ + "df_test = df.copy()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g8FfxZ7bKwCe" + }, + "source": [ + "df_test['classification'] = y_pred" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hRcYKfdIK0Tm", + "outputId": "db988435-9716-4cf5-a754-04bc5356369f" + }, + "source": [ + "df_test.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(61738, 14)" + ] + }, + 
"metadata": {}, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nlV3yXcCMb8v" + }, + "source": [ + "df_test.head()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GuotNONXMXgt", + "outputId": "5fb34593-c97d-4401-a617-b25aa8f7e49c" + }, + "source": [ + "df_test.loc[(df_test['ensemble_domaine_enccre'] != df_test['classification'])].shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(8597, 14)" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "raw7PJrtMsDx" + }, + "source": [ + "\n", + "\n", + "df_test['class_is_true'] = df_test['ensemble_domaine_enccre'] == df_test['classification']" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "qDD13-3dOSgK", + "outputId": "a309b603-8179-48ff-ad55-f3599f0dc699" + }, + "source": [ + "df_test.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>numero</th>\n", + " <th>head</th>\n", + " <th>normClass</th>\n", + " <th>classEDdA</th>\n", + " <th>author</th>\n", + " <th>id_enccre</th>\n", + " <th>domaine_enccre</th>\n", + " <th>ensemble_domaine_enccre</th>\n", + " <th>content</th>\n", + " 
<th>contentWithoutClass</th>\n", + " <th>firstParagraph</th>\n", + " <th>nb_word</th>\n", + " <th>classification</th>\n", + " <th>class_is_true</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>A, a & a</td>\n", + " <td>Grammaire</td>\n", + " <td>ordre Encyclopéd. Entend. Science de l'homme, ...</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-0</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>A, a & a s.m. (ordre Encyclopéd.\\nEntend. Scie...</td>\n", + " <td>711</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais5</td>\n", + " <td>v1-1-1</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. la troisieme personne du présen...</td>\n", + " <td>A, mot, est 1. 
la troisieme personne du présen...</td>\n", + " <td>238</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>A</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Dumarsais</td>\n", + " <td>v1-1-2</td>\n", + " <td>grammaire</td>\n", + " <td>Grammaire</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>A, préposition vient du latin à , à dextris, à ...</td>\n", + " <td>1980</td>\n", + " <td>Grammaire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>1</td>\n", + " <td>10</td>\n", + " <td>A, numismatique ou monétaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-5</td>\n", + " <td>numismatique</td>\n", + " <td>Médailles</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>A, numismatique ou monétaire, sur le revers de...</td>\n", + " <td>112</td>\n", + " <td>Médailles</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>A, lapidaire</td>\n", + " <td>unclassified</td>\n", + " <td>unclassified</td>\n", + " <td>Mallet</td>\n", + " <td>v1-1-6</td>\n", + " <td>inscriptions</td>\n", + " <td>Histoire</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>A, lapidaire, dans les anciennes inscriptions ...</td>\n", + " <td>80</td>\n", + " <td>Histoire</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume numero ... classification class_is_true\n", + "3 1 5 ... Grammaire True\n", + "4 1 6 ... Grammaire True\n", + "5 1 7 ... 
Grammaire True\n", + "8 1 10 ... Médailles True\n", + "9 1 11 ... Histoire True\n", + "\n", + "[5 rows x 15 columns]" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qsAd_w_iO9LZ" + }, + "source": [ + "df_test.to_csv('result_classification_sgdtfidf_21.11.24.csv', index=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H4XfLD3EaaTe", + "outputId": "50c60efb-6670-4bd2-8c2d-f7c309fb0932" + }, + "source": [ + "df_test.loc[(df_test['ensemble_domaine_enccre'] == 'Géographie') & (df_test['class_is_true'] == False )].shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(95, 15)" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "J3Nbs6zMCnWh" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "s6xTROC7CnZA" + }, + "source": [ + "## test de sortie des scores (proba) pour chaque classe" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1TyETcoyCnbU" + }, + "source": [ + "y_pred_proba = clf.predict_proba(vec_data)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2W4i8nrLC61s", + "outputId": "86373732-4a06-487f-db1b-0a2e867974fa" + }, + "source": [ + "clf.classes_" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['Agriculture - Economie rustique', 'Anatomie', 'Antiquité',\n", + " 'Architecture', 'Arts et métiers', 'Beaux-arts',\n", + " 'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',\n", + " 'Chimie', 'Commerce', 'Droit - 
Jurisprudence',\n", + " 'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',\n", + " 'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',\n", + " 'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',\n", + " 'Minéralogie', 'Monnaie', 'Musique', 'Médailles',\n", + " 'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',\n", + " 'Physique - [Sciences physico-mathématiques]', 'Politique',\n", + " 'Pêche', 'Religion', 'Spectacle', 'Superstition'], dtype='<U43')" + ] + }, + "metadata": {}, + "execution_count": 47 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "tiecHJyTC66o", + "outputId": "bf846387-9964-418d-d122-9bc032c60266" + }, + "source": [ + "data_eval[0]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "\"\\nLes pins ont encore le mérite de l'agrément ; ils\\nconservent pendant toute l'année leurs feuilles, qui\\ndans la plûpart des especes sont d'une très-belle verdure.\\nCes arbres sont d'une belle stature, & d'un accroissement \\nrégulier ; ils ne sont sujets ni aux insectes,\\n\\n\\nni à aucune maladie ; enfin plusieurs de ces pins sont\\nde la plus belle apparence au printems, par la couleur \\nvive des chatons dont ils sont chargés. Voyez sur\\nla culture du pin, le dictionnaire des Jardiniers de\\nM. Miller, & pour tous égards, le traité des arbres\\nde M. 
Duhamel, qui est entré dans des détails intéressans \\nsur cet arbre.\\n\"" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cFkSivM2Cndt", + "outputId": "8fda16d7-04cc-4609-8fa6-7995a4ffd01c" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([0.38404935, 0. , 0. , 0. , 0. ,\n", + " 0.01376867, 0.10553505, 0. , 0. , 0. ,\n", + " 0. , 0. , 0. , 0. , 0. ,\n", + " 0. , 0.00485592, 0.47335577, 0. , 0. ,\n", + " 0. , 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 0. , 0.01843524, 0. ,\n", + " 0. , 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 0. ])" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "3dG5qbPoCngN", + "outputId": "0ad887fe-dd94-4d4d-856a-b45b8091d650" + }, + "source": [ + "clf.classes_[np.argmax(y_pred_proba[0], axis=0)]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'Histoire naturelle'" + ] + }, + "metadata": {}, + "execution_count": 49 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qsrY1g6mCniF" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gFywr71BCnkt" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 645 + }, + "id": "_Gews6OdbN3d", + "outputId": "03b7bb01-51be-4d35-f090-84f02b697366" + }, + "source": [ + "df_test.loc[(df_test['ensemble_domaine_enccre'] == 'Géographie') & (df_test['class_is_true'] == False 
)].groupby(by=[\"classification\"]).size().reset_index(name='counts').sort_values(by='counts', ascending=False)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>classification</th>\n", + " <th>counts</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Histoire</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Histoire naturelle</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Antiquité</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Belles-lettres - Poésie</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Religion</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Physique - [Sciences physico-mathématiques]</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Droit - Jurisprudence</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Commerce</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Grammaire</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Philosophie</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Marine</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Mathématiques</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Médailles</td>\n", + " 
<td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Médecine - Chirurgie</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Chimie</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Beaux-arts</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Architecture</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Militaire (Art) - Guerre - Arme</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Musique</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " classification counts\n", + "8 Histoire 19\n", + "9 Histoire naturelle 11\n", + "0 Antiquité 10\n", + "3 Belles-lettres - Poésie 9\n", + "18 Religion 9\n", + "17 Physique - [Sciences physico-mathématiques] 8\n", + "6 Droit - Jurisprudence 5\n", + "5 Commerce 4\n", + "7 Grammaire 4\n", + "16 Philosophie 3\n", + "10 Marine 2\n", + "11 Mathématiques 2\n", + "14 Médailles 2\n", + "15 Médecine - Chirurgie 2\n", + "4 Chimie 1\n", + "2 Beaux-arts 1\n", + "1 Architecture 1\n", + "12 Militaire (Art) - Guerre - Arme 1\n", + "13 Musique 1" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IF_N5qRqdsmj" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "C_OcQ-uudso3" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dgFIEa0Pdsre" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tHX62GU4dsue" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file -- GitLab