diff --git a/.DS_Store b/.DS_Store index 55e7fe6422480e50b15aea87086f46dda65ca172..801dbbfe341b8be3c7a21024f642288b612c928d 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 03156e2c22bc6e140bbe03842bc500d4728df496..db1ed1dfb73882a0d0ea40c2b70301ac99f0ce06 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ dataframe_with_normClass_artfl.csv *.pkl .DS_Store .DS_Store +.DS_Store diff --git a/notebooks/EDdA_Classification_BertFineTuning.ipynb b/notebooks/EDdA_Classification_BertFineTuning.ipynb index dc0830e18213cca6d3b8ef6733586bdb1c7715b9..4058698a7665850bd055042710bcb25391cae564 100644 --- a/notebooks/EDdA_Classification_BertFineTuning.ipynb +++ b/notebooks/EDdA_Classification_BertFineTuning.ipynb @@ -62,17 +62,7 @@ "id": "WF0qFN_g3ekz", "outputId": "445ffd96-843b-4ff1-a24d-c110964a63e4" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your runtime has 27.3 gigabytes of available RAM\n", - "\n", - "You are using a high-RAM runtime!\n" - ] - } - ], + "outputs": [], "source": [ "from psutil import virtual_memory\n", "ram_gb = virtual_memory().total / 1e9\n", @@ -94,15 +84,7 @@ "id": "vL0S-s9Uofvn", "outputId": "415b7bf1-d3fd-42b6-ee03-13601c953a4f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mounted at /content/drive\n" - ] - } - ], + "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" @@ -127,16 +109,7 @@ "id": "dPOU-Efhf4ui", "outputId": "fc873e0c-1254-4928-c8e9-e3eb093acc64" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are 1 GPU(s) available.\n", - "We will use the GPU: Tesla P100-PCIE-16GB\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "\n", @@ -175,57 +148,7 @@ "id": "pwmZ5bBvgGNh", "outputId": "e92404c6-af38-4bd8-8c99-20ec6b545b3f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting transformers==4.10.3\n", - " Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)\n", - "\u001b[K |████████████████████████████████| 2.8 MB 5.0 MB/s \n", - "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", - " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", - "\u001b[K |████████████████████████████████| 3.3 MB 38.8 MB/s \n", - "\u001b[?25hCollecting pyyaml>=5.1\n", - " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[K |████████████████████████████████| 596 kB 58.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2019.12.20)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.62.3)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (2.23.0)\n", - "Collecting huggingface-hub>=0.0.12\n", - " Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n", - "\u001b[K |████████████████████████████████| 61 kB 486 kB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (3.4.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (1.19.5)\n", - "Collecting sacremoses\n", - " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", - "\u001b[K |████████████████████████████████| 895 kB 43.3 MB/s \n", - "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (21.3)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.10.3) (4.8.2)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.12->transformers==4.10.3) (3.10.0.2)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.10.3) (3.0.6)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.10.3) (3.6.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2.10)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.10.3) (3.0.4)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (7.1.2)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.15.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.10.3) (1.1.0)\n", - "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", - " Attempting uninstall: pyyaml\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - "Successfully installed huggingface-hub-0.2.1 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.10.3\n", - "Collecting sentencepiece\n", - " Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", - "\u001b[K |████████████████████████████████| 1.2 MB 5.1 MB/s \n", - "\u001b[?25hInstalling collected packages: sentencepiece\n", - "Successfully installed sentencepiece-0.1.96\n" - ] - } - ], + "outputs": [], "source": [ "!pip install transformers==4.10.3\n", "!pip install sentencepiece" @@ -384,16 +307,7 @@ "id": "zj3JDoJNfx1f", "outputId": "59262e3f-5fe0-49f5-bb55-8586653498ab" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(30650, 13)\n", - "(10947, 13)\n" - ] - } - ], + "outputs": [], "source": [ "print(df_train.shape)\n", "print(df_validation.shape)" @@ -417,17 +331,6 @@ "y_validation = encoder.fit_transform(y_validation)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u9AxxaA_h1CM" - }, - "outputs": [], - "source": [ - "#train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -443,39 +346,6 @@ "labels_validation = y_validation.tolist()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Dq_KF5WAsbpC", - "outputId": "ba91b953-abcb-4bed-a5c5-9e429e68239a" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([\"\\nESTAMPEUR, s. m. en , est une\\nsorte de pilon de bois, surmonté d'un manche d'environ \\ndeux piés & demi. On s'en sert pour estamper\\nles formes où l'on veut faire des vergeoises. Voyez\\nVergeoise & Estamper.\\n\",\n", - " \"\\nOn doit ébourgeonner les vignes, alors ce mot doit\\ns'entendre autrement que pour les arbres fruitiers:\\non ébourgeonne les vignes. non-seulement quand on\\nsupprime les bourgeons surnuméraires, mais encore\\nquand on arrête par-en-haut les bourgeons. Il en est\\nde même quand on détache en cassant les faux bourgeons \\nqui poussent d'ordinaire à chaque noeud à \\ncôté des yeux, à commencer par le bas. (K)\\n\",\n", - " \"\\nBois mort en pié, s'il est pourri sur pié, sans\\nsubstance, & bon seulement à brûler.\\n\",\n", - " ...,\n", - " \"\\nIl y a une hydatoscopie naturelle & permise ; elle\\nconsiste à prévoir & à prédire les orages & les tempêtes \\nsur certains signes qu'on remarque dans la mer,\\ndans l'air, & dans les nuages. Voyez Tems & Ouragans. Dict. de Trévoux.\\n\",\n", - " \"\\nMÉTÉOROMANCIE, s.f. () divination par\\nles météores ; & comme les météores ignés sont ceux\\nqui jettent le plus de crainte parmi les hommes, la\\nmétéoromancie désigne proprement la divination par\\nle tonnerre & les éclairs. Cette espece de divination\\npassa des Toscans aux Romains, sons rien perdre de\\nce qu'elle avoit de frivole. Seneque nous apprend\\nque deux auteurs graves, & qui avoient exercé des\\n\\nmagistratures, écrivoient à Rome sur cette matiere.\\nIl semble même que l'un d'eux l'épuisa entierement,\\ncar il donnoit une liste exacte des différentes especes\\nde tonnerres. Il circonstancioit & leurs noms & les\\nprognostics qui s'en pouvoient tirer ; le tout avec un\\nair de confiance plus surprenant encore que les choses\\nqu'il rapportoit. On eût dit, tant cette matiere météorologique lui étoit familiere, qu'il comptoit les tableaux \\nde sa galerie, ou qu'il faisoit la description\\ndes fleurs de son jardin. La plus ancienne maladie,\\nla plus invétérée, la plus incurable du genre humain,\\nc'est l'envie de connoître ce qui doit arriver.\\nNi le voile obscur qui nous cache notre destinée, ni\\nl'expérience journaliere, ni une infinité de tentatives \\nmalheureuses, n'ont pû guerir les hommes. Hé!\\nse dépréviennent-ils jamais d'une erreur agréablement \\nreçue? Nous sommes sur ce point aussi crédules\\nque nos ancêtres ; nous prêtons comme eux l'oreille\\nà toutes les impostures flatteuses. Pour avoir trompé\\ncent fois, elles n'ont point perdu le droit funeste de\\ntromper encore. (D. J.)\\n\",\n", - " \"\\nPENTACLE, s. m. () c'est le nom que la\\nmagie des exorcismes donne à un sceau imprimé ou\\nsur du parchemin vierge fait de peau de bouc, ou\\nsur quelque métal, or, argent, cuivre, étain, plomb,\\n&c. On ne peut faire aucune opération magique pour\\nexorciser les esprits, sans avoir ce sceau qui contient\\nles noms de Dieu. Le pentacle se fait en renfermant\\nun triangle dans deux cercles : on lit dans ce triangle \\nces trois mots ; formatio, reformatio, transformatio. A côté du triangle est le mot agla, qui est très puissant \\npour arrêter la malice des esprits. Il faut que\\nla peau sur laquelle on applique le sceau soit exorcisée \\n& bénite. On exorcise aussi l'encre & la plume,\\ndont on se sert pour écrire les mots dont on vient de\\nparler. Après cela on encense le pentacle ; on l'enferme \\ntrois jours & trois nuits dans un vase bien net ;\\nenfin, on le met dans un linge ou dans un livre que\\nl'on parfume & que l'on exorcise. Voilà les fadaises\\nqu'on lit dans le livre intitulé Encheiridion Leonis papae, ouvrage misérable, qui n'a servi qu'à gâter davantage \\nles esprits crédules & portés à la superstitition.\\n(D. J.)\\n\"],\n", - " dtype=object)" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sentences_train" - ] - }, { "cell_type": "markdown", "metadata": { @@ -548,57 +418,7 @@ "id": "C4bigx_3ibuN", "outputId": "b8cef3f8-7a6c-47d1-9d37-7b3b6d08f00b" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading CamemBERT tokenizer...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "06c6e7721b68449a9f3619ffdf18dfeb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/811k [00:00<?, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fba1d1d5c83b40659295a3457d74cb4e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/1.40M [00:00<?, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6a29c1c28ceb415f91ec55512da981c5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/508 [00:00<?, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Load the BERT tokenizer.\n", "if model_chosen == \"bert\":\n", @@ -619,15 +439,7 @@ "id": "5hNod5X9jDZN", "outputId": "93b6e633-afb7-4bcc-be00-44388f801d64" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Token indices sequence length is longer than the specified maximum sequence length for this model (1263 > 512). Running this sequence through the model will result in indexing errors\n" - ] - } - ], + "outputs": [], "source": [ " # Tokenize all of the sentences and map the tokens to thier word IDs.\n", "input_ids_train = []\n", @@ -685,16 +497,7 @@ "id": "W9EWv5JvjGH3", "outputId": "32cd417d-9a40-4086-d900-b81982407667" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max sentence length train: 2253\n", - "Max sentence length validation: 3067\n" - ] - } - ], + "outputs": [], "source": [ "print('Max sentence length train: ', max([len(sen) for sen in input_ids_train]))\n", "print('Max sentence length validation: ', max([len(sen) for sen in input_ids_validation])) " @@ -862,338 +665,7 @@ "id": "C7M2Er1ajsTf", "outputId": "2c3f467d-ab09-4f8f-d464-a4e738333587" }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4873cc6c9e1d493c9a67d6536e4367a6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/445M [00:00<?, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n", - "- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "data": { - "text/plain": [ - "CamembertForSequenceClassification(\n", - " (roberta): RobertaModel(\n", - " (embeddings): RobertaEmbeddings(\n", - " (word_embeddings): Embedding(32005, 768, padding_idx=1)\n", - " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", - " (token_type_embeddings): Embedding(1, 768)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (encoder): RobertaEncoder(\n", - " (layer): ModuleList(\n", - " (0): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (1): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (2): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (3): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (4): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (5): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (6): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (7): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (8): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (9): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (10): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (11): RobertaLayer(\n", - " (attention): RobertaAttention(\n", - " (self): RobertaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): RobertaSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): RobertaIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): RobertaOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (classifier): RobertaClassificationHead(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (out_proj): Linear(in_features=768, out_features=38, bias=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load BertForSequenceClassification, the pretrained BERT model with a single \n", "# linear classification layer on top.\n", @@ -1267,320 +739,7 @@ "id": "SbHBbYpwkKaA", "outputId": "49f7f5f4-716d-44c2-e299-505086a89061" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======== Epoch 1 / 4 ========\n", - "Training...\n", - " Batch 40 of 2,642. Elapsed: 0:00:18.\n", - " Batch 80 of 2,642. Elapsed: 0:00:36.\n", - " Batch 120 of 2,642. Elapsed: 0:00:55.\n", - " Batch 160 of 2,642. Elapsed: 0:01:13.\n", - " Batch 200 of 2,642. Elapsed: 0:01:31.\n", - " Batch 240 of 2,642. Elapsed: 0:01:49.\n", - " Batch 280 of 2,642. Elapsed: 0:02:08.\n", - " Batch 320 of 2,642. Elapsed: 0:02:26.\n", - " Batch 360 of 2,642. Elapsed: 0:02:44.\n", - " Batch 400 of 2,642. Elapsed: 0:03:02.\n", - " Batch 440 of 2,642. Elapsed: 0:03:20.\n", - " Batch 480 of 2,642. Elapsed: 0:03:39.\n", - " Batch 520 of 2,642. Elapsed: 0:03:57.\n", - " Batch 560 of 2,642. Elapsed: 0:04:15.\n", - " Batch 600 of 2,642. Elapsed: 0:04:33.\n", - " Batch 640 of 2,642. Elapsed: 0:04:51.\n", - " Batch 680 of 2,642. Elapsed: 0:05:10.\n", - " Batch 720 of 2,642. Elapsed: 0:05:28.\n", - " Batch 760 of 2,642. Elapsed: 0:05:46.\n", - " Batch 800 of 2,642. Elapsed: 0:06:04.\n", - " Batch 840 of 2,642. Elapsed: 0:06:22.\n", - " Batch 880 of 2,642. Elapsed: 0:06:41.\n", - " Batch 920 of 2,642. Elapsed: 0:06:59.\n", - " Batch 960 of 2,642. Elapsed: 0:07:17.\n", - " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", - " Batch 1,040 of 2,642. Elapsed: 0:07:54.\n", - " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", - " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", - " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", - " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", - " Batch 1,240 of 2,642. Elapsed: 0:09:25.\n", - " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", - " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", - " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", - " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", - " Batch 1,440 of 2,642. Elapsed: 0:10:56.\n", - " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", - " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", - " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", - " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", - " Batch 1,640 of 2,642. Elapsed: 0:12:27.\n", - " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", - " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", - " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", - " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", - " Batch 1,840 of 2,642. Elapsed: 0:13:58.\n", - " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", - " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", - " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", - " Batch 2,000 of 2,642. Elapsed: 0:15:11.\n", - " Batch 2,040 of 2,642. Elapsed: 0:15:29.\n", - " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", - " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", - " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", - " Batch 2,200 of 2,642. Elapsed: 0:16:42.\n", - " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", - " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", - " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", - " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", - " Batch 2,400 of 2,642. Elapsed: 0:18:13.\n", - " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", - " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", - " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", - " Batch 2,560 of 2,642. Elapsed: 0:19:26.\n", - " Batch 2,600 of 2,642. Elapsed: 0:19:44.\n", - " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", - "\n", - " Average training loss: 2.04\n", - " Training epoch took: 0:20:03\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.75\n", - " Validation took: 0:03:09\n", - "\n", - "======== Epoch 2 / 4 ========\n", - "Training...\n", - " Batch 40 of 2,642. Elapsed: 0:00:18.\n", - " Batch 80 of 2,642. Elapsed: 0:00:36.\n", - " Batch 120 of 2,642. Elapsed: 0:00:55.\n", - " Batch 160 of 2,642. Elapsed: 0:01:13.\n", - " Batch 200 of 2,642. Elapsed: 0:01:31.\n", - " Batch 240 of 2,642. Elapsed: 0:01:49.\n", - " Batch 280 of 2,642. Elapsed: 0:02:07.\n", - " Batch 320 of 2,642. Elapsed: 0:02:26.\n", - " Batch 360 of 2,642. Elapsed: 0:02:44.\n", - " Batch 400 of 2,642. Elapsed: 0:03:02.\n", - " Batch 440 of 2,642. Elapsed: 0:03:20.\n", - " Batch 480 of 2,642. Elapsed: 0:03:38.\n", - " Batch 520 of 2,642. Elapsed: 0:03:57.\n", - " Batch 560 of 2,642. Elapsed: 0:04:15.\n", - " Batch 600 of 2,642. Elapsed: 0:04:33.\n", - " Batch 640 of 2,642. Elapsed: 0:04:51.\n", - " Batch 680 of 2,642. Elapsed: 0:05:10.\n", - " Batch 720 of 2,642. Elapsed: 0:05:28.\n", - " Batch 760 of 2,642. Elapsed: 0:05:46.\n", - " Batch 800 of 2,642. Elapsed: 0:06:04.\n", - " Batch 840 of 2,642. Elapsed: 0:06:22.\n", - " Batch 880 of 2,642. Elapsed: 0:06:41.\n", - " Batch 920 of 2,642. Elapsed: 0:06:59.\n", - " Batch 960 of 2,642. Elapsed: 0:07:17.\n", - " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", - " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", - " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", - " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", - " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", - " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", - " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", - " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", - " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", - " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", - " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", - " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", - " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", - " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", - " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", - " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", - " Batch 1,640 of 2,642. Elapsed: 0:12:27.\n", - " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", - " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", - " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", - " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", - " Batch 1,840 of 2,642. Elapsed: 0:13:58.\n", - " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", - " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", - " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", - " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", - " Batch 2,040 of 2,642. Elapsed: 0:15:29.\n", - " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", - " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", - " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", - " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", - " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", - " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", - " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", - " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", - " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", - " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", - " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", - " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", - " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", - " Batch 2,600 of 2,642. Elapsed: 0:19:44.\n", - " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", - "\n", - " Average training loss: 1.03\n", - " Training epoch took: 0:20:02\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.79\n", - " Validation took: 0:03:09\n", - "\n", - "======== Epoch 3 / 4 ========\n", - "Training...\n", - " Batch 40 of 2,642. Elapsed: 0:00:18.\n", - " Batch 80 of 2,642. Elapsed: 0:00:36.\n", - " Batch 120 of 2,642. Elapsed: 0:00:55.\n", - " Batch 160 of 2,642. Elapsed: 0:01:13.\n", - " Batch 200 of 2,642. Elapsed: 0:01:31.\n", - " Batch 240 of 2,642. Elapsed: 0:01:49.\n", - " Batch 280 of 2,642. Elapsed: 0:02:07.\n", - " Batch 320 of 2,642. Elapsed: 0:02:26.\n", - " Batch 360 of 2,642. Elapsed: 0:02:44.\n", - " Batch 400 of 2,642. Elapsed: 0:03:02.\n", - " Batch 440 of 2,642. Elapsed: 0:03:20.\n", - " Batch 480 of 2,642. Elapsed: 0:03:38.\n", - " Batch 520 of 2,642. Elapsed: 0:03:57.\n", - " Batch 560 of 2,642. Elapsed: 0:04:15.\n", - " Batch 600 of 2,642. Elapsed: 0:04:33.\n", - " Batch 640 of 2,642. Elapsed: 0:04:51.\n", - " Batch 680 of 2,642. Elapsed: 0:05:09.\n", - " Batch 720 of 2,642. Elapsed: 0:05:28.\n", - " Batch 760 of 2,642. Elapsed: 0:05:46.\n", - " Batch 800 of 2,642. Elapsed: 0:06:04.\n", - " Batch 840 of 2,642. Elapsed: 0:06:22.\n", - " Batch 880 of 2,642. Elapsed: 0:06:41.\n", - " Batch 920 of 2,642. Elapsed: 0:06:59.\n", - " Batch 960 of 2,642. Elapsed: 0:07:17.\n", - " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", - " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", - " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", - " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", - " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", - " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", - " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", - " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", - " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", - " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", - " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", - " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", - " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", - " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", - " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", - " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", - " Batch 1,640 of 2,642. Elapsed: 0:12:26.\n", - " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", - " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", - " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", - " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", - " Batch 1,840 of 2,642. Elapsed: 0:13:57.\n", - " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", - " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", - " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", - " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", - " Batch 2,040 of 2,642. Elapsed: 0:15:28.\n", - " Batch 2,080 of 2,642. Elapsed: 0:15:47.\n", - " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", - " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", - " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", - " Batch 2,240 of 2,642. Elapsed: 0:17:00.\n", - " Batch 2,280 of 2,642. Elapsed: 0:17:18.\n", - " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", - " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", - " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", - " Batch 2,440 of 2,642. Elapsed: 0:18:31.\n", - " Batch 2,480 of 2,642. Elapsed: 0:18:49.\n", - " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", - " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", - " Batch 2,600 of 2,642. Elapsed: 0:19:43.\n", - " Batch 2,640 of 2,642. Elapsed: 0:20:02.\n", - "\n", - " Average training loss: 0.75\n", - " Training epoch took: 0:20:02\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.79\n", - " Validation took: 0:03:09\n", - "\n", - "======== Epoch 4 / 4 ========\n", - "Training...\n", - " Batch 40 of 2,642. Elapsed: 0:00:18.\n", - " Batch 80 of 2,642. Elapsed: 0:00:36.\n", - " Batch 120 of 2,642. Elapsed: 0:00:55.\n", - " Batch 160 of 2,642. Elapsed: 0:01:13.\n", - " Batch 200 of 2,642. Elapsed: 0:01:31.\n", - " Batch 240 of 2,642. Elapsed: 0:01:49.\n", - " Batch 280 of 2,642. Elapsed: 0:02:07.\n", - " Batch 320 of 2,642. Elapsed: 0:02:26.\n", - " Batch 360 of 2,642. Elapsed: 0:02:44.\n", - " Batch 400 of 2,642. Elapsed: 0:03:02.\n", - " Batch 440 of 2,642. Elapsed: 0:03:20.\n", - " Batch 480 of 2,642. Elapsed: 0:03:39.\n", - " Batch 520 of 2,642. Elapsed: 0:03:57.\n", - " Batch 560 of 2,642. Elapsed: 0:04:15.\n", - " Batch 600 of 2,642. Elapsed: 0:04:33.\n", - " Batch 640 of 2,642. Elapsed: 0:04:51.\n", - " Batch 680 of 2,642. Elapsed: 0:05:10.\n", - " Batch 720 of 2,642. Elapsed: 0:05:28.\n", - " Batch 760 of 2,642. Elapsed: 0:05:46.\n", - " Batch 800 of 2,642. Elapsed: 0:06:04.\n", - " Batch 840 of 2,642. Elapsed: 0:06:22.\n", - " Batch 880 of 2,642. Elapsed: 0:06:41.\n", - " Batch 920 of 2,642. Elapsed: 0:06:59.\n", - " Batch 960 of 2,642. Elapsed: 0:07:17.\n", - " Batch 1,000 of 2,642. Elapsed: 0:07:35.\n", - " Batch 1,040 of 2,642. Elapsed: 0:07:53.\n", - " Batch 1,080 of 2,642. Elapsed: 0:08:12.\n", - " Batch 1,120 of 2,642. Elapsed: 0:08:30.\n", - " Batch 1,160 of 2,642. Elapsed: 0:08:48.\n", - " Batch 1,200 of 2,642. Elapsed: 0:09:06.\n", - " Batch 1,240 of 2,642. Elapsed: 0:09:24.\n", - " Batch 1,280 of 2,642. Elapsed: 0:09:43.\n", - " Batch 1,320 of 2,642. Elapsed: 0:10:01.\n", - " Batch 1,360 of 2,642. Elapsed: 0:10:19.\n", - " Batch 1,400 of 2,642. Elapsed: 0:10:37.\n", - " Batch 1,440 of 2,642. Elapsed: 0:10:55.\n", - " Batch 1,480 of 2,642. Elapsed: 0:11:14.\n", - " Batch 1,520 of 2,642. Elapsed: 0:11:32.\n", - " Batch 1,560 of 2,642. Elapsed: 0:11:50.\n", - " Batch 1,600 of 2,642. Elapsed: 0:12:08.\n", - " Batch 1,640 of 2,642. Elapsed: 0:12:26.\n", - " Batch 1,680 of 2,642. Elapsed: 0:12:45.\n", - " Batch 1,720 of 2,642. Elapsed: 0:13:03.\n", - " Batch 1,760 of 2,642. Elapsed: 0:13:21.\n", - " Batch 1,800 of 2,642. Elapsed: 0:13:39.\n", - " Batch 1,840 of 2,642. Elapsed: 0:13:57.\n", - " Batch 1,880 of 2,642. Elapsed: 0:14:16.\n", - " Batch 1,920 of 2,642. Elapsed: 0:14:34.\n", - " Batch 1,960 of 2,642. Elapsed: 0:14:52.\n", - " Batch 2,000 of 2,642. Elapsed: 0:15:10.\n", - " Batch 2,040 of 2,642. Elapsed: 0:15:28.\n", - " Batch 2,080 of 2,642. Elapsed: 0:15:46.\n", - " Batch 2,120 of 2,642. Elapsed: 0:16:05.\n", - " Batch 2,160 of 2,642. Elapsed: 0:16:23.\n", - " Batch 2,200 of 2,642. Elapsed: 0:16:41.\n", - " Batch 2,240 of 2,642. Elapsed: 0:16:59.\n", - " Batch 2,280 of 2,642. Elapsed: 0:17:17.\n", - " Batch 2,320 of 2,642. Elapsed: 0:17:36.\n", - " Batch 2,360 of 2,642. Elapsed: 0:17:54.\n", - " Batch 2,400 of 2,642. Elapsed: 0:18:12.\n", - " Batch 2,440 of 2,642. Elapsed: 0:18:30.\n", - " Batch 2,480 of 2,642. Elapsed: 0:18:48.\n", - " Batch 2,520 of 2,642. Elapsed: 0:19:07.\n", - " Batch 2,560 of 2,642. Elapsed: 0:19:25.\n", - " Batch 2,600 of 2,642. Elapsed: 0:19:43.\n", - " Batch 2,640 of 2,642. Elapsed: 0:20:01.\n", - "\n", - " Average training loss: 0.60\n", - " Training epoch took: 0:20:02\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.80\n", - " Validation took: 0:03:09\n", - "\n", - "Training complete!\n" - ] - } - ], + "outputs": [], "source": [ "# This training code is based on the `run_glue.py` script here:\n", "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", @@ -1938,17 +1097,6 @@ " return pred_labels_, true_labels_" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AJ0suC8iMs8a" - }, - "outputs": [], - "source": [ - "dataset_name = [\"validation\", \"test\"]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1957,59 +1105,82 @@ }, "outputs": [], "source": [ - "for dataset in dataset_name:\n", + "\n", + "for dataset in [\"test\", \"validation\"]:\n", " df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n", + " df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True) # supprimer les NaN...\n", + " \n", " data_eval = df_eval[columnText].values\n", "\n", " y = df_eval[columnClass]\n", + "\n", + " \n", + " \n", " encoder = preprocessing.LabelEncoder()\n", " y = encoder.fit_transform(y)\n", " labels = y.tolist()\n", "\n", - " pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n", - "\n", - "\n", - " report = classification_report( pred_labels_, true_labels_, output_dict = True)\n", + " # for maxOfInstancePerClass in [500, 1500, 10000]:\n", + " for maxOfInstancePerClass in [500]:\n", + " # il manque le model BERT s500 ...\n", + " \n", + " #for model_bert in [\"camembert-base\", \"bert-base-multilingual-cased\"]:\n", + " for model_bert in [\"bert-base-multilingual-cased\"]:\n", " \n", - " classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", - " classesName = encoder.classes_\n", - "\n", - " precision = []\n", - " recall = []\n", - " f1 = []\n", - " support = []\n", - " dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", - " for c in classes:\n", - " precision.append(report[c]['precision'])\n", - " recall.append(report[c]['recall'])\n", - " f1.append(report[c]['f1-score'])\n", - " support.append(report[c]['support'])\n", - "\n", - " accuracy = report['accuracy']\n", - " weighted_avg = report['weighted avg']\n", - " cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n", - " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", - " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", - " TP = np.diag(cnf_matrix)\n", - " TN = cnf_matrix.sum() - (FP + FN + TP)\n", - "\n", - " dff['className'] = classesName\n", - " dff['precision'] = precision\n", - " dff['recall'] = recall\n", - " dff['f1-score'] = f1\n", - " dff['support'] = support\n", - " dff['FP'] = FP\n", - " dff['FN'] = FN\n", - " dff['TP'] = TP\n", - " dff['TN'] = TN\n", - "\n", - " print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n", - "\n", - " print(weighted_avg)\n", - " print(accuracy)\n", - " print(dff)\n", - "\n", - " dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" + " model_path = \"drive/MyDrive/Classification-EDdA/model_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".pt\"\n", + " model = torch.load(model_path)\n", + "\n", + " if model_bert == \"bert-base-multilingual-cased\":\n", + " tokenizer = BertTokenizer.from_pretrained(model_bert)\n", + " elif model_bert == \"camembert-base\":\n", + " tokenizer = CamembertTokenizer.from_pretrained(model_bert)\n", + "\n", + " pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n", + "\n", + "\n", + " report = classification_report(true_labels_, pred_labels_, output_dict = True)\n", + " \n", + " classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + " classesName = encoder.classes_\n", + "\n", + " precision = []\n", + " recall = []\n", + " f1 = []\n", + " support = []\n", + " dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + " for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", + " cnf_matrix = confusion_matrix(true_labels_, pred_labels_)\n", + " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + " TP = np.diag(cnf_matrix)\n", + " TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + " dff['className'] = classesName\n", + " dff['precision'] = precision\n", + " dff['recall'] = recall\n", + " dff['f1-score'] = f1\n", + " dff['support'] = support\n", + " dff['FP'] = FP\n", + " dff['FN'] = FN\n", + " dff['TP'] = TP\n", + " dff['TN'] = TN\n", + "\n", + " print(dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass))\n", + "\n", + " print(weighted_avg)\n", + " print(accuracy)\n", + " print(dff)\n", + "\n", + " dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n", + " # enregistrer les predictions\n", + " pd.DataFrame({'labels': pd.Series(true_labels_), 'predictions': pd.Series(pred_labels_)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+model_bert+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n" ] }, { @@ -2065,957 +1236,6 @@ }, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x_n57EvhJMQh" - }, - "outputs": [], - "source": [ - "model_path = \"drive/MyDrive/Classification-EDdA/model_bert-base-multilingual-cased_s10000.pt\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R3_9tA9MI8ju" - }, - "outputs": [], - "source": [ - "model = torch.load(model_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_fzgS5USJeAF", - "outputId": "be4a5506-76ed-4eef-bb3c-fe2bb77c6e4d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-09-30 19:38:22-- https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv\n", - "Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28\n", - "Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 356197 (348K) [text/tab-separated-values]\n", - "Saving to: ‘LGE_withContent.tsv’\n", - "\n", - "LGE_withContent.tsv 100%[===================>] 347.85K 567KB/s in 0.6s \n", - "\n", - "2021-09-30 19:38:24 (567 KB/s) - ‘LGE_withContent.tsv’ saved [356197/356197]\n", - "\n" - ] - } - ], - "source": [ - "!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8WEJjQC7I8mP" - }, - "outputs": [], - "source": [ - "df_LGE = pd.read_csv(\"LGE_withContent.tsv\", sep=\"\\t\")\n", - "data_LGE = df_LGE[\"content\"].values\n", - "\n", - "\n", - "#pred_labels_, true_labels_ = evaluate_bert(data_eval, labels, model, batch_size)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "9qJDTU-6vzkk", - "outputId": "1b279f0e-7715-4d23-f524-08e8ba327f6c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>id</th>\n", - " <th>tome</th>\n", - " <th>rank</th>\n", - " <th>domain</th>\n", - " <th>remark</th>\n", - " <th>content</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>abrabeses-0</td>\n", - " <td>1</td>\n", - " <td>623</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>accius-0</td>\n", - " <td>1</td>\n", - " <td>1076</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>achenbach-2</td>\n", - " <td>1</td>\n", - " <td>1357</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>acireale-0</td>\n", - " <td>1</td>\n", - " <td>1513</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>actée-0</td>\n", - " <td>1</td>\n", - " <td>1731</td>\n", - " <td>botany</td>\n", - " <td>NaN</td>\n", - " <td>ACTÉE(Actœa L.). Genre de plantes de la famill...</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " id tome ... remark content\n", - "0 abrabeses-0 1 ... NaN ABRABESES. Village d’Espagne de la prov. de Za...\n", - "1 accius-0 1 ... NaN ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...\n", - "2 achenbach-2 1 ... NaN ACHENBACH(Henri), administrateur prussien, né ...\n", - "3 acireale-0 1 ... NaN ACIREALE. Yille de Sicile, de la province et d...\n", - "4 actée-0 1 ... NaN ACTÉE(Actœa L.). Genre de plantes de la famill...\n", - "\n", - "[5 rows x 6 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_LGE.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "71-fP61-OOwQ", - "outputId": "ef08b49e-0a9f-4653-e303-3163250af35b" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(310, 6)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_LGE.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lFFed2EAI8oq" - }, - "outputs": [], - "source": [ - "def generate_prediction_dataloader(chosen_model, sentences_to_predict, batch_size = 8, max_len = 512):\n", - "\n", - " if chosen_model == 'bert-base-multilingual-cased' :\n", - " print('Loading Bert Tokenizer...')\n", - " tokenizer = BertTokenizer.from_pretrained(chosen_model)\n", - " elif chosen_model == 'camembert-base':\n", - " print('Loading Camembert Tokenizer...')\n", - " tokenizer = CamembertTokenizer.from_pretrained(chosen_model)\n", - "\n", - " # Tokenize all of the sentences and map the tokens to thier word IDs.\n", - " input_ids_test = []\n", - " # For every sentence...\n", - " for sent in sentences_to_predict:\n", - " # `encode` will:\n", - " # (1) Tokenize the sentence.\n", - " # (2) Prepend the `[CLS]` token to the start.\n", - " # (3) Append the `[SEP]` token to the end.\n", - " # (4) Map tokens to their IDs.\n", - " encoded_sent = tokenizer.encode(\n", - " sent, # Sentence to encode.\n", - " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", - " )\n", - "\n", - " input_ids_test.append(encoded_sent)\n", - "\n", - " # Pad our input tokens\n", - " padded_test = []\n", - " for i in input_ids_test:\n", - "\n", - " if len(i) > max_len:\n", - " padded_test.extend([i[:max_len]])\n", - " else:\n", - "\n", - " padded_test.extend([i + [0] * (max_len - len(i))])\n", - " input_ids_test = np.array(padded_test)\n", - "\n", - " # Create attention masks\n", - " attention_masks = []\n", - "\n", - " # Create a mask of 1s for each token followed by 0s for padding\n", - " for seq in input_ids_test:\n", - " seq_mask = [float(i>0) for i in seq]\n", - " attention_masks.append(seq_mask)\n", - "\n", - " # Convert to tensors.\n", - " prediction_inputs = torch.tensor(input_ids_test)\n", - " prediction_masks = torch.tensor(attention_masks)\n", - " #set batch size\n", - "\n", - "\n", - " # Create the DataLoader.\n", - " prediction_data = TensorDataset(prediction_inputs, prediction_masks)\n", - " prediction_sampler = SequentialSampler(prediction_data)\n", - " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n", - "\n", - " return prediction_dataloader\n", - "\n", - "\n", - "\n", - "def predict_class_bertFineTuning(model, sentences_to_predict_dataloader):\n", - "\n", - "\n", - " # If there's a GPU available...\n", - " if torch.cuda.is_available():\n", - "\n", - " # Tell PyTorch to use the GPU.\n", - " device = torch.device(\"cuda\")\n", - "\n", - " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", - "\n", - " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", - "\n", - " # If not...\n", - " else:\n", - " print('No GPU available, using the CPU instead.')\n", - " device = torch.device(\"cpu\")\n", - "\n", - " # Put model in evaluation mode\n", - " model.eval()\n", - "\n", - " # Tracking variables\n", - " predictions_test , true_labels = [], []\n", - " pred_labels_ = []\n", - " # Predict\n", - " for batch in sentences_to_predict_dataloader:\n", - " # Add batch to GPU\n", - " batch = tuple(t.to(device) for t in batch)\n", - "\n", - " # Unpack the inputs from the dataloader\n", - " b_input_ids, b_input_mask = batch\n", - "\n", - " # Telling the model not to compute or store gradients, saving memory and\n", - " # speeding up prediction\n", - " with torch.no_grad():\n", - " # Forward pass, calculate logit predictions\n", - " outputs = model(b_input_ids, token_type_ids=None,\n", - " attention_mask=b_input_mask)\n", - "\n", - " logits = outputs[0]\n", - " #print(logits)\n", - "\n", - " # Move logits and labels to CPU\n", - " logits = logits.detach().cpu().numpy()\n", - " #print(logits)\n", - "\n", - " # Store predictions and true labels\n", - " predictions_test.append(logits)\n", - "\n", - " #print(' DONE.')\n", - "\n", - " pred_labels = []\n", - " \n", - " for i in range(len(predictions_test)):\n", - "\n", - " # The predictions for this batch are a 2-column ndarray (one column for \"0\"\n", - " # and one column for \"1\"). Pick the label with the highest value and turn this\n", - " # in to a list of 0s and 1s.\n", - " pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()\n", - " pred_labels.append(pred_labels_i)\n", - "\n", - " pred_labels_ += [item for sublist in pred_labels for item in sublist]\n", - " return pred_labels_\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "O9eer_kgI8rC", - "outputId": "94ea7418-14a8-4918-e210-caf0018f5989" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading Bert Tokenizer...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors\n" - ] - } - ], - "source": [ - "data_loader = generate_prediction_dataloader('bert-base-multilingual-cased', data_LGE)\n", - "#data_loader = generate_prediction_dataloader('camembert-base', data_LGE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sFpAwbrBwF2h", - "outputId": "8d210732-619d-41f0-b6e2-ad9d06a85069" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are 1 GPU(s) available.\n", - "We will use the GPU: Tesla P100-PCIE-16GB\n" - ] - } - ], - "source": [ - "p = predict_class_bertFineTuning( model, data_loader )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "51HF6-8UPSTc", - "outputId": "26bff792-eb8d-4e1a-efa4-a7a6c9d32bf9" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "310" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(p)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rFFGhaCvQHfh" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qgJ-O4rcQHiI", - "outputId": "bfe93dd6-4d89-4d5c-be0d-45e1c98c6b14" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "LabelEncoder()" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Il faudrait enregistrer l'encoder, \n", - "# sinon on est obligé de le refaire à partir du jeu d'entrainement pour récupérer le noms des classes.\n", - "encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QuST9wJoQHnS" - }, - "outputs": [], - "source": [ - "p2 = list(encoder.inverse_transform(p))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6ek7suq9QHqE", - "outputId": "6636983a-7eba-48c8-d884-f8fb437294dc" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Chimie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Mathématiques',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Musique',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Histoire naturelle',\n", - " 'Chimie',\n", - " 'Histoire',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Histoire naturelle',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Arts et métiers',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Marine',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Beaux-arts',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Jeu',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Histoire naturelle',\n", - " 'Commerce',\n", - " 'Histoire',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Agriculture - Economie rustique',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Belles-lettres - Poésie',\n", - " 'Beaux-arts',\n", - " 'Religion',\n", - " 'Architecture',\n", - " 'Architecture',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Histoire naturelle',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Minéralogie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Mathématiques',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Blason',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Histoire naturelle',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Agriculture - Economie rustique',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Belles-lettres - Poésie',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Arts et métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Musique',\n", - " 'Médecine - Chirurgie',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Médecine - Chirurgie',\n", - " 'Histoire',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Chimie',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Beaux-arts',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Architecture',\n", - " 'Commerce',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Anatomie',\n", - " 'Commerce',\n", - " 'Beaux-arts',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Commerce',\n", - " 'Architecture',\n", - " 'Commerce',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Philosophie',\n", - " 'Géographie',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie']" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XvdDj5PBQHtk" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t39Xs0j7QHXJ" - }, - "outputs": [], - "source": [ - "df_LGE['class_bert'] = p2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "-VZ7geRmQHaD", - "outputId": "350a4122-5b1f-43e2-e372-2f628f665c4a" - }, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>id</th>\n", - " <th>tome</th>\n", - " <th>rank</th>\n", - " <th>domain</th>\n", - " <th>remark</th>\n", - " <th>content</th>\n", - " <th>class_bert</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>abrabeses-0</td>\n", - " <td>1</td>\n", - " <td>623</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ABRABESES. Village d’Espagne de la prov. de Za...</td>\n", - " <td>Géographie</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>accius-0</td>\n", - " <td>1</td>\n", - " <td>1076</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...</td>\n", - " <td>Géographie</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>achenbach-2</td>\n", - " <td>1</td>\n", - " <td>1357</td>\n", - " <td>biography</td>\n", - " <td>NaN</td>\n", - " <td>ACHENBACH(Henri), administrateur prussien, né ...</td>\n", - " <td>Géographie</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>acireale-0</td>\n", - " <td>1</td>\n", - " <td>1513</td>\n", - " <td>geography</td>\n", - " <td>NaN</td>\n", - " <td>ACIREALE. Yille de Sicile, de la province et d...</td>\n", - " <td>Géographie</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>actée-0</td>\n", - " <td>1</td>\n", - " <td>1731</td>\n", - " <td>botany</td>\n", - " <td>NaN</td>\n", - " <td>ACTÉE(Actœa L.). Genre de plantes de la famill...</td>\n", - " <td>Histoire naturelle</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " id ... class_bert\n", - "0 abrabeses-0 ... Géographie\n", - "1 accius-0 ... Géographie\n", - "2 achenbach-2 ... Géographie\n", - "3 acireale-0 ... Géographie\n", - "4 actée-0 ... Histoire naturelle\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_LGE.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3xkzdkrKQHwA" - }, - "outputs": [], - "source": [ - "df_LGE.to_csv(\"drive/MyDrive/Classification-EDdA/classification_LGE.tsv\", sep=\"\\t\")" - ] } ], "metadata": { diff --git a/notebooks/EDdA_Classification_ClassicModels.ipynb b/notebooks/EDdA_Classification_ClassicModels.ipynb index fcb2ba0913c4ace29c5ba8cdda2ba0b89c1a5931..4a10ee3f243003a3359d927a5ffaf1239eddce0a 100644 --- a/notebooks/EDdA_Classification_ClassicModels.ipynb +++ b/notebooks/EDdA_Classification_ClassicModels.ipynb @@ -1,21 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "EDdA-Classification_ClassicModels.ipynb", - "provenance": [], - "collapsed_sections": [], - "machine_shape": "hm" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", @@ -37,9 +20,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "D_uwiuJq3pAM" }, + "outputs": [], "source": [ "train_path = 'training_set.tsv'\n", "validation_path = 'validation_set.tsv'\n", @@ -67,9 +52,7 @@ "doc2vec_min_count = 12\n", "doc2vec_dm = 0\n", "doc2vec_workers = 8" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -82,6 +65,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -89,6 +73,7 @@ "id": "FsAR4CsB3aUc", "outputId": "a5e4efde-a5c9-45f9-ef1c-9223b4d52ac6" }, + "outputs": [], "source": [ "from psutil import virtual_memory\n", "ram_gb = virtual_memory().total / 1e9\n", @@ -98,22 +83,11 @@ " print('Not using a high-RAM runtime')\n", "else:\n", " print('You are using a high-RAM runtime!')" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Your runtime has 27.3 gigabytes of available RAM\n", - "\n", - "You are using a high-RAM runtime!\n" - ] - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -121,19 +95,10 @@ "id": "h5MwRwL53aYY", "outputId": "bc4c4c16-fb20-404a-e044-550fc4ca907d" }, + "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n" - ] - } ] }, { @@ -147,6 +112,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -154,6 +120,7 @@ "id": "bcptSr6o3ac7", "outputId": "19713482-dfeb-4be3-e63c-35b4253cb9e5" }, + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -181,33 +148,11 @@ "import string\n", "nltk.download('stopwords')\n", "nltk.download('punkt')" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "True" - ] - }, - "metadata": {}, - "execution_count": 3 - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -215,50 +160,9 @@ "id": "dwSVXDtWZB5H", "outputId": "44e2aa14-726f-43af-aa6a-1b7899e1025b" }, + "outputs": [], "source": [ "!python -m spacy download fr_core_news_sm" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting fr_core_news_sm==2.2.5\n", - " Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)\n", - "\u001b[K |████████████████████████████████| 14.7 MB 5.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from fr_core_news_sm==2.2.5) (2.2.4)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.8.2)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (57.4.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.1.3)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.6)\n", - "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.5)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.23.0)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.0)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (0.4.1)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.0.6)\n", - "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (7.4.0)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.62.3)\n", - "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.19.5)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.0.6)\n", - "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (4.8.2)\n", - "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.10.0.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.6.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->fr_core_news_sm==2.2.5) (1.24.3)\n", - "Building wheels for collected packages: fr-core-news-sm\n", - " Building wheel for fr-core-news-sm (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-py3-none-any.whl size=14727026 sha256=994d176b35663506dd047e65863238d29b9b60313ba0dee5997c107f116477aa\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-c8y7i3ag/wheels/c9/a6/ea/0778337c34660027ee67ef3a91fb9d3600b76777a912ea1c24\n", - "Successfully built fr-core-news-sm\n", - "Installing collected packages: fr-core-news-sm\n", - "Successfully installed fr-core-news-sm-2.2.5\n", - "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", - "You can now load the model via spacy.load('fr_core_news_sm')\n" - ] - } ] }, { @@ -272,9 +176,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Tunf_CYi3afO" }, + "outputs": [], "source": [ "def create_dict(df, classColumnName):\n", " return dict(df[classColumnName].value_counts())\n", @@ -328,9 +234,7 @@ " model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs)\n", " return model\n", " #return np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -343,28 +247,28 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ybiJYL0h3ahh" }, + "outputs": [], "source": [ "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "LRKJzWmf3pCg" }, + "outputs": [], "source": [ "df_train = pd.read_csv(train_path, sep=\"\\t\")\n", "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -377,9 +281,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6QQXybaQ3pE9" }, + "outputs": [], "source": [ "data_train = df_train[columnText].tolist()\n", "vectorizer_dic = {}\n", @@ -423,9 +329,7 @@ " pickle.dump(vectorizer, file)\n", " \n", " vectorizer_dic[vectorizer_name] = vectorizer " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -438,9 +342,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "rx_0eV-M3pHc" }, + "outputs": [], "source": [ "classifier_dic = {}\n", "grid_param = {}\n", @@ -460,15 +366,15 @@ " elif classifier_name == \"rfc\":\n", " classifier_dic[classifier_name] = RandomForestClassifier()\n", " grid_param[classifier_name] = { 'max_features': ['sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]}\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "pO7oyeAF7KPK" }, + "outputs": [], "source": [ "for clf_name, clf in classifier_dic.items():\n", " if clf_name != 'bayes' :\n", @@ -488,9 +394,7 @@ " # saving classifier\n", " with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'wb') as file:\n", " pickle.dump(clf, file)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -503,17 +407,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "TfKAjtVFblYe" }, + "outputs": [], "source": [ "dataset_name = [\"validation\", \"test\"]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -521,341 +426,108 @@ "id": "h8vZar8c7KRq", "outputId": "83511c89-9219-43d1-9e5a-820e75012166" }, + "outputs": [], "source": [ "for dataset in dataset_name:\n", " df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n", + " df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True) # supprimer les NaN...\n", " data_eval = df_eval[columnText].tolist()\n", "\n", - " for classifier_name in classifier_list:\n", + " for maxOfInstancePerClass in [500, 1500, 10000]:\n", + " \n", + "\n", + " for classifier_name in classifier_list:\n", "\n", - " for vectorizer_name in vectorizer_list:\n", + " for vectorizer_name in vectorizer_list:\n", "\n", - " clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", - " with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n", - " clf = pickle.load(file)\n", + " clf_file_name = classifier_name + '_' + vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", + " with open(\"drive/MyDrive/Classification-EDdA/\"+clf_file_name, 'rb') as file:\n", + " clf = pickle.load(file)\n", "\n", - " vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", - " with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n", - " vectorizer = pickle.load(file)\n", + " vec_file_name = vectorizer_name + '_s' + str(maxOfInstancePerClass) +\".pkl\"\n", + " with open(\"drive/MyDrive/Classification-EDdA/\"+vec_file_name, 'rb') as file:\n", + " vectorizer = pickle.load(file)\n", "\n", - " if vectorizer_name != 'doc2vec' :\n", - " vec_data = vectorizer.transform(data_eval)\n", - " else : \n", - " tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n", - " vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n", + " if vectorizer_name != 'doc2vec' :\n", + " vec_data = vectorizer.transform(data_eval)\n", + " else : \n", + " tagged_test = [TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(data_eval)]\n", + " vec_data = np.array([vectorizer.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])\n", "\n", "\n", - " y_pred = clf.predict(vec_data)\n", + " y_pred = clf.predict(vec_data)\n", "\n", "\n", - " report = classification_report(y_pred, df_eval[columnClass], output_dict = True)\n", - " precision = []\n", - " recall = []\n", - " f1 = []\n", - " support = []\n", - " dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", - " for c in df_eval[columnClass].unique() :\n", - " precision.append(report[c]['precision'])\n", - " recall.append(report[c]['recall'])\n", - " f1.append(report[c]['f1-score'])\n", - " support.append(report[c]['support'])\n", + " report = classification_report(df_eval[columnClass], y_pred, output_dict = True)\n", + " precision = []\n", + " recall = []\n", + " f1 = []\n", + " support = []\n", + " dff = pd.DataFrame(columns= ['class', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + " for c in df_eval[columnClass].unique() :\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", "\n", - " accuracy = report['accuracy']\n", - " weighted_avg = report['weighted avg']\n", - " cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred)\n", - " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", - " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", - " TP = np.diag(cnf_matrix)\n", - " TN = cnf_matrix.sum() - (FP + FN + TP)\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", + " cnf_matrix = confusion_matrix(df_eval[columnClass], y_pred)\n", + " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + " TP = np.diag(cnf_matrix)\n", + " TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", - " dff['class'] = df_eval[columnClass].unique()\n", - " dff['precision'] = precision\n", - " dff['recall'] = recall\n", - " dff['f1-score'] = f1\n", - " dff['support'] = support\n", - " dff['FP'] = FP\n", - " dff['FN'] = FN\n", - " dff['TP'] = TP\n", - " dff['TN'] = TN\n", + " dff['class'] = df_eval[columnClass].unique()\n", + " dff['precision'] = precision\n", + " dff['recall'] = recall\n", + " dff['f1-score'] = f1\n", + " dff['support'] = support\n", + " dff['FP'] = FP\n", + " dff['FN'] = FN\n", + " dff['TP'] = TP\n", + " dff['TN'] = TN\n", "\n", "\n", - " print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n", + " print(dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass))\n", "\n", - " print(weighted_avg)\n", - " print(accuracy)\n", - " print(dff)\n", + " print(weighted_avg)\n", + " print(accuracy)\n", + " print(dff)\n", "\n", - " dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n", - "\n" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "validation_bayes_bagofwords_s10000\n", - "{'precision': 0.8377945389222964, 'recall': 0.619530464967571, 'f1-score': 0.6842670335331308, 'support': 10947}\n", - "0.619530464967571\n", - " class precision ... TP TN\n", - "0 Droit - Jurisprudence 0.963590 ... 5 10735\n", - "1 Grammaire 0.321888 ... 46 10760\n", - "2 Histoire naturelle 0.938776 ... 55 10665\n", - "3 Commerce 0.310249 ... 42 10679\n", - "4 Géographie 0.958193 ... 0 10839\n", - "5 Architecture 0.158491 ... 0 10863\n", - "6 Monnaie 0.000000 ... 4 10751\n", - "7 Médecine - Chirurgie 0.735981 ... 3 10860\n", - "8 Métiers 0.917495 ... 0 10925\n", - "9 Militaire (Art) - Guerre - Arme 0.182186 ... 1 10845\n", - "10 Anatomie 0.245989 ... 1 10853\n", - "11 Jeu 0.000000 ... 112 10553\n", - "12 Pharmacie 0.000000 ... 1138 9191\n", - "13 Antiquité 0.209125 ... 0 10921\n", - "14 Belles-lettres - Poésie 0.020513 ... 150 10358\n", - "15 Agriculture - Economie rustique 0.023585 ... 2269 8114\n", - "16 Mathématiques 0.142857 ... 357 9728\n", - "17 Beaux-arts 0.000000 ... 874 9278\n", - "18 Physique - [Sciences physico-mathématiques] 0.364372 ... 0 10893\n", - "19 Marine 0.410468 ... 149 10579\n", - "20 Chasse 0.009804 ... 5 10850\n", - "21 Arts et métiers 0.000000 ... 18 10819\n", - "22 Religion 0.526646 ... 0 10912\n", - "23 Blason 0.034483 ... 45 10699\n", - "24 Pêche 0.025641 ... 0 10926\n", - "25 Histoire 0.603041 ... 0 10886\n", - "26 Maréchage - Manège 0.051546 ... 11 10814\n", - "27 Mesure 0.000000 ... 0 10924\n", - "28 Economie domestique 0.000000 ... 315 10264\n", - "29 Philosophie 0.000000 ... 923 8722\n", - "30 Superstition 0.000000 ... 0 10888\n", - "31 Chimie 0.010638 ... 0 10854\n", - "32 Médailles 0.000000 ... 90 10659\n", - "33 Musique 0.082707 ... 0 10925\n", - "34 Caractères 0.000000 ... 1 10908\n", - "35 Spectacle 0.000000 ... 168 10570\n", - "36 Minéralogie 0.000000 ... 0 10938\n", - "37 Politique 0.000000 ... 0 10926\n", - "\n", - "[38 rows x 9 columns]\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "validation_bayes_tf_idf_s10000\n", - "{'precision': 0.9361172330822201, 'recall': 0.48853567187357266, 'f1-score': 0.6289575972884817, 'support': 10947}\n", - "0.48853567187357266\n", - " class precision ... TP TN\n", - "0 Droit - Jurisprudence 0.922100 ... 0 10735\n", - "1 Grammaire 0.000000 ... 7 10760\n", - "2 Histoire naturelle 0.888292 ... 0 10684\n", - "3 Commerce 0.036011 ... 1 10682\n", - "4 Géographie 0.995777 ... 0 10839\n", - "5 Architecture 0.003774 ... 0 10863\n", - "6 Monnaie 0.000000 ... 0 10752\n", - "7 Médecine - Chirurgie 0.221963 ... 0 10860\n", - "8 Métiers 0.903579 ... 0 10925\n", - "9 Militaire (Art) - Guerre - Arme 0.004049 ... 0 10845\n", - "10 Anatomie 0.037433 ... 0 10853\n", - "11 Jeu 0.000000 ... 13 10585\n", - "12 Pharmacie 0.000000 ... 1089 9047\n", - "13 Antiquité 0.000000 ... 0 10921\n", - "14 Belles-lettres - Poésie 0.000000 ... 0 10481\n", - "15 Agriculture - Economie rustique 0.000000 ... 2358 5636\n", - "16 Mathématiques 0.000000 ... 14 10349\n", - "17 Beaux-arts 0.000000 ... 827 9314\n", - "18 Physique - [Sciences physico-mathématiques] 0.004049 ... 0 10893\n", - "19 Marine 0.088154 ... 32 10583\n", - "20 Chasse 0.000000 ... 0 10850\n", - "21 Arts et métiers 0.000000 ... 0 10821\n", - "22 Religion 0.003135 ... 0 10912\n", - "23 Blason 0.000000 ... 1 10700\n", - "24 Pêche 0.000000 ... 0 10926\n", - "25 Histoire 0.023649 ... 0 10886\n", - "26 Maréchage - Manège 0.000000 ... 0 10814\n", - "27 Mesure 0.000000 ... 0 10924\n", - "28 Economie domestique 0.000000 ... 95 10502\n", - "29 Philosophie 0.000000 ... 909 8731\n", - "30 Superstition 0.000000 ... 0 10888\n", - "31 Chimie 0.000000 ... 0 10854\n", - "32 Médailles 0.000000 ... 1 10700\n", - "33 Musique 0.000000 ... 0 10925\n", - "34 Caractères 0.000000 ... 0 10908\n", - "35 Spectacle 0.000000 ... 1 10628\n", - "36 Minéralogie 0.000000 ... 0 10938\n", - "37 Politique 0.000000 ... 0 10926\n", - "\n", - "[38 rows x 9 columns]\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "test_bayes_bagofwords_s10000\n", - "{'precision': 0.8343333806034451, 'recall': 0.6158940397350994, 'f1-score': 0.6801987597575112, 'support': 13137}\n", - "0.6158940397350994\n", - " class precision ... TP TN\n", - "0 Histoire 0.579466 ... 3 12882\n", - "1 Droit - Jurisprudence 0.953423 ... 44 12913\n", - "2 Géographie 0.953906 ... 58 12804\n", - "3 Métiers 0.922949 ... 48 12815\n", - "4 Architecture 0.150943 ... 0 13008\n", - "5 Médecine - Chirurgie 0.744639 ... 0 13037\n", - "6 Mathématiques 0.225166 ... 2 12900\n", - "7 Grammaire 0.305357 ... 4 13032\n", - "8 Monnaie 0.000000 ... 0 13110\n", - "9 Commerce 0.327945 ... 1 13015\n", - "10 Anatomie 0.196429 ... 2 13025\n", - "11 Physique - [Sciences physico-mathématiques] 0.331081 ... 142 12652\n", - "12 Philosophie 0.000000 ... 1351 11028\n", - "13 Belles-lettres - Poésie 0.008511 ... 0 13106\n", - "14 Militaire (Art) - Guerre - Arme 0.199324 ... 171 12399\n", - "15 Antiquité 0.183544 ... 2711 9779\n", - "16 Maréchage - Manège 0.008621 ... 412 11633\n", - "17 Chasse 0.008197 ... 1054 11199\n", - "18 Agriculture - Economie rustique 0.011811 ... 0 13072\n", - "19 Histoire naturelle 0.942755 ... 185 12697\n", - "20 Religion 0.535248 ... 1 13021\n", - "21 Mesure 0.000000 ... 34 12983\n", - "22 Musique 0.062500 ... 0 13095\n", - "23 Arts et métiers 0.000000 ... 59 12838\n", - "24 Marine 0.425287 ... 0 13111\n", - "25 Blason 0.038095 ... 0 13064\n", - "26 Chimie 0.017857 ... 10 12976\n", - "27 Economie domestique 0.000000 ... 0 13109\n", - "28 Beaux-arts 0.000000 ... 382 12312\n", - "29 Jeu 0.000000 ... 1114 10375\n", - "30 Pêche 0.000000 ... 0 13066\n", - "31 Politique 0.000000 ... 0 13025\n", - "32 Minéralogie 0.000000 ... 98 12817\n", - "33 Pharmacie 0.000000 ... 0 13111\n", - "34 Superstition 0.000000 ... 0 13090\n", - "35 Caractères 0.000000 ... 205 12686\n", - "36 Médailles 0.000000 ... 0 13126\n", - "37 Spectacle 0.000000 ... 0 13112\n", - "\n", - "[38 rows x 9 columns]\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "test_bayes_tf_idf_s10000\n", - "{'precision': 0.9374431375624079, 'recall': 0.4883915658065007, 'f1-score': 0.6291194809131295, 'support': 13137}\n", - "0.4883915658065007\n", - " class precision ... TP TN\n", - "0 Histoire 0.018284 ... 0 12883\n", - "1 Droit - Jurisprudence 0.928017 ... 3 12913\n", - "2 Géographie 0.997185 ... 0 12821\n", - "3 Métiers 0.906379 ... 0 12819\n", - "4 Architecture 0.000000 ... 0 13008\n", - "5 Médecine - Chirurgie 0.230019 ... 0 13037\n", - "6 Mathématiques 0.000000 ... 0 12902\n", - "7 Grammaire 0.000000 ... 0 13032\n", - "8 Monnaie 0.000000 ... 0 13110\n", - "9 Commerce 0.036952 ... 0 13015\n", - "10 Anatomie 0.013393 ... 0 13025\n", - "11 Physique - [Sciences physico-mathématiques] 0.003378 ... 16 12701\n", - "12 Philosophie 0.000000 ... 1315 10852\n", - "13 Belles-lettres - Poésie 0.000000 ... 0 13106\n", - "14 Militaire (Art) - Guerre - Arme 0.003378 ... 0 12577\n", - "15 Antiquité 0.000000 ... 2834 6749\n", - "16 Maréchage - Manège 0.000000 ... 13 12422\n", - "17 Chasse 0.000000 ... 978 11227\n", - "18 Agriculture - Economie rustique 0.000000 ... 0 13072\n", - "19 Histoire naturelle 0.874776 ... 42 12702\n", - "20 Religion 0.002611 ... 0 13021\n", - "21 Mesure 0.000000 ... 0 12986\n", - "22 Musique 0.000000 ... 0 13095\n", - "23 Arts et métiers 0.000000 ... 1 12841\n", - "24 Marine 0.096552 ... 0 13111\n", - "25 Blason 0.000000 ... 0 13064\n", - "26 Chimie 0.000000 ... 0 12977\n", - "27 Economie domestique 0.000000 ... 0 13109\n", - "28 Beaux-arts 0.000000 ... 118 12608\n", - "29 Jeu 0.000000 ... 1094 10439\n", - "30 Pêche 0.000000 ... 0 13066\n", - "31 Politique 0.000000 ... 0 13025\n", - "32 Minéralogie 0.000000 ... 1 12840\n", - "33 Pharmacie 0.000000 ... 0 13111\n", - "34 Superstition 0.000000 ... 0 13090\n", - "35 Caractères 0.000000 ... 1 12754\n", - "36 Médailles 0.000000 ... 0 13126\n", - "37 Spectacle 0.000000 ... 0 13112\n", - "\n", - "[38 rows x 9 columns]\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - } + " dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n", + "\n", + " # enregistrer les predictions\n", + " pd.DataFrame({'labels': pd.Series(df_eval[columnClass]), 'predictions': pd.Series(y_pred)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+'_' + vectorizer_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n", + "\n", + " \n" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mMiQo_sR7KWn" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "machine_shape": "hm", + "name": "EDdA-Classification_ClassicModels.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/EDdA_Classification_DeepLearning.ipynb b/notebooks/EDdA_Classification_DeepLearning.ipynb index d8e9ea64dd3f8eb0d5d3fa12bf8f3f9ee8fa4466..4bdd58e6756dc6b72fda0e6378d9986a9323e3c7 100644 --- a/notebooks/EDdA_Classification_DeepLearning.ipynb +++ b/notebooks/EDdA_Classification_DeepLearning.ipynb @@ -1,20 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "EDdA-Classification_DeepLearning.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", @@ -36,9 +20,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "G5LT5n9O7SLt" }, + "outputs": [], "source": [ "train_path = 'training_set.tsv'\n", "validation_path = 'validation_set.tsv'\n", @@ -55,9 +41,7 @@ "max_len = 512 # \n", "epochs = 20\n", "embedding_dim = 300 " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -70,6 +54,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -77,19 +62,18 @@ "id": "Sp8d_Uus7SHJ", "outputId": "82929364-d0a1-4962-fcb4-47224a48e6cf" }, - "source": [ - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" ] }, { @@ -103,15 +87,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bTIXsF6kBUdh" }, + "outputs": [], "source": [ "#!pip install zeugma\n", "#!pip install plot_model" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -124,6 +108,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -131,6 +116,18 @@ "id": "HwWkSznz7SEv", "outputId": "02ecbbf8-556f-4567-b57d-6e13a4ca28ff" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ], "source": [ "from nltk.tokenize import word_tokenize\n", "import nltk\n", @@ -164,19 +161,6 @@ "from tqdm import tqdm\n", "import requests, zipfile, io\n", "import os, re, csv, math, codecs" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Unzipping tokenizers/punkt.zip.\n" - ] - } ] }, { @@ -190,9 +174,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "4LJ5blQR7PUe" }, + "outputs": [], "source": [ "\n", "def resample_classes(df, classColumnName, numberOfInstances):\n", @@ -201,20 +187,16 @@ " fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]\n", " return df.groupby(classColumnName, as_index=False).apply(fn)\n", " \n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "-Rh3JMDh7zYd" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -227,34 +209,35 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FnbNT4NF7zal" }, + "outputs": [], "source": [ "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/validation_set.tsv\n", "!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "WNqDms64lfaS" }, + "outputs": [], "source": [ "# download FastText\n", "zip_file_url = \"https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip\"\n", "r = requests.get(zip_file_url)\n", "z = zipfile.ZipFile(io.BytesIO(r.content))\n", "z.extractall()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -262,73 +245,74 @@ "id": "PGMIi0CAmqSd", "outputId": "09c034fd-f689-43a9-fd75-5923906d89bf" }, - "source": [ - "print('loading word embeddings...')\n", - "\n", - "embeddings_index = {}\n", - "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", - "\n", - "for line in tqdm(f):\n", - " values = line.rstrip().rsplit(' ')\n", - " word = values[0]\n", - " coefs = np.asarray(values[1:], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "f.close()\n", - "\n", - "print('found %s word vectors' % len(embeddings_index))" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "loading word embeddings...\n" ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "1999996it [03:40, 9087.22it/s]" ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "found 1999996 word vectors\n" ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "\n" ] } + ], + "source": [ + "print('loading word embeddings...')\n", + "\n", + "embeddings_index = {}\n", + "f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')\n", + "\n", + "for line in tqdm(f):\n", + " values = line.rstrip().rsplit(' ')\n", + " word = values[0]\n", + " coefs = np.asarray(values[1:], dtype='float32')\n", + " embeddings_index[word] = coefs\n", + "f.close()\n", + "\n", + "print('found %s word vectors' % len(embeddings_index))" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "nRLaQUO97zcq" }, + "outputs": [], "source": [ "df_train = pd.read_csv(train_path, sep=\"\\t\")\n", "df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)\n", "\n", "df_validation = pd.read_csv(validation_path, sep=\"\\t\")\n", "df_validation = resample_classes(df_validation, columnClass, maxOfInstancePerClass)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "vGWAgBH87ze8" }, + "outputs": [], "source": [ "y_train = df_train[columnClass]\n", "y_validation = df_validation[columnClass]\n", @@ -338,12 +322,11 @@ "\n", "y_train = encoder.fit_transform(y_train)\n", "y_validation = encoder.fit_transform(y_validation)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -352,13 +335,8 @@ "id": "7OYjo_uhoqcX", "outputId": "79c4ff25-0476-4e12-d6ff-a8e073ee3f6c" }, - "source": [ - "df_validation.head()" - ], - "execution_count": null, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "<div>\n", @@ -492,9 +470,13 @@ "[5 rows x 13 columns]" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } + ], + "source": [ + "df_validation.head()" ] }, { @@ -508,6 +490,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -515,6 +498,31 @@ "id": "NTNh6kMTp_eU", "outputId": "3c1eb88c-7f1d-48f1-92bc-bc671f5e1bc1" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pre-processing train data...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n", + "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tokenizing input data...\n", + "dictionary size: 95254\n" + ] + } + ], "source": [ "#https://github.com/emmanuellaanggi/disaster_tweet_sentiment/blob/master/(Medium)_Text_Classification_Disaster_Tweet_.ipynb\n", "\n", @@ -551,36 +559,11 @@ "#pad sequences\n", "word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_len)\n", "word_seq_validation = sequence.pad_sequences(word_seq_validation, maxlen=max_len)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "pre-processing train data...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 21129/21129 [00:15<00:00, 1359.31it/s]\n", - "100%|██████████| 10079/10079 [00:07<00:00, 1378.11it/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "tokenizing input data...\n", - "dictionary size: 95254\n" - ] - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -588,13 +571,8 @@ "id": "Wj8RkOhT_e2c", "outputId": "56152da7-47b7-4b07-84e7-8c499671d53e" }, - "source": [ - "word_seq_validation" - ], - "execution_count": null, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "array([[ 0, 0, 0, ..., 293, 8, 7],\n", @@ -606,13 +584,18 @@ " [ 0, 0, 0, ..., 188, 213, 37]], dtype=int32)" ] }, + "execution_count": 12, "metadata": {}, - "execution_count": 12 + "output_type": "execute_result" } + ], + "source": [ + "word_seq_validation" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -620,6 +603,16 @@ "id": "wGjQI0YgpQAS", "outputId": "43a3d902-5a8d-4159-a21e-419b5ee35d7d" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "preparing embedding matrix...\n", + "number of null word embeddings: 70\n" + ] + } + ], "source": [ "#embedding matrix\n", "\n", @@ -639,21 +632,11 @@ " else:\n", " words_not_found.append(word)\n", "print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "preparing embedding matrix...\n", - "number of null word embeddings: 70\n" - ] - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -661,23 +644,23 @@ "id": "hjaeYIZCtGca", "outputId": "5ab4dd1a-a500-479f-e289-892242c83de8" }, - "source": [ - "print(\"sample words not found: \", np.random.choice(words_not_found, 10))" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "sample words not found: ['especes' \"d'argent\" \"d'où\" \"d'argent\" \"qu'elle\" \"qu'elle\" \"c'étoit\"\n", " 'différens' 'faisoit' 'faisoit']\n" ] } + ], + "source": [ + "print(\"sample words not found: \", np.random.choice(words_not_found, 10))" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -685,25 +668,10 @@ "id": "4O0gnsX8pNVU", "outputId": "46feba64-b608-4b53-de15-b586dc24b880" }, - "source": [ - "from keras.layers import BatchNormalization\n", - "import tensorflow as tf\n", - "\n", - "model = tf.keras.Sequential()\n", - "\n", - "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n", - "model.add(Bidirectional(LSTM(100)))\n", - "model.add(Dense(64,activation='relu'))\n", - "model.add(Dropout(0.2))\n", - "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n", - "model.add(Dense(numberOfClasses,activation='softmax'))\n", - "model.summary()" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", @@ -727,25 +695,40 @@ "_________________________________________________________________\n" ] } + ], + "source": [ + "from keras.layers import BatchNormalization\n", + "import tensorflow as tf\n", + "\n", + "model = tf.keras.Sequential()\n", + "\n", + "model.add(Embedding(nb_words,embedding_dim,input_length=max_len, weights=[embedding_matrix],trainable=False))\n", + "model.add(Bidirectional(LSTM(100)))\n", + "model.add(Dense(64,activation='relu'))\n", + "model.add(Dropout(0.2))\n", + "#model.add(Dense(numberOfClasses,activation='sigmoid'))\n", + "model.add(Dense(numberOfClasses,activation='softmax'))\n", + "model.summary()" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "GcfMJl8f-cBA" }, + "outputs": [], "source": [ "\n", "#model = NN_withEmbeddings(longueur_dict, embedding_dim, max_len, numberOfClasses)\n", "\n", "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=True)])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -753,15 +736,10 @@ "id": "OTQTH5VDuA3I", "outputId": "b8286232-4938-4591-b483-6b6d1bdc015e" }, - "source": [ - "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n", - "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Epoch 1/20\n", "83/83 [==============================] - 530s 6s/step - loss: 3.0575 - accuracy: 0.1886 - val_loss: 2.2493 - val_accuracy: 0.4315\n", @@ -806,15 +784,19 @@ ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "<keras.callbacks.History at 0x7f4269526a90>" ] }, + "execution_count": 17, "metadata": {}, - "execution_count": 17 + "output_type": "execute_result" } + ], + "source": [ + "#model.fit(padded, np.array(y_train), epochs=epochs, batch_size = batch_size) \n", + "model.fit(word_seq_train, y_train, batch_size=256, epochs=epochs, validation_data=(word_seq_validation, y_validation), shuffle=True)" ] }, { @@ -828,27 +810,27 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ykTp9lyRaAma" }, + "outputs": [], "source": [ "model.save(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5J4xDoqRUSfS" }, + "outputs": [], "source": [ "# save embeddings\n", "\n", "# saving embeddings index \n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -861,14 +843,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "fKt8ft1t_Cxx" }, + "outputs": [], "source": [ "model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/lstm_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -881,471 +863,150 @@ }, { "cell_type": "code", - "metadata": { - "id": "G9pjdMdNW_KS" - }, - "source": [ - "predictions = model.predict(word_seq_validation)\n", - "predictions = np.argmax(predictions,axis=1)" - ], "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [] }, { "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "IHpVJ79IW_M0", - "outputId": "78e2a1aa-d35c-428c-e6c3-0ad332abcdfd" - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "report = classification_report(predictions, y_validation, output_dict = True)\n", + "from sklearn.metrics import confusion_matrix\n", "\n", - "accuracy = report['accuracy']\n", - "weighted_avg = report['weighted avg']\n", "\n", - "print(accuracy, weighted_avg)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0.5773390217283461 {'precision': 0.5977985581006744, 'recall': 0.5773390217283461, 'f1-score': 0.5808733866443131, 'support': 10079}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9SKjWffUW_PC" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "LpgkGq-fW_RN" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "4gGNaPY1iuXD" - }, - "source": [ - "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", + "for dataset in [\"test\", \"validation\"]:\n", "\n", - "encoder = preprocessing.LabelEncoder()\n", - "y_test = encoder.fit_transform(df_test[columnClass])\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "P67p7BUZiuZV", - "outputId": "f958a063-ee95-4157-fcd9-796991615f03" - }, - "source": [ - "raw_docs_test = df_test[columnText].tolist()\n", + " df_eval = pd.read_csv(dataset+\"_set.tsv\", sep=\"\\t\")\n", + " df_eval = df_eval.dropna(subset=[columnClass]).reset_index(drop=True) # supprimer les NaN...\n", + " \n", + " encoder = preprocessing.LabelEncoder()\n", + " y_test = encoder.fit_transform(df_eval[columnClass])\n", "\n", - "print(\"pre-processing test data...\")\n", "\n", - "stop_words = set(stopwords.words('french'))\n", + " raw_docs_test = df_eval[columnText].tolist()\n", "\n", - "processed_docs_test = []\n", - "for doc in tqdm(raw_docs_test):\n", - " tokens = word_tokenize(doc, language='french')\n", - " filtered = [word for word in tokens if word not in stop_words]\n", - " processed_docs_test.append(\" \".join(filtered))\n", - "#end for\n", + " print(\"pre-processing test data...\")\n", "\n", - "print(\"tokenizing input data...\")\n", - "#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", - "#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", - "word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", + " stop_words = set(stopwords.words('french'))\n", "\n", - "#pad sequences\n", - "word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "pre-processing test data...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 13137/13137 [00:09<00:00, 1317.07it/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "tokenizing input data...\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "czeIqlD5iudH" - }, - "source": [ - "predictions = model.predict(word_seq_test)\n", - "predictions = np.argmax(predictions,axis=1)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Q9eYqi5SW_Ta", - "outputId": "3682a42a-7c07-446e-d913-3d20640fb2bf" - }, - "source": [ - "report = classification_report(predictions, y_test, output_dict = True)\n", + " processed_docs_test = []\n", + " for doc in tqdm(raw_docs_test):\n", + " tokens = word_tokenize(doc, language='french')\n", + " filtered = [word for word in tokens if word not in stop_words]\n", + " processed_docs_test.append(\" \".join(filtered))\n", + " #end for\n", "\n", - "accuracy = report['accuracy']\n", - "weighted_avg = report['weighted avg']\n", + " print(\"tokenizing input data...\")\n", + " #tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)\n", + " #tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky\n", + " word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)\n", "\n", - "print(accuracy, weighted_avg)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0.5957220065463956 {'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ra4FOHVniwUI", - "outputId": "cbe576f6-ce14-49ef-9aba-2d26f76cab92" - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", + " #pad sequences\n", + " word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)\n", "\n", - "classesName = encoder.classes_\n", - "classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + " for maxOfInstancePerClass in [500, 1500, 10000]:\n", + " # il manque le model BERT s500 ...\n", + " \n", + " for classifier_name in [\"lstm\", 'cnn']:\n", "\n", - "precision = []\n", - "recall = []\n", - "f1 = []\n", - "support = []\n", - "dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", - "for c in classes:\n", - " precision.append(report[c]['precision'])\n", - " recall.append(report[c]['recall'])\n", - " f1.append(report[c]['f1-score'])\n", - " support.append(report[c]['support'])\n", + " model = keras.models.load_model(\"drive/MyDrive/Classification-EDdA/\"+classifier_name+\"_fasttext_s\"+str(maxOfInstancePerClass)+\".h5\")\n", "\n", - "accuracy = report['accuracy']\n", - "weighted_avg = report['weighted avg']\n", "\n", + " predictions = model.predict(word_seq_test)\n", + " predictions = np.argmax(predictions,axis=1)\n", "\n", - "cnf_matrix = confusion_matrix(y_test, predictions)\n", - "FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", - "FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", - "TP = np.diag(cnf_matrix)\n", - "TN = cnf_matrix.sum() - (FP + FN + TP)\n", "\n", - "dff['className'] = classesName\n", - "dff['precision'] = precision\n", - "dff['recall'] = recall\n", - "dff['f1-score'] = f1\n", - "dff['support'] = support\n", - "dff['FP'] = FP\n", - "dff['FN'] = FN\n", - "dff['TP'] = TP\n", - "dff['TN'] = TN\n", + " report = classification_report(y_test, predictions, output_dict = True)\n", "\n", - "print(\"test_lstm_s\"+str(maxOfInstancePerClass))\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", "\n", - "print(weighted_avg)\n", - "print(accuracy)\n", - "print(dff)\n", + " print(accuracy, weighted_avg)\n", "\n", - "dff.to_csv(\"drive/MyDrive/Classification-EDdA/report_test_lstm_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "test_lstm_s1500\n", - "{'precision': 0.6075119377257042, 'recall': 0.5957220065463956, 'f1-score': 0.59493432234528, 'support': 13137}\n", - "0.5957220065463956\n", - " className precision ... TP TN\n", - "0 Agriculture - Economie rustique 0.259843 ... 66 12780\n", - "1 Anatomie 0.446429 ... 100 12818\n", - "2 Antiquité 0.525316 ... 166 12425\n", - "3 Architecture 0.518868 ... 165 12597\n", - "4 Arts et métiers 0.007752 ... 1 13002\n", - "5 Beaux-arts 0.020000 ... 2 13016\n", - "6 Belles-lettres - Poésie 0.200000 ... 47 12667\n", - "7 Blason 0.466667 ... 49 12908\n", - "8 Caractères 0.074074 ... 2 13110\n", - "9 Chasse 0.262295 ... 32 12929\n", - "10 Chimie 0.348214 ... 39 12952\n", - "11 Commerce 0.524249 ... 227 12442\n", - "12 Droit - Jurisprudence 0.750176 ... 1063 11473\n", - "13 Economie domestique 0.000000 ... 0 13106\n", - "14 Grammaire 0.587500 ... 329 12094\n", - "15 Géographie 0.830753 ... 2361 10167\n", - "16 Histoire 0.459916 ... 327 11749\n", - "17 Histoire naturelle 0.687835 ... 769 11871\n", - "18 Jeu 0.415385 ... 27 13034\n", - "19 Marine 0.708046 ... 308 12497\n", - "20 Maréchage - Manège 0.784483 ... 91 12991\n", - "21 Mathématiques 0.450331 ... 68 12922\n", - "22 Mesure 0.333333 ... 14 13078\n", - "23 Militaire (Art) - Guerre - Arme 0.510135 ... 151 12719\n", - "24 Minéralogie 0.000000 ... 0 13111\n", - "25 Monnaie 0.041096 ... 3 13057\n", - "26 Musique 0.525000 ... 84 12922\n", - "27 Médailles 0.000000 ... 0 13109\n", - "28 Médecine - Chirurgie 0.584795 ... 300 12279\n", - "29 Métiers 0.592378 ... 715 11248\n", - "30 Pharmacie 0.014085 ... 1 13065\n", - "31 Philosophie 0.160714 ... 18 12934\n", - "32 Physique - [Sciences physico-mathématiques] 0.533784 ... 158 12690\n", - "33 Politique 0.000000 ... 0 13111\n", - "34 Pêche 0.127660 ... 6 13067\n", - "35 Religion 0.357702 ... 137 12580\n", - "36 Spectacle 0.000000 ... 0 13126\n", - "37 Superstition 0.000000 ... 0 13112\n", - "\n", - "[38 rows x 9 columns]\n" - ] - } + " classesName = encoder.classes_\n", + " classes = [str(e) for e in encoder.transform(encoder.classes_)]\n", + "\n", + " precision = []\n", + " recall = []\n", + " f1 = []\n", + " support = []\n", + " dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])\n", + " for c in classes:\n", + " precision.append(report[c]['precision'])\n", + " recall.append(report[c]['recall'])\n", + " f1.append(report[c]['f1-score'])\n", + " support.append(report[c]['support'])\n", + "\n", + " accuracy = report['accuracy']\n", + " weighted_avg = report['weighted avg']\n", + "\n", + "\n", + " cnf_matrix = confusion_matrix(y_test, predictions)\n", + " FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)\n", + " FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)\n", + " TP = np.diag(cnf_matrix)\n", + " TN = cnf_matrix.sum() - (FP + FN + TP)\n", + "\n", + " dff['className'] = classesName\n", + " dff['precision'] = precision\n", + " dff['recall'] = recall\n", + " dff['f1-score'] = f1\n", + " dff['support'] = support\n", + " dff['FP'] = FP\n", + " dff['FN'] = FN\n", + " dff['TP'] = TP\n", + " dff['TN'] = TN\n", + "\n", + " print(dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass))\n", + "\n", + " print(weighted_avg)\n", + " print(accuracy)\n", + " print(dff)\n", + "\n", + " dff.to_csv(\"drive/MyDrive/Classification-EDdA/reports/report_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\", index=False)\n", + " # enregistrer les predictions\n", + " pd.DataFrame({'labels': pd.Series(y_test), 'predictions': pd.Series(predictions)}).to_csv(\"drive/MyDrive/Classification-EDdA/predictions/predictions_\"+dataset+\"_\"+classifier_name+\"_s\"+str(maxOfInstancePerClass)+\".csv\")\n" ] }, { "cell_type": "code", - "metadata": { - "id": "x03FC0D-iwWP" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gSVqcywgiwYH" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "-T5LfFtwiwaV" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Yjd5c70_iwcY" - }, - "source": [ - "" - ], "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [] }, { "cell_type": "code", - "metadata": { - "id": "2UNjiHYliwes" - }, - "source": [ - "" - ], "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [] }, { "cell_type": "code", - "metadata": { - "id": "vLGTnit_W_V8" - }, - "source": [ - "" - ], "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "R-3lBXjDD9wE" - }, - "source": [ - "def predict(data, max_len):\n", - " \n", - " pad_sequ_test, _ = prepare_sequence(data, max_len)\n", - " pred_labels_ = model.predict(pad_sequ_test)\n", - "\n", - " return np.argmax(pred_labels_,axis=1)\n", - "\n", - "\n", - "def eval(data, labels, max_len):\n", - " \n", - " pred_labels_ = predict(data, max_len)\n", - " report = classification_report(pred_labels_, labels, output_dict = True)\n", - "\n", - " accuracy = report['accuracy']\n", - " weighted_avg = report['weighted avg']\n", - " \n", - " print(accuracy, weighted_avg)" - ], - "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "EDdA-Classification_DeepLearning.ipynb", + "provenance": [] }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6T3kAvKvExgc", - "outputId": "c6d4560e-fc64-4579-9adb-79c2e36d2386" - }, - "source": [ - "# evaluation sur le jeu de validation\n", - "eval(df_validation[columnText], y_validation, max_len)" - ], - "execution_count": null, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return np.array(self.texts_to_sequences(texts))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}\n" - ] - } - ] + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pTDJA03_-8yu", - "outputId": "d8bcdf73-c4c3-4c88-b063-90bd1cad5122" - }, - "source": [ - "# evaluation sur le jeu de test\n", - "df_test = pd.read_csv(test_path, sep=\"\\t\")\n", - "#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)\n", - "\n", - "y_test = df_test[columnClass]\n", - "encoder = preprocessing.LabelEncoder()\n", - "y_test = encoder.fit_transform(y_test)\n", - "\n", - "eval(df_test[columnText], y_test, max_len)\n" - ], - "execution_count": null, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/zeugma/keras_transformers.py:33: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return np.array(self.texts_to_sequences(texts))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}\n" - ] - } - ] + "language_info": { + "name": "python" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +}