From c55caa844b8c7d83a2ec3224b0319f6aba4abdce Mon Sep 17 00:00:00 2001
From: Ludovic Moncla <moncla.ludovic@gmail.com>
Date: Fri, 9 Dec 2022 20:59:39 +0100
Subject: [PATCH] Delete Predictions_analysis.ipynb

---
 notebooks/Predictions_analysis.ipynb | 470 ---------------------------
 1 file changed, 470 deletions(-)
 delete mode 100644 notebooks/Predictions_analysis.ipynb

diff --git a/notebooks/Predictions_analysis.ipynb b/notebooks/Predictions_analysis.ipynb
deleted file mode 100644
index a126b03..0000000
--- a/notebooks/Predictions_analysis.ipynb
+++ /dev/null
@@ -1,470 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "M-41ZfqIHyi2"
-      },
-      "outputs": [],
-      "source": [
-        "import pandas as pd\n",
-        "import csv"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "gVaa01O5IQke",
-        "outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea"
-      },
-      "outputs": [],
-      "source": [
-        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "dYVLgduMIQm4",
-        "outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a"
-      },
-      "outputs": [],
-      "source": [
-        "df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n",
-        "\n",
-        "df.shape"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 479
-        },
-        "id": "Bp50IA0qIQpf",
-        "outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad"
-      },
-      "outputs": [],
-      "source": [
-        "df.head()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 1000
-        },
-        "id": "3Obah84eIQrm",
-        "outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51"
-      },
-      "outputs": [],
-      "source": [
-        "\n",
-        "# articles dont la première prédiction correspond à la vérité terrain (\n",
-        "df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 887
-        },
-        "id": "8eadTEGmJ2BK",
-        "outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1"
-      },
-      "outputs": [],
-      "source": [
-        "# articles dont la deuxième classe correspond à la vérité terrain (839)\n",
-        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 939
-        },
-        "id": "W9PzX5DUKbwO",
-        "outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4"
-      },
-      "outputs": [],
-      "source": [
-        "# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n",
-        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 1000
-        },
-        "id": "NLcWPZlQK9BM",
-        "outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da"
-      },
-      "outputs": [],
-      "source": [
-        "# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n",
-        "\n",
-        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Aq7hmUshMhPh"
-      },
-      "outputs": [],
-      "source": []
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 479
-        },
-        "id": "wRv1Nv5-ztyK",
-        "outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1"
-      },
-      "outputs": [],
-      "source": [
-        "df.head()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "PYH0M0nddL34"
-      },
-      "source": [
-        "## Word frequency"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "RmOViUd-zwe8",
-        "outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f"
-      },
-      "outputs": [],
-      "source": [
-        "# Liste des ensembles de domaines ENCCRE (classes)\n",
-        "df.ensemble_domaine_enccre.unique()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "H9YxH28xxMGf"
-      },
-      "outputs": [],
-      "source": [
-        "lst_domaines = sorted(df.ensemble_domaine_enccre.unique())"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "26h8-7P-xMI7"
-      },
-      "outputs": [],
-      "source": [
-        "# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n",
-        "def wordListToFreqDict(wordlist):\n",
-        "    wordfreq = [wordlist.count(p) for p in wordlist]\n",
-        "    return dict(list(zip(wordlist,wordfreq)))\n",
-        "\n",
-        "def sortFreqDict(freqdict):\n",
-        "    aux = [(freqdict[key], key) for key in freqdict]\n",
-        "    aux.sort()\n",
-        "    aux.reverse()\n",
-        "    return aux"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "9aNEjYtExMN5"
-      },
-      "outputs": [],
-      "source": [
-        "d = {}\n",
-        "for domaine in lst_domaines:\n",
-        "  l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n",
-        "  print(domaine)\n",
-        "  d[domaine] = sortFreqDict(wordListToFreqDict(l_text))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "1yYHcDjUY9HG",
-        "outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35"
-      },
-      "outputs": [],
-      "source": [
-        "d['Géographie']"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "uchRyb2gqnk0"
-      },
-      "outputs": [],
-      "source": [
-        "path = \"drive/MyDrive/Classification-EDdA/\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "oNvSxYxmpqed"
-      },
-      "outputs": [],
-      "source": [
-        "# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot\n",
-        "for domaine, wordFreq in d.items():\n",
-        "\n",
-        "  with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n",
-        "      csv_out=csv.writer(file)\n",
-        "      csv_out.writerow(['frequency','word'])\n",
-        "      csv_out.writerows(wordFreq)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "tTz8JcdhdHNw"
-      },
-      "source": [
-        "## Wordclouds"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "C2NM2ayE9jcR"
-      },
-      "outputs": [],
-      "source": [
-        "from wordcloud import WordCloud"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 1000
-        },
-        "id": "EITJqnZ5ecE8",
-        "outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27"
-      },
-      "outputs": [],
-      "source": [
-        "\n",
-        "lst_clouds = []\n",
-        "cpt = 1\n",
-        "n_cols = 4\n",
-        "n_rows = 10\n",
-        "\n",
-        "plt.figure(figsize=(30,50))\n",
-        "\n",
-        "for domaine in lst_domaines:\n",
-        "    plt.subplot(n_rows, n_cols, cpt)\n",
-        "    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n",
-        "    cloud_i = WordCloud(width=1080, height=720, background_color='white',\n",
-        "                        collocations=False, colormap='Set2',\n",
-        "                        max_words = 100, random_state = 42\n",
-        "                       ).generate(\" \".join(text))\n",
-        "    \n",
-        "    # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n",
-        "\n",
-        "    plt.axis('off')\n",
-        "    plt.title(domaine,fontsize=10)\n",
-        "    plt.imshow(cloud_i)\n",
-        "\n",
-        "    cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n",
-        "    cpt += 1\n",
-        "\n",
-        "    lst_clouds.append(cloud_i)\n",
-        "\n",
-        "plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n",
-        "plt.show()\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "rVFvp3owZDPq"
-      },
-      "outputs": [],
-      "source": []
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "2EIA9mV_ecH8"
-      },
-      "outputs": [],
-      "source": [
-        "# Récupération des mots en communs\n",
-        "m = []\n",
-        "for d1 in lst_clouds :\n",
-        "  m2 = []\n",
-        "  for d2 in lst_clouds :\n",
-        "\n",
-        "    lst_1 = d1.words_.keys()\n",
-        "    lst_2 = d2.words_.keys()\n",
-        "\n",
-        "    lst_text = [i for i in lst_1 if i in lst_2]\n",
-        "    m2.append(len(lst_text))\n",
-        "  m.append(m2)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 874
-        },
-        "id": "p-D4rLn1TCMV",
-        "outputId": "6dc3c33a-733d-466b-c134-9060e9261973"
-      },
-      "outputs": [],
-      "source": [
-        "import matplotlib.pyplot as plt\n",
-        "import seaborn as sns\n",
-        "\n",
-        "plt.figure(figsize=(16,13))\n",
-        "\n",
-        "ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n",
-        "\n",
-        "plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "ZSnHC2IoTqO5",
-        "outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0"
-      },
-      "outputs": [],
-      "source": [
-        "# nombre de mots en commun entre Arts et Métier et Métiers :\n",
-        "\n",
-        "\n",
-        "# 4 et 29\n",
-        "\n",
-        "lst_1 = lst_clouds[4].words_.keys()\n",
-        "lst_2 = lst_clouds[29].words_.keys()\n",
-        "\n",
-        "lst_text = [i for i in lst_1 if i in lst_2]\n",
-        "len(lst_text)\n",
-        "\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "niAw2OF0bWMi",
-        "outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf"
-      },
-      "outputs": [],
-      "source": [
-        "# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n",
-        "lst_1 = lst_clouds[4].words_.keys()\n",
-        "lst_2 = lst_clouds[29].words_.keys()\n",
-        "\n",
-        "lst_text = [i for i in lst_1 if i not in lst_2]\n",
-        "lst_text"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "oYT479rsyVvq"
-      },
-      "outputs": [],
-      "source": []
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [],
-      "name": "EDdA-Classification_Analyses_predictions_proba.ipynb",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python",
-      "version": "3.9.13"
-    },
-    "vscode": {
-      "interpreter": {
-        "hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7"
-      }
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
-- 
GitLab