Create Predictions_analysis_bckp.ipynb

f5242db6 · Ludovic Moncla · c55caa84 · f5242db6
Commit f5242db6 authored 2 years ago by Ludovic Moncla
--- a/notebooks/Predictions_analysis_bckp.ipynb
+++ b/notebooks/Predictions_analysis_bckp.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "M-41ZfqIHyi2"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "import csv"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "gVaa01O5IQke",
+        "outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea"
+      },
+      "outputs": [],
+      "source": [
+        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "dYVLgduMIQm4",
+        "outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a"
+      },
+      "outputs": [],
+      "source": [
+        "df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n",
+        "\n",
+        "df.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 479
+        },
+        "id": "Bp50IA0qIQpf",
+        "outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad"
+      },
+      "outputs": [],
+      "source": [
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "id": "3Obah84eIQrm",
+        "outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "# articles dont la première prédiction correspond à la vérité terrain\n",
+        "df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 887
+        },
+        "id": "8eadTEGmJ2BK",
+        "outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1"
+      },
+      "outputs": [],
+      "source": [
+        "# articles dont la deuxième classe correspond à la vérité terrain (839)\n",
+        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 939
+        },
+        "id": "W9PzX5DUKbwO",
+        "outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4"
+      },
+      "outputs": [],
+      "source": [
+        "# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n",
+        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "id": "NLcWPZlQK9BM",
+        "outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da"
+      },
+      "outputs": [],
+      "source": [
+        "# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n",
+        "\n",
+        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Aq7hmUshMhPh"
+      },
+      "outputs": [],
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 479
+        },
+        "id": "wRv1Nv5-ztyK",
+        "outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1"
+      },
+      "outputs": [],
+      "source": [
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "PYH0M0nddL34"
+      },
+      "source": [
+        "## Word frequency"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "RmOViUd-zwe8",
+        "outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f"
+      },
+      "outputs": [],
+      "source": [
+        "# Liste des ensembles de domaines ENCCRE (classes)\n",
+        "df.ensemble_domaine_enccre.unique()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "H9YxH28xxMGf"
+      },
+      "outputs": [],
+      "source": [
+        "lst_domaines = sorted(df.ensemble_domaine_enccre.unique())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "26h8-7P-xMI7"
+      },
+      "outputs": [],
+      "source": [
+        "# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n",
+        "def wordListToFreqDict(wordlist):\n",
+        "    wordfreq = [wordlist.count(p) for p in wordlist]\n",
+        "    return dict(list(zip(wordlist,wordfreq)))\n",
+        "\n",
+        "def sortFreqDict(freqdict):\n",
+        "    aux = [(freqdict[key], key) for key in freqdict]\n",
+        "    aux.sort()\n",
+        "    aux.reverse()\n",
+        "    return aux"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9aNEjYtExMN5"
+      },
+      "outputs": [],
+      "source": [
+        "d = {}\n",
+        "for domaine in lst_domaines:\n",
+        "  l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n",
+        "  print(domaine)\n",
+        "  d[domaine] = sortFreqDict(wordListToFreqDict(l_text))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1yYHcDjUY9HG",
+        "outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35"
+      },
+      "outputs": [],
+      "source": [
+        "d['Géographie']"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "uchRyb2gqnk0"
+      },
+      "outputs": [],
+      "source": [
+        "path = \"drive/MyDrive/Classification-EDdA/\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oNvSxYxmpqed"
+      },
+      "outputs": [],
+      "source": [
+        "# on crée un fichier csv pour chaque domaine avec la fréquence de chaque mot\n",
+        "for domaine, wordFreq in d.items():\n",
+        "\n",
+        "  with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n",
+        "      csv_out=csv.writer(file)\n",
+        "      csv_out.writerow(['frequency','word'])\n",
+        "      csv_out.writerows(wordFreq)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "tTz8JcdhdHNw"
+      },
+      "source": [
+        "## Wordclouds"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "C2NM2ayE9jcR"
+      },
+      "outputs": [],
+      "source": [
+        "from wordcloud import WordCloud"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "id": "EITJqnZ5ecE8",
+        "outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "lst_clouds = []\n",
+        "cpt = 1\n",
+        "n_cols = 4\n",
+        "n_rows = 10\n",
+        "\n",
+        "plt.figure(figsize=(30,50))\n",
+        "\n",
+        "for domaine in lst_domaines:\n",
+        "    plt.subplot(n_rows, n_cols, cpt)\n",
+        "    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n",
+        "    cloud_i = WordCloud(width=1080, height=720, background_color='white',\n",
+        "                        collocations=False, colormap='Set2',\n",
+        "                        max_words = 100, random_state = 42\n",
+        "                       ).generate(\" \".join(text))\n",
+        "    \n",
+        "    # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n",
+        "\n",
+        "    plt.axis('off')\n",
+        "    plt.title(domaine,fontsize=10)\n",
+        "    plt.imshow(cloud_i)\n",
+        "\n",
+        "    cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n",
+        "    cpt += 1\n",
+        "\n",
+        "    lst_clouds.append(cloud_i)\n",
+        "\n",
+        "plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n",
+        "plt.show()\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rVFvp3owZDPq"
+      },
+      "outputs": [],
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2EIA9mV_ecH8"
+      },
+      "outputs": [],
+      "source": [
+        "# Récupération des mots en commun\n",
+        "m = []\n",
+        "for d1 in lst_clouds :\n",
+        "  m2 = []\n",
+        "  for d2 in lst_clouds :\n",
+        "\n",
+        "    lst_1 = d1.words_.keys()\n",
+        "    lst_2 = d2.words_.keys()\n",
+        "\n",
+        "    lst_text = [i for i in lst_1 if i in lst_2]\n",
+        "    m2.append(len(lst_text))\n",
+        "  m.append(m2)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 874
+        },
+        "id": "p-D4rLn1TCMV",
+        "outputId": "6dc3c33a-733d-466b-c134-9060e9261973"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns\n",
+        "\n",
+        "plt.figure(figsize=(16,13))\n",
+        "\n",
+        "ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n",
+        "\n",
+        "plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ZSnHC2IoTqO5",
+        "outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0"
+      },
+      "outputs": [],
+      "source": [
+        "# nombre de mots en commun entre Arts et Métier et Métiers :\n",
+        "\n",
+        "\n",
+        "# 4 et 29\n",
+        "\n",
+        "lst_1 = lst_clouds[4].words_.keys()\n",
+        "lst_2 = lst_clouds[29].words_.keys()\n",
+        "\n",
+        "lst_text = [i for i in lst_1 if i in lst_2]\n",
+        "len(lst_text)\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "niAw2OF0bWMi",
+        "outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf"
+      },
+      "outputs": [],
+      "source": [
+        "# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n",
+        "lst_1 = lst_clouds[4].words_.keys()\n",
+        "lst_2 = lst_clouds[29].words_.keys()\n",
+        "\n",
+        "lst_text = [i for i in lst_1 if i not in lst_2]\n",
+        "lst_text"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oYT479rsyVvq"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "EDdA-Classification_Analyses_predictions_proba.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.9.13"
+    },
+    "vscode": {
+      "interpreter": {
+        "hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7"
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+import csv
+```
+
+%% Cell type:code id: tags:
+
+``` python
+!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df = pd.read_csv("dataset_test_predictions_sgd_tfidf.csv")  # file downloaded by the wget cell
+
+df.shape  # (number of rows, number of columns)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df.head()  # preview the first rows of the predictions table
+```
+
+%% Cell type:code id: tags:
+
+``` python
+
+# articles whose first prediction matches the ground truth
+df[df["ensemble_domaine_enccre"] == df["predict1"]]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# articles whose second predicted class (and not the first) matches the ground truth (839)
+df[(df["ensemble_domaine_enccre"] != df["predict1"]) & (df["ensemble_domaine_enccre"] == df["predict2"])]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# articles for which neither the first nor the second predicted class matches the ground truth (740)
+df[(df["ensemble_domaine_enccre"] != df["predict1"]) & (df["ensemble_domaine_enccre"] != df["predict2"])]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Géographie articles whose highest-probability prediction is not Géographie (only the second prediction is Géographie) -> 44
+
+df[(df["ensemble_domaine_enccre"] != df["predict1"]) & (df["ensemble_domaine_enccre"] == df["predict2"]) & (df["ensemble_domaine_enccre"] == "Géographie")]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df.head()  # preview the first rows again before the word-frequency analysis
+```
+
+%% Cell type:markdown id: tags:
+
+## Word frequency
+
+%% Cell type:code id: tags:
+
+``` python
+# List of the ENCCRE domain sets (classes)
+df.ensemble_domaine_enccre.unique()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+lst_domaines = sorted(df.ensemble_domaine_enccre.unique())  # distinct class labels, sorted
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre
+def wordListToFreqDict(wordlist):
+    wordfreq = [wordlist.count(p) for p in wordlist]
+    return dict(list(zip(wordlist,wordfreq)))
+
+def sortFreqDict(freqdict):
+    aux = [(freqdict[key], key) for key in freqdict]
+    aux.sort()
+    aux.reverse()
+    return aux
+```
+
+%% Cell type:code id: tags:
+
+``` python
+d = {}
+for domaine in lst_domaines:
+  l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]
+  print(domaine)
+  d[domaine] = sortFreqDict(wordListToFreqDict(l_text))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+d['Géographie']  # (frequency, word) pairs for the Géographie class, most frequent first
+```
+
+%% Cell type:code id: tags:
+
+``` python
+path = "drive/MyDrive/Classification-EDdA/"  # output root (relative path, trailing slash included)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot
+for domaine, wordFreq in d.items():
+
+  with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:
+      csv_out=csv.writer(file)
+      csv_out.writerow(['frequency','word'])
+      csv_out.writerows(wordFreq)
+```
+
+%% Cell type:markdown id: tags:
+
+## Wordclouds
+
+%% Cell type:code id: tags:
+
+``` python
+from wordcloud import WordCloud
+```
+
+%% Cell type:code id: tags:
+
+``` python
+
+lst_clouds = []
+cpt = 1
+n_cols = 4
+n_rows = 10
+
+plt.figure(figsize=(30,50))
+
+for domaine in lst_domaines:
+    plt.subplot(n_rows, n_cols, cpt)
+    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values
+    cloud_i = WordCloud(width=1080, height=720, background_color='white',
+                        collocations=False, colormap='Set2',
+                        max_words = 100, random_state = 42
+                       ).generate(" ".join(text))
+
+    # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html
+
+    plt.axis('off')
+    plt.title(domaine,fontsize=10)
+    plt.imshow(cloud_i)
+
+    cloud_i.to_file(path+"/Wordclouds/Wordclouds_"+domaine.split(" ")[0]+".png")
+    cpt += 1
+
+    lst_clouds.append(cloud_i)
+
+plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')
+plt.show()
+
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Récupération des mots en communs
+m = []
+for d1 in lst_clouds :
+  m2 = []
+  for d2 in lst_clouds :
+
+    lst_1 = d1.words_.keys()
+    lst_2 = d2.words_.keys()
+
+    lst_text = [i for i in lst_1 if i in lst_2]
+    m2.append(len(lst_text))
+  m.append(m2)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+plt.figure(figsize=(16,13))  # canvas for the domain-by-domain matrix
+
+ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')  # m: shared-word counts between domain word clouds
+
+plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')  # saved relative to the current working directory
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# number of words shared by the "Arts et Métier" and "Métiers" word clouds:
+
+
+# 4 and 29 — presumably the positions of those two domains in lst_domaines; verify if the class list changes
+
+lst_1 = lst_clouds[4].words_.keys()
+lst_2 = lst_clouds[29].words_.keys()
+
+lst_text = [i for i in lst_1 if i in lst_2]
+len(lst_text)
+
+
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# words of the "Arts et métier" cloud that are not among the 100 most frequent words of "Métiers"
+lst_1 = lst_clouds[4].words_.keys()
+lst_2 = lst_clouds[29].words_.keys()
+
+lst_text = [i for i in lst_1 if i not in lst_2]
+lst_text
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```