{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "M-41ZfqIHyi2"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import csv"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gVaa01O5IQke",
        "outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea"
      },
      "outputs": [],
      "source": [
        "!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dYVLgduMIQm4",
        "outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a"
      },
      "outputs": [],
      "source": [
        "df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n",
        "\n",
        "df.shape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 479
        },
        "id": "Bp50IA0qIQpf",
        "outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad"
      },
      "outputs": [],
      "source": [
        "df.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "3Obah84eIQrm",
        "outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51"
      },
      "outputs": [],
      "source": [
        "\n",
        "# articles dont la première prédiction correspond à la vérité terrain (\n",
        "df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 887
        },
        "id": "8eadTEGmJ2BK",
        "outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1"
      },
      "outputs": [],
      "source": [
        "# articles dont la deuxième classe correspond à la vérité terrain (839)\n",
        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 939
        },
        "id": "W9PzX5DUKbwO",
        "outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4"
      },
      "outputs": [],
      "source": [
        "# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n",
        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "NLcWPZlQK9BM",
        "outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da"
      },
      "outputs": [],
      "source": [
        "# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n",
        "\n",
        "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Aq7hmUshMhPh"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 479
        },
        "id": "wRv1Nv5-ztyK",
        "outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1"
      },
      "outputs": [],
      "source": [
        "df.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "PYH0M0nddL34"
      },
      "source": [
        "## Word frequency"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RmOViUd-zwe8",
        "outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f"
      },
      "outputs": [],
      "source": [
        "# Liste des ensembles de domaines ENCCRE (classes)\n",
        "df.ensemble_domaine_enccre.unique()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "H9YxH28xxMGf"
      },
      "outputs": [],
      "source": [
        "lst_domaines = sorted(df.ensemble_domaine_enccre.unique())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "26h8-7P-xMI7"
      },
      "outputs": [],
      "source": [
        "# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n",
        "def wordListToFreqDict(wordlist):\n",
        "    wordfreq = [wordlist.count(p) for p in wordlist]\n",
        "    return dict(list(zip(wordlist,wordfreq)))\n",
        "\n",
        "def sortFreqDict(freqdict):\n",
        "    aux = [(freqdict[key], key) for key in freqdict]\n",
        "    aux.sort()\n",
        "    aux.reverse()\n",
        "    return aux"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9aNEjYtExMN5"
      },
      "outputs": [],
      "source": [
        "d = {}\n",
        "for domaine in lst_domaines:\n",
        "  l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n",
        "  print(domaine)\n",
        "  d[domaine] = sortFreqDict(wordListToFreqDict(l_text))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1yYHcDjUY9HG",
        "outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35"
      },
      "outputs": [],
      "source": [
        "d['Géographie']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uchRyb2gqnk0"
      },
      "outputs": [],
      "source": [
        "path = \"drive/MyDrive/Classification-EDdA/\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oNvSxYxmpqed"
      },
      "outputs": [],
      "source": [
        "# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot\n",
        "for domaine, wordFreq in d.items():\n",
        "\n",
        "  with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n",
        "      csv_out=csv.writer(file)\n",
        "      csv_out.writerow(['frequency','word'])\n",
        "      csv_out.writerows(wordFreq)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tTz8JcdhdHNw"
      },
      "source": [
        "## Wordclouds"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "C2NM2ayE9jcR"
      },
      "outputs": [],
      "source": [
        "from wordcloud import WordCloud"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "EITJqnZ5ecE8",
        "outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27"
      },
      "outputs": [],
      "source": [
        "\n",
        "lst_clouds = []\n",
        "cpt = 1\n",
        "n_cols = 4\n",
        "n_rows = 10\n",
        "\n",
        "plt.figure(figsize=(30,50))\n",
        "\n",
        "for domaine in lst_domaines:\n",
        "    plt.subplot(n_rows, n_cols, cpt)\n",
        "    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n",
        "    cloud_i = WordCloud(width=1080, height=720, background_color='white',\n",
        "                        collocations=False, colormap='Set2',\n",
        "                        max_words = 100, random_state = 42\n",
        "                       ).generate(\" \".join(text))\n",
        "    \n",
        "    # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n",
        "\n",
        "    plt.axis('off')\n",
        "    plt.title(domaine,fontsize=10)\n",
        "    plt.imshow(cloud_i)\n",
        "\n",
        "    cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n",
        "    cpt += 1\n",
        "\n",
        "    lst_clouds.append(cloud_i)\n",
        "\n",
        "plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n",
        "plt.show()\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rVFvp3owZDPq"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2EIA9mV_ecH8"
      },
      "outputs": [],
      "source": [
        "# Récupération des mots en communs\n",
        "m = []\n",
        "for d1 in lst_clouds :\n",
        "  m2 = []\n",
        "  for d2 in lst_clouds :\n",
        "\n",
        "    lst_1 = d1.words_.keys()\n",
        "    lst_2 = d2.words_.keys()\n",
        "\n",
        "    lst_text = [i for i in lst_1 if i in lst_2]\n",
        "    m2.append(len(lst_text))\n",
        "  m.append(m2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 874
        },
        "id": "p-D4rLn1TCMV",
        "outputId": "6dc3c33a-733d-466b-c134-9060e9261973"
      },
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "\n",
        "plt.figure(figsize=(16,13))\n",
        "\n",
        "ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n",
        "\n",
        "plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZSnHC2IoTqO5",
        "outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0"
      },
      "outputs": [],
      "source": [
        "# nombre de mots en commun entre Arts et Métier et Métiers :\n",
        "\n",
        "\n",
        "# 4 et 29\n",
        "\n",
        "lst_1 = lst_clouds[4].words_.keys()\n",
        "lst_2 = lst_clouds[29].words_.keys()\n",
        "\n",
        "lst_text = [i for i in lst_1 if i in lst_2]\n",
        "len(lst_text)\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "niAw2OF0bWMi",
        "outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf"
      },
      "outputs": [],
      "source": [
        "# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n",
        "lst_1 = lst_clouds[4].words_.keys()\n",
        "lst_2 = lst_clouds[29].words_.keys()\n",
        "\n",
        "lst_text = [i for i in lst_1 if i not in lst_2]\n",
        "lst_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oYT479rsyVvq"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "EDdA-Classification_Analyses_predictions_proba.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.9.13"
    },
    "vscode": {
      "interpreter": {
        "hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}