From c55caa844b8c7d83a2ec3224b0319f6aba4abdce Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Fri, 9 Dec 2022 20:59:39 +0100 Subject: [PATCH] Delete Predictions_analysis.ipynb --- notebooks/Predictions_analysis.ipynb | 470 --------------------------- 1 file changed, 470 deletions(-) delete mode 100644 notebooks/Predictions_analysis.ipynb diff --git a/notebooks/Predictions_analysis.ipynb b/notebooks/Predictions_analysis.ipynb deleted file mode 100644 index a126b03..0000000 --- a/notebooks/Predictions_analysis.ipynb +++ /dev/null @@ -1,470 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "M-41ZfqIHyi2" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gVaa01O5IQke", - "outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea" - }, - "outputs": [], - "source": [ - "!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dYVLgduMIQm4", - "outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a" - }, - "outputs": [], - "source": [ - "df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n", - "\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 479 - }, - "id": "Bp50IA0qIQpf", - "outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad" - }, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "3Obah84eIQrm", - "outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51" - }, - "outputs": [], - "source": [ - "\n", - "# articles dont la première prédiction correspond à la vérité terrain (\n", - "df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 887 - }, - "id": "8eadTEGmJ2BK", - "outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1" - }, - "outputs": [], - "source": [ - "# articles dont la deuxième classe correspond à la vérité terrain (839)\n", - "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 939 - }, - "id": "W9PzX5DUKbwO", - "outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4" - }, - "outputs": [], - "source": [ - "# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n", - "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "NLcWPZlQK9BM", - "outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da" - }, - "outputs": [], - "source": [ - "# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n", - "\n", - "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Aq7hmUshMhPh" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 479 - }, - "id": "wRv1Nv5-ztyK", - "outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1" - }, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PYH0M0nddL34" - }, - "source": [ - "## Word frequency" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RmOViUd-zwe8", - "outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f" - }, - "outputs": [], - "source": [ - "# Liste des ensembles de domaines ENCCRE (classes)\n", - "df.ensemble_domaine_enccre.unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H9YxH28xxMGf" - }, - "outputs": [], - "source": [ - "lst_domaines = sorted(df.ensemble_domaine_enccre.unique())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "26h8-7P-xMI7" - }, - "outputs": [], - "source": [ - "# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n", - "def wordListToFreqDict(wordlist):\n", - " wordfreq = [wordlist.count(p) for p in wordlist]\n", - " return dict(list(zip(wordlist,wordfreq)))\n", - "\n", - "def sortFreqDict(freqdict):\n", - " aux = [(freqdict[key], key) for key in freqdict]\n", - " aux.sort()\n", - " aux.reverse()\n", - " return aux" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9aNEjYtExMN5" - }, - "outputs": [], - "source": [ - "d = {}\n", - "for domaine in lst_domaines:\n", - " l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n", - " print(domaine)\n", - " d[domaine] = sortFreqDict(wordListToFreqDict(l_text))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1yYHcDjUY9HG", - "outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35" - }, - "outputs": [], - "source": [ - "d['Géographie']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uchRyb2gqnk0" - }, - "outputs": [], - "source": [ - "path = \"drive/MyDrive/Classification-EDdA/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oNvSxYxmpqed" - }, - "outputs": [], - "source": [ - "# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot\n", - "for domaine, wordFreq in d.items():\n", - "\n", - " with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n", - " csv_out=csv.writer(file)\n", - " csv_out.writerow(['frequency','word'])\n", - " csv_out.writerows(wordFreq)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tTz8JcdhdHNw" - }, - "source": [ - "## Wordclouds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C2NM2ayE9jcR" - }, - "outputs": [], - "source": [ - "from wordcloud import WordCloud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "EITJqnZ5ecE8", - "outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27" - }, - "outputs": [], - "source": [ - "\n", - "lst_clouds = []\n", - "cpt = 1\n", - "n_cols = 4\n", - "n_rows = 10\n", - "\n", - "plt.figure(figsize=(30,50))\n", - "\n", - "for domaine in lst_domaines:\n", - " plt.subplot(n_rows, n_cols, cpt)\n", - " text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n", - " cloud_i = WordCloud(width=1080, height=720, background_color='white',\n", - " collocations=False, colormap='Set2',\n", - " max_words = 100, random_state = 42\n", - " ).generate(\" \".join(text))\n", - " \n", - " # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n", - "\n", - " plt.axis('off')\n", - " plt.title(domaine,fontsize=10)\n", - " plt.imshow(cloud_i)\n", - "\n", - " cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n", - " cpt += 1\n", - "\n", - " lst_clouds.append(cloud_i)\n", - "\n", - "plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n", - "plt.show()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rVFvp3owZDPq" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2EIA9mV_ecH8" - }, - "outputs": [], - "source": [ - "# Récupération des mots en communs\n", - "m = []\n", - "for d1 in lst_clouds :\n", - " m2 = []\n", - " for d2 in lst_clouds :\n", - "\n", - " lst_1 = d1.words_.keys()\n", - " lst_2 = d2.words_.keys()\n", - "\n", - " lst_text = [i for i in lst_1 if i in lst_2]\n", - " m2.append(len(lst_text))\n", - " m.append(m2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 874 - }, - "id": "p-D4rLn1TCMV", - "outputId": "6dc3c33a-733d-466b-c134-9060e9261973" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "plt.figure(figsize=(16,13))\n", - "\n", - "ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n", - "\n", - "plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZSnHC2IoTqO5", - "outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0" - }, - "outputs": [], - "source": [ - "# nombre de mots en commun entre Arts et Métier et Métiers :\n", - "\n", - "\n", - "# 4 et 29\n", - "\n", - "lst_1 = lst_clouds[4].words_.keys()\n", - "lst_2 = lst_clouds[29].words_.keys()\n", - "\n", - "lst_text = [i for i in lst_1 if i in lst_2]\n", - "len(lst_text)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "niAw2OF0bWMi", - "outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf" - }, - "outputs": [], - "source": [ - "# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n", - "lst_1 = lst_clouds[4].words_.keys()\n", - "lst_2 = lst_clouds[29].words_.keys()\n", - "\n", - "lst_text = [i for i in lst_1 if i not in lst_2]\n", - "lst_text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oYT479rsyVvq" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "EDdA-Classification_Analyses_predictions_proba.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.9.13" - }, - "vscode": { - "interpreter": { - "hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} -- GitLab