# %% Libraries used throughout the notebook
import csv
import pandas as pd

# %% Fetch the predictions CSV into the current working directory
# (`get_ipython().system(...)` is what the notebook `!` shell escape expands to)
get_ipython().system("wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv")

# %% Load the downloaded file and display its (rows, columns) shape
df = pd.read_csv("dataset_test_predictions_sgd_tfidf.csv")

df.shape

# %% Preview the first rows
df.head()

# %% Articles whose first prediction matches the ground truth
# (the original note left the row count blank)
df[df["ensemble_domaine_enccre"].eq(df["predict1"])]
"https://localhost:8080/", + "height": 887 + }, + "id": "8eadTEGmJ2BK", + "outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1" + }, + "outputs": [], + "source": [ + "# articles dont la deuxième classe correspond à la vérité terrain (839)\n", + "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 939 + }, + "id": "W9PzX5DUKbwO", + "outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4" + }, + "outputs": [], + "source": [ + "# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n", + "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "NLcWPZlQK9BM", + "outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da" + }, + "outputs": [], + "source": [ + "# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n", + "\n", + "df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Aq7hmUshMhPh" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 479 + }, + "id": "wRv1Nv5-ztyK", + "outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1" + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PYH0M0nddL34" + }, + "source": [ + "## Word frequency" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RmOViUd-zwe8", + "outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f" + }, + "outputs": [], + "source": [ + "# Liste des ensembles de domaines ENCCRE (classes)\n", + "df.ensemble_domaine_enccre.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H9YxH28xxMGf" + }, + "outputs": [], + "source": [ + "lst_domaines = sorted(df.ensemble_domaine_enccre.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "26h8-7P-xMI7" + }, + "outputs": [], + "source": [ + "# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n", + "def wordListToFreqDict(wordlist):\n", + " wordfreq = [wordlist.count(p) for p in wordlist]\n", + " return dict(list(zip(wordlist,wordfreq)))\n", + "\n", + "def sortFreqDict(freqdict):\n", + " aux = [(freqdict[key], key) for key in freqdict]\n", + " aux.sort()\n", + " aux.reverse()\n", + " return aux" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9aNEjYtExMN5" + }, + "outputs": [], + "source": [ + "d = {}\n", + "for domaine in lst_domaines:\n", + " l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n", + " print(domaine)\n", + " d[domaine] = sortFreqDict(wordListToFreqDict(l_text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1yYHcDjUY9HG", + "outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35" + }, + "outputs": [], + "source": [ + "d['Géographie']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uchRyb2gqnk0" + }, + "outputs": [], + "source": [ + "path = \"drive/MyDrive/Classification-EDdA/\"" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "id": "oNvSxYxmpqed" + }, + "outputs": [], + "source": [ + "# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot\n", + "for domaine, wordFreq in d.items():\n", + "\n", + " with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n", + " csv_out=csv.writer(file)\n", + " csv_out.writerow(['frequency','word'])\n", + " csv_out.writerows(wordFreq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tTz8JcdhdHNw" + }, + "source": [ + "## Wordclouds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C2NM2ayE9jcR" + }, + "outputs": [], + "source": [ + "from wordcloud import WordCloud" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "EITJqnZ5ecE8", + "outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27" + }, + "outputs": [], + "source": [ + "\n", + "lst_clouds = []\n", + "cpt = 1\n", + "n_cols = 4\n", + "n_rows = 10\n", + "\n", + "plt.figure(figsize=(30,50))\n", + "\n", + "for domaine in lst_domaines:\n", + " plt.subplot(n_rows, n_cols, cpt)\n", + " text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n", + " cloud_i = WordCloud(width=1080, height=720, background_color='white',\n", + " collocations=False, colormap='Set2',\n", + " max_words = 100, random_state = 42\n", + " ).generate(\" \".join(text))\n", + " \n", + " # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n", + "\n", + " plt.axis('off')\n", + " plt.title(domaine,fontsize=10)\n", + " plt.imshow(cloud_i)\n", + "\n", + " cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n", + " cpt += 1\n", + "\n", + " lst_clouds.append(cloud_i)\n", + "\n", + "plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n", + "plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "id": "rVFvp3owZDPq" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2EIA9mV_ecH8" + }, + "outputs": [], + "source": [ + "# Récupération des mots en communs\n", + "m = []\n", + "for d1 in lst_clouds :\n", + " m2 = []\n", + " for d2 in lst_clouds :\n", + "\n", + " lst_1 = d1.words_.keys()\n", + " lst_2 = d2.words_.keys()\n", + "\n", + " lst_text = [i for i in lst_1 if i in lst_2]\n", + " m2.append(len(lst_text))\n", + " m.append(m2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 874 + }, + "id": "p-D4rLn1TCMV", + "outputId": "6dc3c33a-733d-466b-c134-9060e9261973" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "plt.figure(figsize=(16,13))\n", + "\n", + "ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n", + "\n", + "plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZSnHC2IoTqO5", + "outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0" + }, + "outputs": [], + "source": [ + "# nombre de mots en commun entre Arts et Métier et Métiers :\n", + "\n", + "\n", + "# 4 et 29\n", + "\n", + "lst_1 = lst_clouds[4].words_.keys()\n", + "lst_2 = lst_clouds[29].words_.keys()\n", + "\n", + "lst_text = [i for i in lst_1 if i in lst_2]\n", + "len(lst_text)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "niAw2OF0bWMi", + "outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf" + }, + "outputs": [], + "source": [ + "# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n", + "lst_1 = 
lst_clouds[4].words_.keys()\n", + "lst_2 = lst_clouds[29].words_.keys()\n", + "\n", + "lst_text = [i for i in lst_1 if i not in lst_2]\n", + "lst_text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oYT479rsyVvq" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "EDdA-Classification_Analyses_predictions_proba.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}