# %%
import csv
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

# %%
# Download the predictions file only when it is not already present, so that
# re-running the notebook neither re-fetches it nor piles up duplicate copies,
# and so that no external `wget` binary is required.
PREDICTIONS_URL = "https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv"
PREDICTIONS_CSV = Path("dataset_test_predictions_sgd_tfidf.csv")

if not PREDICTIONS_CSV.exists():
    # Download under a temporary name first: an interrupted transfer must not
    # be mistaken for a complete file on the next run.
    partial_download = PREDICTIONS_CSV.with_name(PREDICTIONS_CSV.name + ".part")
    urlretrieve(PREDICTIONS_URL, partial_download)
    partial_download.replace(PREDICTIONS_CSV)

# %%
df = pd.read_csv(PREDICTIONS_CSV)

df.shape

# %%
df.head()

# %%
# Boolean masks shared by the cells below: does the ground-truth class
# (`ensemble_domaine_enccre`) equal the first / second predicted class?
truth = df["ensemble_domaine_enccre"]
top1_hit = truth == df["predict1"]
top2_hit = truth == df["predict2"]

# %%
# Articles whose first prediction matches the ground truth.
df[top1_hit]

# %%
# Articles whose second predicted class matches the ground truth (839).
df[~top1_hit & top2_hit]

# %%
# Articles for which neither the first nor the second predicted class matches
# the ground truth (740).
df[~top1_hit & ~top2_hit]

# %%
# Geography articles whose highest-probability prediction is not "Géographie"
# (only the second prediction matches "Géographie") -> 44
df[~top1_hit & top2_hit & (truth == "Géographie")]

# %%
df.head()

# %% [markdown]
# ## Word frequency

# %%
# List of the ENCCRE domain sets (the classes).
df.ensemble_domaine_enccre.unique()

# %%
lst_domaines = sorted(df.ensemble_domaine_enccre.unique())
# %%
from collections import Counter


def wordListToFreqDict(wordlist):
    """Return a dict mapping each word of ``wordlist`` to its number of occurrences.

    Keys appear in order of first occurrence in ``wordlist``. The words are
    counted in a single pass; calling ``wordlist.count`` once per word would
    be quadratic in the length of the list.
    """
    return dict(Counter(wordlist))


def sortFreqDict(freqdict):
    """Return the entries of ``freqdict`` as ``(frequency, word)`` tuples.

    The list is sorted in descending tuple order: highest frequency first,
    and for equal frequencies the greater word first.
    """
    return sorted(((freq, word) for word, freq in freqdict.items()), reverse=True)

# %%
# For every domain, split the text of its articles on whitespace and store the
# sorted (frequency, word) list under the domain name.
d = {}
for domaine in lst_domaines:
    domain_texts = df.loc[df.ensemble_domaine_enccre == domaine, "contentWithoutClass"]
    domain_words = [word for text in domain_texts for word in text.split()]
    print(domaine)
    d[domaine] = sortFreqDict(wordListToFreqDict(domain_words))

# %%
# Show only the 20 most frequent words; the full list holds one entry per
# distinct word and would flood the cell output.
d['Géographie'][:20]

# %%
# Output directory prefix (relative to the current working directory).
# NOTE(review): looks like a Google Drive mount point as used in Colab; nothing
# in this notebook mounts it -- confirm the directory exists before running.
path = "drive/MyDrive/Classification-EDdA/"

# %%
# Write one CSV file per domain holding the frequency of each word.
for domaine, wordFreq in d.items():
    # newline='' is what the csv module expects (otherwise blank rows appear on
    # some platforms); utf-8 keeps accented words portable across systems.
    with open(path+'Wordclouds/frequency_'+domaine+'.csv', 'w', newline='', encoding='utf-8') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(['frequency','word'])
        csv_out.writerows(wordFreq)

# %% [markdown]
# ## Wordclouds

# %%
from wordcloud import WordCloud
# %%
# Plotting imports must come before the first use of `plt` below; importing
# them only in the heatmap cell made the word-cloud cell fail with a NameError
# on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# One word cloud per domain, drawn on a grid of n_rows x n_cols subplots
# (room for at most 40 domains) and also saved as an individual PNG.
lst_clouds = []
n_cols = 4
n_rows = 10

plt.figure(figsize=(30,50))

for cpt, domaine in enumerate(lst_domaines, start=1):
    plt.subplot(n_rows, n_cols, cpt)
    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values
    # random_state is fixed so the layout is reproducible; at most 100 words
    # are kept per cloud (max_words).
    cloud_i = WordCloud(width=1080, height=720, background_color='white',
                        collocations=False, colormap='Set2',
                        max_words = 100, random_state = 42
                        ).generate(" ".join(text))

    # Available colormaps: https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html

    plt.axis('off')
    plt.title(domaine,fontsize=10)
    plt.imshow(cloud_i)

    # NOTE(review): the file name keeps only the first space-separated token of
    # the domain label, so two labels sharing that token would overwrite each
    # other's PNG -- confirm against the actual labels in lst_domaines.
    cloud_i.to_file(path+"/Wordclouds/Wordclouds_"+domaine.split(" ")[0]+".png")

    lst_clouds.append(cloud_i)

plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Number of words in common for every pair of domains: m[i][j] is the number
# of words kept in both cloud i and cloud j (same order as lst_domaines).
m = [
    [len(cloud_a.words_.keys() & cloud_b.words_.keys()) for cloud_b in lst_clouds]
    for cloud_a in lst_clouds
]

# %%
plt.figure(figsize=(16,13))

ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')

plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')

# %%
# Positions of the two compared domains in lst_clouds (which follows the order
# of lst_domaines), taken from the original note "4 et 29".
# NOTE(review): these positions depend on the sorted domain labels -- verify
# that they still point at "Arts et métiers" and "Métiers" if the data changes.
IDX_ARTS_ET_METIERS = 4
IDX_METIERS = 29

# %%
# Number of words shared by "Arts et métiers" and "Métiers".
lst_1 = lst_clouds[IDX_ARTS_ET_METIERS].words_.keys()
lst_2 = lst_clouds[IDX_METIERS].words_.keys()

lst_text = [i for i in lst_1 if i in lst_2]
len(lst_text)

# %%
# Words of "Arts et métiers" that are not among the (at most 100) words kept
# for "Métiers".
lst_1 = lst_clouds[IDX_ARTS_ET_METIERS].words_.keys()
lst_2 = lst_clouds[IDX_METIERS].words_.keys()

lst_text = [i for i in lst_1 if i not in lst_2]
lst_text