{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "M-41ZfqIHyi2"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gVaa01O5IQke",
"outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea"
},
"outputs": [],
"source": [
"!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dYVLgduMIQm4",
"outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a"
},
"outputs": [],
"source": [
"df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n",
"\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 479
},
"id": "Bp50IA0qIQpf",
"outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "3Obah84eIQrm",
"outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51"
},
"outputs": [],
"source": [
"\n",
"# articles dont la première prédiction correspond à la vérité terrain (\n",
"df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 887
},
"id": "8eadTEGmJ2BK",
"outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1"
},
"outputs": [],
"source": [
"# articles dont la deuxième classe correspond à la vérité terrain (839)\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 939
},
"id": "W9PzX5DUKbwO",
"outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4"
},
"outputs": [],
"source": [
"# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "NLcWPZlQK9BM",
"outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da"
},
"outputs": [],
"source": [
"# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n",
"\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Aq7hmUshMhPh"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 479
},
"id": "wRv1Nv5-ztyK",
"outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PYH0M0nddL34"
},
"source": [
"## Word frequency"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RmOViUd-zwe8",
"outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f"
},
"outputs": [],
"source": [
"# Liste des ensembles de domaines ENCCRE (classes)\n",
"df.ensemble_domaine_enccre.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H9YxH28xxMGf"
},
"outputs": [],
"source": [
"lst_domaines = sorted(df.ensemble_domaine_enccre.unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "26h8-7P-xMI7"
},
"outputs": [],
"source": [
"# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n",
"def wordListToFreqDict(wordlist):\n",
" wordfreq = [wordlist.count(p) for p in wordlist]\n",
" return dict(list(zip(wordlist,wordfreq)))\n",
"\n",
"def sortFreqDict(freqdict):\n",
" aux = [(freqdict[key], key) for key in freqdict]\n",
" aux.sort()\n",
" aux.reverse()\n",
" return aux"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9aNEjYtExMN5"
},
"outputs": [],
"source": [
"d = {}\n",
"for domaine in lst_domaines:\n",
" l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n",
" print(domaine)\n",
" d[domaine] = sortFreqDict(wordListToFreqDict(l_text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1yYHcDjUY9HG",
"outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35"
},
"outputs": [],
"source": [
"d['Géographie']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uchRyb2gqnk0"
},
"outputs": [],
"source": [
"path = \"drive/MyDrive/Classification-EDdA/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oNvSxYxmpqed"
},
"outputs": [],
"source": [
"# on créer un fichier csv pour chaque domaine avec la fréquence de chaque mot\n",
"for domaine, wordFreq in d.items():\n",
"\n",
" with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n",
" csv_out=csv.writer(file)\n",
" csv_out.writerow(['frequency','word'])\n",
" csv_out.writerows(wordFreq)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tTz8JcdhdHNw"
},
"source": [
"## Wordclouds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "C2NM2ayE9jcR"
},
"outputs": [],
"source": [
"from wordcloud import WordCloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "EITJqnZ5ecE8",
"outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27"
},
"outputs": [],
"source": [
"\n",
"lst_clouds = []\n",
"cpt = 1\n",
"n_cols = 4\n",
"n_rows = 10\n",
"\n",
"plt.figure(figsize=(30,50))\n",
"\n",
"for domaine in lst_domaines:\n",
" plt.subplot(n_rows, n_cols, cpt)\n",
" text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n",
" cloud_i = WordCloud(width=1080, height=720, background_color='white',\n",
" collocations=False, colormap='Set2',\n",
" max_words = 100, random_state = 42\n",
" ).generate(\" \".join(text))\n",
" \n",
" # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n",
"\n",
" plt.axis('off')\n",
" plt.title(domaine,fontsize=10)\n",
" plt.imshow(cloud_i)\n",
"\n",
" cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n",
" cpt += 1\n",
"\n",
" lst_clouds.append(cloud_i)\n",
"\n",
"plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rVFvp3owZDPq"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2EIA9mV_ecH8"
},
"outputs": [],
"source": [
"# Récupération des mots en communs\n",
"m = []\n",
"for d1 in lst_clouds :\n",
" m2 = []\n",
" for d2 in lst_clouds :\n",
"\n",
" lst_1 = d1.words_.keys()\n",
" lst_2 = d2.words_.keys()\n",
"\n",
" lst_text = [i for i in lst_1 if i in lst_2]\n",
" m2.append(len(lst_text))\n",
" m.append(m2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 874
},
"id": "p-D4rLn1TCMV",
"outputId": "6dc3c33a-733d-466b-c134-9060e9261973"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"plt.figure(figsize=(16,13))\n",
"\n",
"ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n",
"\n",
"plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZSnHC2IoTqO5",
"outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0"
},
"outputs": [],
"source": [
"# nombre de mots en commun entre Arts et Métier et Métiers :\n",
"\n",
"\n",
"# 4 et 29\n",
"\n",
"lst_1 = lst_clouds[4].words_.keys()\n",
"lst_2 = lst_clouds[29].words_.keys()\n",
"\n",
"lst_text = [i for i in lst_1 if i in lst_2]\n",
"len(lst_text)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "niAw2OF0bWMi",
"outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf"
},
"outputs": [],
"source": [
"# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n",
"lst_1 = lst_clouds[4].words_.keys()\n",
"lst_2 = lst_clouds[29].words_.keys()\n",
"\n",
"lst_text = [i for i in lst_1 if i not in lst_2]\n",
"lst_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oYT479rsyVvq"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "EDdA-Classification_Analyses_predictions_proba.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.13"
},
"vscode": {
"interpreter": {
"hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}