Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
EDdA Classification
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Projet GEODE
EDdA Classification
Commits
f5242db6
Commit
f5242db6
authored
2 years ago
by
Ludovic Moncla
Browse files
Options
Downloads
Patches
Plain Diff
Create Predictions_analysis_bckp.ipynb
parent
c55caa84
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
notebooks/Predictions_analysis_bckp.ipynb
+470
-0
470 additions, 0 deletions
notebooks/Predictions_analysis_bckp.ipynb
with
470 additions
and
0 deletions
notebooks/Predictions_analysis_bckp.ipynb
0 → 100644
+
470
−
0
View file @
f5242db6
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "M-41ZfqIHyi2"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gVaa01O5IQke",
"outputId": "054b0d9d-148a-4cc6-8616-b9e704eab6ea"
},
"outputs": [],
"source": [
"!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dYVLgduMIQm4",
"outputId": "4e35f288-f81a-428b-8b9e-035c3a1d3c7a"
},
"outputs": [],
"source": [
"df = pd.read_csv(\"dataset_test_predictions_sgd_tfidf.csv\")\n",
"\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 479
},
"id": "Bp50IA0qIQpf",
"outputId": "c4efa4c8-4fac-4349-cc12-a331f89850ad"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "3Obah84eIQrm",
"outputId": "6971b6d4-b7c5-4029-86b9-83393899dd51"
},
"outputs": [],
"source": [
"\n",
"# articles dont la première prédiction correspond à la vérité terrain\n",
"df[df[\"ensemble_domaine_enccre\"] == df[\"predict1\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 887
},
"id": "8eadTEGmJ2BK",
"outputId": "f9469880-f3d5-4fb2-8ac3-4db50eb7deb1"
},
"outputs": [],
"source": [
"# articles dont la deuxième classe correspond à la vérité terrain (839)\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 939
},
"id": "W9PzX5DUKbwO",
"outputId": "9b851666-3361-452b-cab3-f8e42f06c7e4"
},
"outputs": [],
"source": [
"# articles dont ni la première ni la deuxième classe correspondent à la vérité terrain (740)\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] != df[\"predict2\"])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "NLcWPZlQK9BM",
"outputId": "8baf8f1f-9f36-4779-c12c-47ce96a627da"
},
"outputs": [],
"source": [
"# articles de géographie dont la prédiction avec la plus forte proba n'est pas Géographie (seulement la deuxième proba correspond à Géographie) -> 44\n",
"\n",
"df[(df[\"ensemble_domaine_enccre\"] != df[\"predict1\"]) & (df[\"ensemble_domaine_enccre\"] == df[\"predict2\"]) & (df[\"ensemble_domaine_enccre\"] == \"Géographie\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Aq7hmUshMhPh"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 479
},
"id": "wRv1Nv5-ztyK",
"outputId": "94e55eeb-f7a1-4a75-b674-5347092565f1"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PYH0M0nddL34"
},
"source": [
"## Word frequency"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RmOViUd-zwe8",
"outputId": "e4beff09-2cb2-4cf3-9361-0537e70f484f"
},
"outputs": [],
"source": [
"# Liste des ensembles de domaines ENCCRE (classes)\n",
"df.ensemble_domaine_enccre.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H9YxH28xxMGf"
},
"outputs": [],
"source": [
"lst_domaines = sorted(df.ensemble_domaine_enccre.unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "26h8-7P-xMI7"
},
"outputs": [],
"source": [
"# fonction qui retourne un dictionnaire contenant la fréquence associée à chaque mot de la liste en paramètre\n",
"def wordListToFreqDict(wordlist):\n",
" wordfreq = [wordlist.count(p) for p in wordlist]\n",
" return dict(list(zip(wordlist,wordfreq)))\n",
"\n",
"def sortFreqDict(freqdict):\n",
" aux = [(freqdict[key], key) for key in freqdict]\n",
" aux.sort()\n",
" aux.reverse()\n",
" return aux"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9aNEjYtExMN5"
},
"outputs": [],
"source": [
"d = {}\n",
"for domaine in lst_domaines:\n",
" l_text = [word for line in list(df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values) for word in line.split()]\n",
" print(domaine)\n",
" d[domaine] = sortFreqDict(wordListToFreqDict(l_text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1yYHcDjUY9HG",
"outputId": "31e2bb56-46ff-4dab-cc2f-bb117d61fb35"
},
"outputs": [],
"source": [
"d['Géographie']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uchRyb2gqnk0"
},
"outputs": [],
"source": [
"path = \"drive/MyDrive/Classification-EDdA/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oNvSxYxmpqed"
},
"outputs": [],
"source": [
"# on crée un fichier csv pour chaque domaine avec la fréquence de chaque mot\n",
"for domaine, wordFreq in d.items():\n",
"\n",
" with open(path+'Wordclouds/frequency_'+domaine+'.csv','w') as file:\n",
" csv_out=csv.writer(file)\n",
" csv_out.writerow(['frequency','word'])\n",
" csv_out.writerows(wordFreq)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tTz8JcdhdHNw"
},
"source": [
"## Wordclouds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "C2NM2ayE9jcR"
},
"outputs": [],
"source": [
"from wordcloud import WordCloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "EITJqnZ5ecE8",
"outputId": "5a0392c9-f9dd-4a6b-b06a-6a2b50aeec27"
},
"outputs": [],
"source": [
"\n",
"lst_clouds = []\n",
"cpt = 1\n",
"n_cols = 4\n",
"n_rows = 10\n",
"\n",
"plt.figure(figsize=(30,50))\n",
"\n",
"for domaine in lst_domaines:\n",
" plt.subplot(n_rows, n_cols, cpt)\n",
" text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values\n",
" cloud_i = WordCloud(width=1080, height=720, background_color='white',\n",
" collocations=False, colormap='Set2',\n",
" max_words = 100, random_state = 42\n",
" ).generate(\" \".join(text))\n",
" \n",
" # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html\n",
"\n",
" plt.axis('off')\n",
" plt.title(domaine,fontsize=10)\n",
" plt.imshow(cloud_i)\n",
"\n",
" cloud_i.to_file(path+\"/Wordclouds/Wordclouds_\"+domaine.split(\" \")[0]+\".png\")\n",
" cpt += 1\n",
"\n",
" lst_clouds.append(cloud_i)\n",
"\n",
"plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rVFvp3owZDPq"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2EIA9mV_ecH8"
},
"outputs": [],
"source": [
"# Récupération des mots en communs\n",
"m = []\n",
"for d1 in lst_clouds :\n",
" m2 = []\n",
" for d2 in lst_clouds :\n",
"\n",
" lst_1 = d1.words_.keys()\n",
" lst_2 = d2.words_.keys()\n",
"\n",
" lst_text = [i for i in lst_1 if i in lst_2]\n",
" m2.append(len(lst_text))\n",
" m.append(m2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 874
},
"id": "p-D4rLn1TCMV",
"outputId": "6dc3c33a-733d-466b-c134-9060e9261973"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"plt.figure(figsize=(16,13))\n",
"\n",
"ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')\n",
"\n",
"plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZSnHC2IoTqO5",
"outputId": "d6249442-119b-4dbd-a764-422d46fcf0f0"
},
"outputs": [],
"source": [
"# nombre de mots en commun entre Arts et Métier et Métiers :\n",
"\n",
"\n",
"# 4 et 29\n",
"\n",
"lst_1 = lst_clouds[4].words_.keys()\n",
"lst_2 = lst_clouds[29].words_.keys()\n",
"\n",
"lst_text = [i for i in lst_1 if i in lst_2]\n",
"len(lst_text)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "niAw2OF0bWMi",
"outputId": "4f267f2d-8586-44d0-bd54-1c73ebf646bf"
},
"outputs": [],
"source": [
"# mots de Arts et métier qui ne sont pas dans les 100 plus fréquents de Métiers\n",
"lst_1 = lst_clouds[4].words_.keys()\n",
"lst_2 = lst_clouds[29].words_.keys()\n",
"\n",
"lst_text = [i for i in lst_1 if i not in lst_2]\n",
"lst_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oYT479rsyVvq"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "EDdA-Classification_Analyses_predictions_proba.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.9.13 ('stanza-lexicoscope-py39')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.13"
},
"vscode": {
"interpreter": {
"hash": "68d5f9281eab57a7f4901cb150f4c691b1d08935474a18f188e0e3e8f8f412b7"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
%% Cell type:code id: tags:
```
python
import
pandas
as
pd
import
csv
```
%% Cell type:code id: tags:
```
python
!
wget
https
:
//
geode
.
liris
.
cnrs
.
fr
/
EDdA
-
Classification
/
predictions
/
dataset_test_predictions_sgd_tfidf
.
csv
```
%% Cell type:code id: tags:
```
python
df
=
pd
.
read_csv
(
"
dataset_test_predictions_sgd_tfidf.csv
"
)
df
.
shape
```
%% Cell type:code id: tags:
```
python
df
.
head
()
```
%% Cell type:code id: tags:
```
python
# articles whose first prediction matches the ground truth
df.loc[df["ensemble_domaine_enccre"].eq(df["predict1"])]
```
%% Cell type:code id: tags:
```
python
# articles whose second predicted class matches the ground truth (839)
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].eq(df["predict2"])
]
```
%% Cell type:code id: tags:
```
python
# articles for which neither the first nor the second predicted class
# matches the ground truth (740)
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].ne(df["predict2"])
]
```
%% Cell type:code id: tags:
```
python
# geography articles whose highest-probability prediction is not Géographie
# (only the second-ranked prediction is Géographie) -> 44
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].eq(df["predict2"])
    & df["ensemble_domaine_enccre"].eq("Géographie")
]
```
%% Cell type:code id: tags:
```
python
```
%% Cell type:code id: tags:
```
python
df.head()
```
%% Cell type:markdown id: tags:
## Word frequency
%% Cell type:code id: tags:
```
python
# Liste des ensembles de domaines ENCCRE (classes)
df.ensemble_domaine_enccre.unique()
```
%% Cell type:code id: tags:
```
python
lst_domaines = sorted(df.ensemble_domaine_enccre.unique())
```
%% Cell type:code id: tags:
```
python
def wordListToFreqDict(wordlist):
    """Return a dictionary mapping each word of ``wordlist`` to its frequency.

    Args:
        wordlist: Iterable of hashable tokens (typically words); may be empty.

    Returns:
        A plain ``dict`` ``{word: count}`` whose keys appear in order of
        first occurrence in ``wordlist``.
    """
    # Local import keeps the function self-contained in whichever cell it runs.
    from collections import Counter

    # Counter tallies in a single pass; calling ``wordlist.count(p)`` for every
    # word rescans the whole list each time (quadratic in the corpus size).
    return dict(Counter(wordlist))
def sortFreqDict(freqdict):
    """Return ``(frequency, word)`` pairs sorted from most to least frequent.

    Args:
        freqdict: Mapping ``{word: frequency}``.

    Returns:
        A list of ``(frequency, word)`` tuples in descending tuple order, so
        words sharing a frequency are ordered by descending word.
    """
    # The frequency comes first in each tuple so it acts as the primary sort key.
    return sorted(((freq, word) for word, freq in freqdict.items()), reverse=True)
```
%% Cell type:code id: tags:
```
python
# For every domain, gather all whitespace-separated tokens of its articles and
# store the (frequency, word) pairs sorted by decreasing frequency in d.
d = {}
for domaine in lst_domaines:
    texts = df.loc[df.ensemble_domaine_enccre == domaine, "contentWithoutClass"]
    l_text = [token for text in texts for token in text.split()]
    print(domaine)  # progress trace: one line per processed domain
    d[domaine] = sortFreqDict(wordListToFreqDict(l_text))
```
%% Cell type:code id: tags:
```
python
d['Géographie']
```
%% Cell type:code id: tags:
```
python
path = "drive/MyDrive/Classification-EDdA/"
```
%% Cell type:code id: tags:
```
python
# Write one CSV file per domain listing the frequency of each word.
for domaine, wordFreq in d.items():
    # newline='' hands line-ending control to the csv module (without it, blank
    # rows appear on platforms that translate '\n'); an explicit UTF-8 encoding
    # keeps accented words intact whatever the platform default is.
    with open(path+'Wordclouds/frequency_'+domaine+'.csv', 'w',
              newline='', encoding='utf-8') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(['frequency', 'word'])  # header row
        csv_out.writerows(wordFreq)  # one (frequency, word) row per word
```
%% Cell type:markdown id: tags:
## Wordclouds
%% Cell type:code id: tags:
```
python
from wordcloud import WordCloud
```
%% Cell type:code id: tags:
```
python
# Draw one word cloud per domain in a single figure, save each cloud as a PNG
# and the whole grid as a PDF.

# matplotlib is only imported in a later cell of this notebook; importing it
# here lets this cell run on a fresh kernel executed top to bottom.
import matplotlib.pyplot as plt

lst_clouds = []  # one WordCloud per domain, in lst_domaines order
n_cols = 4
n_rows = 10  # the 4 x 10 subplot grid has room for at most 40 domains

plt.figure(figsize=(30,50))

# Subplot positions start at 1, hence enumerate(..., start=1).
for cpt, domaine in enumerate(lst_domaines, start=1):
    plt.subplot(n_rows, n_cols, cpt)
    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values
    cloud_i = WordCloud(width=1080, height=720, background_color='white',
                        collocations=False, colormap='Set2',
                        max_words = 100, random_state = 42
                        ).generate(" ".join(text))

    # colormap names: https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html

    plt.axis('off')
    plt.title(domaine,fontsize=10)
    plt.imshow(cloud_i)

    # NOTE(review): only the first word of the domain names the PNG, so two
    # domains sharing a first word would overwrite each other's file — confirm
    # against the actual class list.
    cloud_i.to_file(path+"/Wordclouds/Wordclouds_"+domaine.split(" ")[0]+".png")

    lst_clouds.append(cloud_i)

plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')
plt.show()
```
%% Cell type:code id: tags:
```
python
```
%% Cell type:code id: tags:
```
python
# Count the words shared by every pair of word clouds:
# m[i][j] is the number of words present in both cloud i and cloud j.
m = []
for d1 in lst_clouds:
    words_1 = d1.words_.keys()
    m.append([len(words_1 & d2.words_.keys()) for d2 in lst_clouds])
```
%% Cell type:code id: tags:
```
python
import matplotlib.pyplot as plt
import seaborn as sns
# Heatmap of the pairwise common-word counts stored in m, with the domain
# names from lst_domaines labelling both axes.
plt.figure(figsize=(16,13))
ax = sns.heatmap(m, xticklabels=lst_domaines, yticklabels=lst_domaines, cmap='Blues')
# Save the figure to a PNG file at 300 dpi.
plt.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')
```
%% Cell type:code id: tags:
```
python
# number of words shared by "Arts et Métier" and "Métiers"
# (entries 4 and 29 of lst_clouds)
lst_1, lst_2 = (lst_clouds[k].words_.keys() for k in (4, 29))
lst_text = [word for word in lst_1 if word in lst_2]
len(lst_text)
```
%% Cell type:code id: tags:
```
python
# words of "Arts et métier" that are not among the 100 most frequent words
# of "Métiers" (entries 4 and 29 of lst_clouds)
lst_1, lst_2 = (lst_clouds[k].words_.keys() for k in (4, 29))
lst_text = [word for word in lst_1 if word not in lst_2]
lst_text
```
%% Cell type:code id: tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment