Skip to content
Snippets Groups Projects
Commit d477ca43 authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update Predictions_analysis_superDomains.ipynb

parent ee5906b3
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
import csv
```
%% Cell type:code id: tags:
``` python
!wget https://geode.liris.cnrs.fr/EDdA-Classification/predictions/dataset_test_predictions_sgd_tfidf.csv
```
%% Cell type:code id: tags:
``` python
# Load the test-set predictions from the CSV file fetched by the wget cell.
df = pd.read_csv("dataset_test_predictions_sgd_tfidf.csv")
# Dimensions of the loaded table as (rows, columns).
df.shape
```
%% Cell type:code id: tags:
``` python
df.head()
```
%% Cell type:code id: tags:
``` python
# Articles whose first prediction matches the ground truth.
df.loc[df["ensemble_domaine_enccre"].eq(df["predict1"])]
```
%% Cell type:code id: tags:
``` python
# Articles whose second class matches the ground truth while the first
# one does not (839).
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].eq(df["predict2"])
]
```
%% Cell type:code id: tags:
``` python
# Articles for which neither the first nor the second class matches the
# ground truth (740).
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].ne(df["predict2"])
]
```
%% Cell type:code id: tags:
``` python
# Geography articles whose highest-probability prediction is not
# "Géographie" (only the second prediction matches it) -> 44
df.loc[
    df["ensemble_domaine_enccre"].ne(df["predict1"])
    & df["ensemble_domaine_enccre"].eq(df["predict2"])
    & df["ensemble_domaine_enccre"].eq("Géographie")
]
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
df.head()
```
%% Cell type:markdown id: tags:
## Word frequency
%% Cell type:code id: tags:
``` python
# List of the ENCCRE domain groups (the classes).
df["ensemble_domaine_enccre"].unique()
```
%% Cell type:code id: tags:
``` python
# Sorted list of the distinct domain labels.
lst_domaines = list(df["ensemble_domaine_enccre"].unique())
lst_domaines.sort()
```
%% Cell type:code id: tags:
``` python
def wordListToFreqDict(wordlist):
    """Return a dictionary mapping each word of ``wordlist`` to its frequency.

    The words are counted in a single pass with ``collections.Counter``
    rather than by calling ``wordlist.count`` once per word, which was
    quadratic in the number of words.

    Args:
        wordlist: iterable of hashable words (duplicates are expected).

    Returns:
        A plain ``dict`` of ``{word: number of occurrences}``; keys appear
        in order of first occurrence in ``wordlist``.
    """
    # Local import so this cell does not rely on imports made elsewhere.
    from collections import Counter

    return dict(Counter(wordlist))
def sortFreqDict(freqdict):
    """Return the entries of ``freqdict`` as ``(frequency, word)`` tuples.

    The list is ordered from the largest tuple to the smallest, i.e. by
    decreasing frequency, and by decreasing word among equal frequencies.
    """
    pairs = ((count, word) for word, count in freqdict.items())
    return sorted(pairs, reverse=True)
```
%% Cell type:code id: tags:
``` python
# For every domain, build the list of (frequency, word) tuples sorted by
# decreasing frequency from the whitespace-separated words of its articles.
d = {}
for domaine in lst_domaines:
    domain_texts = df.loc[df["ensemble_domaine_enccre"] == domaine, "contentWithoutClass"]
    l_text = []
    for line in domain_texts.values:
        l_text.extend(line.split())
    print(domaine)  # progress trace, one line per processed domain
    d[domaine] = sortFreqDict(wordListToFreqDict(l_text))
```
%% Cell type:code id: tags:
``` python
d['Géographie']
```
%% Cell type:code id: tags:
``` python
path = "drive/MyDrive/Classification-EDdA/"
```
%% Cell type:code id: tags:
``` python
# Create one CSV file per domain holding the frequency of each word.
for domaine, wordFreq in d.items():
    # newline='' is required by the csv module so the writer controls line
    # endings itself (otherwise blank rows can appear on some platforms).
    # An explicit UTF-8 encoding keeps accented words intact whatever the
    # platform's default encoding is.
    with open(path+'Wordclouds/frequency_'+domaine+'.csv', 'w',
              newline='', encoding='utf-8') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(['frequency','word'])  # header row
        csv_out.writerows(wordFreq)  # one (frequency, word) row per word
```
%% Cell type:markdown id: tags:
## Wordclouds
%% Cell type:code id: tags:
``` python
from wordcloud import WordCloud
```
%% Cell type:code id: tags:
``` python
# matplotlib.pyplot is only imported in a later cell of this notebook;
# import it here so this cell also works when run from top to bottom.
import matplotlib.pyplot as plt

# One word cloud per domain, drawn on a n_rows x n_cols grid of subplots
# and also saved as an individual PNG file.
lst_clouds = []  # WordCloud objects, in the same order as lst_domaines
n_cols = 4
n_rows = 10
plt.figure(figsize=(30,50))
for cpt, domaine in enumerate(lst_domaines, start=1):
    # Subplot indices are 1-based.
    plt.subplot(n_rows, n_cols, cpt)
    text = df[df.ensemble_domaine_enccre == domaine].contentWithoutClass.values
    cloud_i = WordCloud(width=1080, height=720, background_color='white',
                        collocations=False, colormap='Set2',
                        max_words = 100, random_state = 42
                        ).generate(" ".join(text))
    # https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html
    plt.axis('off')
    plt.title(domaine,fontsize=10)
    plt.imshow(cloud_i)
    # NOTE(review): the file name keeps only the first space-separated token
    # of the domain name, so domains sharing that token would write to the
    # same file -- confirm against the actual domain list.
    cloud_i.to_file(path+"/Wordclouds/Wordclouds_"+domaine.split(" ")[0]+".png")
    lst_clouds.append(cloud_i)
# Save the whole grid once every subplot has been drawn.
plt.savefig('Domaines_wordclouds.pdf', dpi=300, bbox_inches='tight')
plt.show()
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# Count the words shared by every pair of word clouds:
# m[i][j] is the number of words of cloud i that also appear in cloud j.
m = [
    [
        sum(1 for word in d1.words_.keys() if word in d2.words_.keys())
        for d2 in lst_clouds
    ]
    for d1 in lst_clouds
]
```
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
# Heatmap of the common-word counts, with domains on both axes.
fig, ax = plt.subplots(figsize=(16,13))
sns.heatmap(
    m,
    xticklabels=lst_domaines,
    yticklabels=lst_domaines,
    cmap='Blues',
    ax=ax,
)
fig.savefig('Heatmap_commonWords.png', dpi=300, bbox_inches='tight')
```
%% Cell type:code id: tags:
``` python
# Number of words shared by "Arts et métiers" and "Métiers":
# clouds at indices 4 and 29.
lst_1, lst_2 = (lst_clouds[idx].words_.keys() for idx in (4, 29))
lst_text = [word for word in lst_1 if word in lst_2]
len(lst_text)
```
%% Cell type:code id: tags:
``` python
# Words of "Arts et métiers" that are not among the 100 most frequent
# words of "Métiers".
lst_1, lst_2 = (lst_clouds[idx].words_.keys() for idx in (4, 29))
lst_text = [word for word in lst_1 if word not in lst_2]
lst_text
```
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment