Skip to content
Snippets Groups Projects
Commit 8e5beb77 authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Create Plot_Domain_Groups_Size.ipynb

parent 35b84c33
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
```
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
```
!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/EDdA_dataframe_withContent.tsv
```
%% Output
--2022-03-31 12:36:46-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/EDdA_dataframe_withContent.tsv
Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28
Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 270305563 (258M) [text/tab-separated-values]
Saving to: ‘EDdA_dataframe_withContent.tsv’
EDdA_dataframe_with 100%[===================>] 257.78M 4.18MB/s in 41s
2022-03-31 12:37:28 (6.22 MB/s) - ‘EDdA_dataframe_withContent.tsv’ saved [270305563/270305563]
%% Cell type:code id: tags:
```
# Load the downloaded tab-separated dataset into a DataFrame, then display
# its (rows, columns) shape as the cell output.
tsv_path = "EDdA_dataframe_withContent.tsv"
df = pd.read_csv(tsv_path, sep="\t")
df.shape
```
%% Output
(74190, 13)
%% Cell type:code id: tags:
```
# Discard, in place, every row missing a value in either required column,
# then display the remaining (rows, columns) shape.
required_columns = ["contentWithoutClass", "ensemble_domaine_enccre"]
df.dropna(subset=required_columns, inplace=True)
df.shape
```
%% Output
(61362, 13)
%% Cell type:code id: tags:
```
# Display the column labels of df as the cell output.
df.columns
```
%% Output
Index(['volume', 'numero', 'head', 'normClass', 'classEDdA', 'author',
'id_enccre', 'domaine_enccre', 'ensemble_domaine_enccre', 'content',
'contentWithoutClass', 'firstParagraph', 'nb_words'],
dtype='object')
%% Cell type:code id: tags:
```
# Count the rows of df per 'ensemble_domaine_enccre' value and order the
# groups from largest to smallest; the result has one row per group with the
# group label and its size in a 'counts' column.
df2 = (
    df.groupby(['ensemble_domaine_enccre'])
    .size()
    .reset_index(name='counts')
    .sort_values(by=['counts'], ascending=False)
)
```
%% Cell type:code id: tags:
```
# Shorten the three longest group labels so they fit as x-axis tick labels.
# A single exact-match replacement covers all of them; unlike a positional
# lookup of the first matching row, it does not raise when a label is absent
# from df2 (e.g. after filtering the data differently).
short_labels = {
    "Physique - [Sciences physico-mathématiques]": "Physique [...]",
    "Agriculture - Economie rustique": "Agriculture [...]",
    "Militaire (Art) - Guerre - Arme": "Militaire [...]",
}
df2['ensemble_domaine_enccre'] = df2['ensemble_domaine_enccre'].replace(short_labels)
```
%% Cell type:code id: tags:
```
# Display df2 (group label and row count per group) as the cell output.
df2
```
%% Output
ensemble_domaine_enccre counts
15 Géographie 13289
12 Droit - Jurisprudence 6901
29 Métiers 5434
17 Histoire naturelle 5405
16 Histoire 3164
28 Médecine - Chirurgie 2452
14 Grammaire 2390
19 Marine 2346
11 Commerce 1924
35 Religion 1675
3 Architecture 1454
2 Antiquité 1393
32 Physique [...] 1391
23 Militaire [...] 1320
0 Agriculture [...] 1260
1 Anatomie 1152
6 Belles-lettres - Poésie 1042
21 Mathématiques 729
26 Musique 705
9 Chasse 605
10 Chimie 586
4 Arts et métiers 574
20 Maréchage - Manège 553
7 Blason 549
31 Philosophie 490
5 Beaux-arts 447
30 Pharmacie 342
25 Monnaie 326
18 Jeu 289
34 Pêche 217
22 Mesure 189
13 Economie domestique 139
33 Politique 119
27 Médailles 118
8 Caractères 115
37 Superstition 115
24 Minéralogie 112
36 Spectacle 51
%% Cell type:code id: tags:
```
# Bar chart of the number of rows per domain group, in df2's row order, with
# each bar annotated by its count; the figure is saved as a PNG file.
fig, ax = plt.subplots(figsize=(18, 6))

# Axis extents are derived from df2 rather than hard-coded, so the chart stays
# correct if the number of groups or the largest count changes.
n_groups = len(df2)
y_step = 2000  # spacing of the horizontal guide lines
tallest = df2['counts'].max() if n_groups else 0
# Round the tallest bar up to the next multiple of y_step (at least one step).
y_top = max(y_step, int(np.ceil(tallest / y_step)) * y_step)

# Light horizontal guide lines at every y_step, added before the bars.
ax.hlines(y=np.arange(0, y_top + y_step, y_step), xmin=-1, xmax=n_groups - 1, color="#bfbfbf", lw=0.6)
ax.set_xlim(-0.5, n_groups - 0.5)
ax.set_ylim(0, y_top)

# Keep only the bottom axis line.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)

bars = ax.bar(df2['ensemble_domaine_enccre'], df2['counts'], width=0.5)

# Write each count vertically, centred just above its bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height + 400,
            f"{int(height)}", ha='center', va='bottom', fontsize=12, rotation=90)

plt.xticks(fontsize=12, rotation=45, ha='right')
plt.yticks(fontsize=12)

fig.savefig('graphique_ensemble_domaine.png', bbox_inches='tight', dpi=150)
```
%% Output
%% Cell type:code id: tags:
```
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment