Skip to content
Snippets Groups Projects
Commit cb9be46f authored by Ludovic Moncla
Browse files

Update Normclass2graph.ipynb

parent fc57841e
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Normclass to graph
%% Cell type:code id: tags:
``` python
import pandas as pd
import networkx as nx
```
%% Cell type:code id: tags:
``` python
# Load the tab-separated EDdA article dataset into a DataFrame.
EDDA_ARTICLES_TSV = '/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv'
df = pd.read_csv(EDDA_ARTICLES_TSV, sep='\t')
```
%% Cell type:code id: tags:
``` python
# Number of distinct edda_class values; a missing value counts as one value.
df['edda_class'].nunique(dropna=False)
```
%% Output
2908
%% Cell type:code id: tags:
``` python
# Remove every row whose edda_class equals 'unclassified', 'pending' or '0'.
excluded_labels = ['unclassified', 'pending', '0']
is_excluded = df['edda_class'].isin(excluded_labels)
df = df.drop(index=df[is_excluded].index)
```
%% Cell type:code id: tags:
``` python
# Distinct edda_class values left after removing the excluded labels
# (a missing value still counts as one value).
df['edda_class'].nunique(dropna=False)
```
%% Output
2905
%% Cell type:code id: tags:
``` python
# Keep only the rows that actually have an edda_class value.
has_class = df['edda_class'].notna()
df = df[has_class]
```
%% Cell type:code id: tags:
``` python
# Distinct edda_class values once rows without a class have been removed.
df['edda_class'].nunique(dropna=False)
```
%% Output
2904
%% Cell type:code id: tags:
``` python
# Array of the distinct edda_class labels, in order of first appearance.
normclasses = pd.unique(df['edda_class'])
normclasses
```
%% Output
array(['Grammaire', 'Ecrivains modernes', 'Calendrier Julien', ...,
'Jeux militaires françois', 'Gravure antique sur métal',
'Inscription | Médailles | Poésie'], dtype=object)
%% Cell type:code id: tags:
``` python
# Row count per edda_class label, as a Series indexed by the label.
rows_by_class = df.groupby('edda_class')
df_group = rows_by_class.size()
```
%% Cell type:code id: tags:
``` python
# Display the Series of row counts per edda_class label.
df_group
```
%% Output
edda_class
Abus des langues 1
Accord de sons 1
Acoustique 6
Agonistique 1
Agriculture 127
...
terme usité parmi les Maréchaux 1
vaisselle d'étain 1
Ébénisterie 3
Ébénisterie | Tapisserie 1
Économie rustique 3
Length: 2904, dtype: int64
%% Cell type:code id: tags:
``` python
# Count how often two classes are listed together in the same '|'-separated
# edda_class label. Keys of d are (class_a, class_b) tuples in the order the
# classes appear in the label; values are the summed group sizes from df_group.
# Labels naming a single class produce no pair and therefore no entry.
from itertools import combinations

d = {}
for normclass, freq in df_group.items():
    print(normclass, freq)
    try:
        raw_parts = normclass.split('|')
    except AttributeError:
        # The label has no split() (it is not a string): report it and skip.
        print(normclass)
        continue
    # Strip surrounding whitespace so that 'Géographie moderne ' and
    # 'Géographie moderne' are the same class, and drop the empty component
    # left behind by a trailing separator such as 'Géographie moderne |'.
    n = [part for part in (raw.strip() for raw in raw_parts) if part]
    # Every pair of classes in the label is counted exactly once. A separate
    # pass pairing n[0] with each other class would count those pairs twice,
    # because combinations() already yields them.
    for t in combinations(n, 2):
        d[t] = d.get(t, 0) + freq
```
%% Cell type:code id: tags:
``` python
# Number of distinct class pairs collected in d.
len(d)
```
%% Output
1257
%% Cell type:code id: tags:
``` python
# Build the undirected, weighted class co-occurrence graph from d and, in the
# same pass, the CSV text (header node1,node2,freq) listing the entries of d.
import csv
import io

G = nx.Graph()
csv_buffer = io.StringIO()
# lineterminator='\n' keeps one '\n'-terminated row per entry; the csv module
# quotes any class name containing a comma or a quote so that every row
# still parses as exactly three columns.
csv_writer = csv.writer(csv_buffer, lineterminator='\n')
csv_writer.writerow(['node1', 'node2', 'freq'])
for (node1, node2), pair_freq in d.items():
    if G.has_edge(node1, node2):
        # In an undirected graph (a, b) and (b, a) are the same edge: add the
        # counts instead of letting the later key overwrite the earlier one.
        G[node1][node2]['weight'] += pair_freq
    else:
        G.add_edge(node1, node2, weight=pair_freq)
    csv_writer.writerow([node1, node2, pair_freq])
content = csv_buffer.getvalue()
```
%% Cell type:code id: tags:
``` python
# Report the size of the graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print(f"Nodes: {node_count}, Edges: {edge_count}")
```
%% Output
Nodes: 576, Edges: 1257
%% Cell type:code id: tags:
``` python
# Print every edge of the graph as a (node, node) tuple.
edge_list = list(G.edges)
print(edge_list)
```
%% Cell type:code id: tags:
``` python
# Save the CSV text built alongside the graph. The encoding is fixed to UTF-8
# so accented class names are written identically on every platform instead
# of depending on the locale's default encoding.
with open("../data/normclass_network.csv", 'w', encoding='utf-8') as f:
    f.write(content)
```
%% Cell type:code id: tags:
``` python
# Export the graph in GEXF format.
gexf_path = "../data/normclass_network.gexf"
nx.write_gexf(G, gexf_path)
```
%% Cell type:code id: tags:
``` python
# Export the graph in GraphML format. The target file carries the .graphml
# extension, so it must be written with the GraphML writer; write_gml would
# put GML text into a file that GraphML readers cannot parse.
nx.write_graphml(G, "../data/normclass_network.graphml")
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment