Skip to content
Snippets Groups Projects
Commit d4e8ff00 authored by Fize Jacques's avatar Fize Jacques
Browse files

Update

parent f8b664ba
No related branches found
No related tags found
No related merge requests found
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
......@@ -5,37 +5,38 @@ import networkx as nx
import pandas as pd
import joblib
import json
import geopandas as gpd
from lib.draw import draw
parser = argparse.ArgumentParser()
parser.add_argument("input_file",help="edgelist format (sep = \",\" )")
parser.add_argument("output_file")
parser.add_argument("--encoder-file",help="LabelEncoder instance that allows to obtain a label for each node")
parser.add_argument("--country",help="if country node",action="store_true")
parser.add_argument("-w",action="store_true")
args = parser.parse_args()
if args.w:
df = pd.read_csv(args.input_file,header=None,names="source target weight".split())
G = nx.from_pandas_edgelist(df,edge_attr="weight",create_using=nx.DiGraph())
else:
df = pd.read_csv(args.input_file, header=None, names="source target".split())
G = nx.from_pandas_edgelist(df,create_using=nx.DiGraph())
G = nx.read_gexf(args.input_file)
encoder = None
labels_dict = {}
positions = {}
if args.encoder_file:
encoder = joblib.load(args.encoder_file)
if args.country:
iso2_name = json.load(open("data/ISO3166-1.alpha2.json.txt"))
world = gpd.read_file("data/TM_WORLD_BORDERS-0/TM_WORLD_BORDERS-0.3.shp")
world["centroid_c"] = world.centroid
iso2_togeom = dict(world["ISO2 centroid_c".split()].values)
positions = {k: [v.x, v.y] for k, v in iso2_togeom.items() if k in G}
for node in list(G.nodes()):
if args.country:
iso2_name = json.load(open("data/ISO3166-1.alpha2.json.txt"))
for node in list(G.nodes()):
if args.country:
labels_dict[node] = iso2_name[encoder.inverse_transform([node])[0]]
else:
labels_dict[node] = encoder.inverse_transform([node])[0]
fig, ax = draw(G,labels_dict)
labels_dict[node] = iso2_name[node]
else:
labels_dict[node] = node
fig, ax = draw(G,labels_dict,positions)
if args.country:
world.boundary.plot(ax=ax)
fig.savefig(args.output_file)
\ No newline at end of file
......@@ -2,6 +2,7 @@ from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit as LPEvalSplit
from evalne.evaluation.score import Scoresheet
from evalne.utils import preprocess as pp
import networkx as nx
from lib.utils import load_edgelist
......@@ -14,14 +15,14 @@ parser.add_argument("-v","--verbose",action="store_true")
args = parser.parse_args()#("data/fb_country_country_sample_6_size1000.txt".split())
# Load and preprocess the network
G = load_edgelist(args.edgelist_graph_filename,is_directed=True,weighted=True)
G = nx.read_gexf(args.edgelist_graph_filename)#load_edgelist(args.edgelist_graph_filename,is_directed=True,weighted=True)
G, _ = pp.prep_graph(G,maincc=True)
# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1)
nee = LPEvaluator(traintest_split)
# Create a Scoresheet to store the results
scoresheet = Scoresheet()
......@@ -31,8 +32,11 @@ methods = ['random_prediction',
'jaccard_coefficient',
"adamic_adar_index",
"preferential_attachment",
"resource_allocation_index"
]
"resource_allocation_index",
"stochastic_block_model",
"stochastic_block_model_edge_probs",
"stochastic_block_model_degree_corrected"
]
# Evaluate baselines
for method in methods:
......@@ -42,7 +46,7 @@ for method in methods:
try:
# Check if OpenNE is installed
import openne
a=0/0
# Set embedding methods from OpenNE
methods = "node2vec hope-opne gf sdne deepWalk line grarep".split() #lap-opne
commands = [
......@@ -64,7 +68,7 @@ try:
edge_embedding_methods=edge_emb, input_delim=' ', output_delim=' ', verbose=args.verbose)
scoresheet.log_results(results)
except ImportError:
except Exception:
print("The OpenNE library is not installed. Reporting results only for the baselines...")
pass
......
%% Cell type:code id: tags:
``` python
import networkx as nx
import pandas as pd
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Read the tab-separated GADM1/NUTS2 pair table into a DataFrame
# (the displayed columns are user_loc, fr_loc and scaled_sci).
df = pd.read_csv("data/gadm1_nuts2_gadm1_nuts2_aug2020.tsv",sep="\t")
```
%% Cell type:code id: tags:
``` python
df
```
%% Output
user_loc fr_loc scaled_sci
0 ABW ABW 13297827
1 ABW AGO1 29
2 ABW AGO10 54
3 ABW AGO11 41
4 ABW AGO12 42
... ... ... ...
5978020 ZWE9 ZWE5 491990
5978021 ZWE9 ZWE6 524119
5978022 ZWE9 ZWE7 929477
5978023 ZWE9 ZWE8 966771
5978024 ZWE9 ZWE9 16951824
[5978025 rows x 3 columns]
%% Cell type:code id: tags:
``` python
# Read the table that maps each location key to its level
# (the displayed rows show the values "country" and "nuts2").
level_df = pd.read_csv("data/gadm1_nuts2_levels.csv")
level_df
```
%% Output
key level
0 ASM country
1 AND country
2 ATG country
3 ABW country
4 BHS country
... ... ...
2440 TR42 nuts2
2441 TR51 nuts2
2442 TR52 nuts2
2443 TR10 nuts2
2444 TR32 nuts2
[2445 rows x 2 columns]
%% Cell type:code id: tags:
``` python
# Select the rows of level_df whose key is "ABW".
level_df[level_df.key == "ABW"]
```
%% Output
key level
3 ABW country
%% Cell type:code id: tags:
``` python
import geopandas as gpd
# Read the NUTS level-2 region file (GeoJSON) into a GeoDataFrame and display it.
gdf = gpd.read_file("data/ref-nuts-2021-10m/NUTS_RG_10M_2021_3035_LEVL_2.geojson")
gdf
```
%% Output
id NUTS_ID LEVL_CODE CNTR_CODE NAME_LATN NUTS_NAME \
0 DE50 DE50 2 DE Bremen Bremen
1 DE60 DE60 2 DE Hamburg Hamburg
2 DE71 DE71 2 DE Darmstadt Darmstadt
3 DE72 DE72 2 DE Gießen Gießen
4 DE73 DE73 2 DE Kassel Kassel
.. ... ... ... ... ... ...
329 HR06 HR06 2 HR Sjeverna Hrvatska Sjeverna Hrvatska
330 NO02 NO02 2 NO Innlandet Innlandet
331 NO06 NO06 2 NO Trøndelag Trøndelag
332 NO07 NO07 2 NO Nord-Norge Nord-Norge
333 NO08 NO08 2 NO Oslo og Viken Oslo og Viken
MOUNT_TYPE URBN_TYPE COAST_TYPE FID \
0 0 NaN NaN DE50
1 0 NaN NaN DE60
2 0 NaN NaN DE71
3 0 NaN NaN DE72
4 0 NaN NaN DE73
.. ... ... ... ...
329 0 0.0 0.0 HR06
330 0 NaN NaN NO02
331 0 NaN NaN NO06
332 0 NaN NaN NO07
333 0 0.0 0.0 NO08
geometry
0 MULTIPOLYGON (((4248229.070 3323043.884, 42345...
1 MULTIPOLYGON (((4336708.861 3376535.119, 43414...
2 POLYGON ((4253056.068 3043343.224, 4257541.935...
3 POLYGON ((4248924.963 3092384.236, 4258523.883...
4 POLYGON ((4299188.570 3163540.672, 4298283.911...
.. ...
329 POLYGON ((4885838.460 2569452.540, 4878828.590...
330 POLYGON ((4438332.480 4360687.112, 4440904.728...
331 MULTIPOLYGON (((4414585.206 4664076.456, 44179...
332 MULTIPOLYGON (((5073773.420 5207018.495, 50676...
333 POLYGON ((4424393.606 4083582.648, 4429455.154...
[334 rows x 11 columns]
%% Cell type:code id: tags:
``` python
gdf.info()
```
%% Output
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 334 non-null object
1 NUTS_ID 334 non-null object
2 LEVL_CODE 334 non-null int64
3 CNTR_CODE 334 non-null object
4 NAME_LATN 334 non-null object
5 NUTS_NAME 334 non-null object
6 MOUNT_TYPE 334 non-null int64
7 URBN_TYPE 7 non-null float64
8 COAST_TYPE 7 non-null float64
9 FID 334 non-null object
10 geometry 334 non-null geometry
dtypes: float64(2), geometry(1), int64(2), object(6)
memory usage: 28.8+ KB
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
import numpy as np
```
%% Cell type:code id: tags:
``` python
from utils import load_country_country_data,sample_with_pandas,to_edgelist
from joblib import dump
from sklearn.preprocessing import LabelEncoder
# Load the country-to-country table through the project helper.
df = load_country_country_data("data/country_country_aug2020.tsv")
# Normalise scaled_sci by its column total, so the normalised values sum to 1.
df["norm_scaled_sci"] = df.scaled_sci/df.scaled_sci.sum()
```
%% Cell type:code id: tags:
``` python
!mkdir data/graph_second_s
```
%% Cell type:code id: tags:
``` python
# Fit a single encoder on every location code that appears as a source
# (user_loc) or a target (fr_loc), so all sampled graphs share one label mapping.
encoder = LabelEncoder()
encoder.fit(np.concatenate((df.user_loc.values, df.fr_loc.values)))

# For each of 10 repetitions and each size, draw a sample and save it as a
# comma-separated edgelist without header or index.
# NOTE(review): `size` is presumably the number of sampled edges -- confirm
# against sample_with_pandas.
for i in range(10):
    for size in [50, 100, 200, 500, 1000]:
        test = sample_with_pandas(df, size)
        # The file name encodes both the repetition index and the sample size.
        to_edgelist(test, encoder, weight=True).to_csv(
            "data/graph_second_s/fb_country_country_sample_{0}_size{1}.txt".format(i, size),
            index=False,
            header=False,
            sep=",",
        )
```
%% Cell type:code id: tags:
``` python
# Save the fitted encoder with joblib in the same directory as the sampled graph files.
dump(encoder,"data/graph_second_s/encoder.joblib")
```
%% Output
['data/graph_second_s/encoder.joblib']
%% Cell type:code id: tags:
``` python
```
import numpy as np
from joblib import dump
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from lib.utils import load_country_country_data, sample_with_pandas, to_edgelist
......@@ -24,7 +25,8 @@ if not os.path.exists(args.output_dir):
# Load the data
df = load_country_country_data(args.input_tsv,self_link=args.self_link)
df["hash"] = df.apply(lambda row:"_".join(sorted([row.user_loc,row.fr_loc])),axis=1)
df = df.drop_duplicates(subset=['hash'])
# Normalise the sci index
df["norm_scaled_sci"] = df.scaled_sci/df.scaled_sci.sum()
......@@ -33,9 +35,11 @@ encoder.fit(np.concatenate((df.user_loc.values,df.fr_loc.values)))
for i in range(args.n): # For a number of graph
for size in args.dimensions: # Per size
test = sample_with_pandas(df,size) # sample edges using the normalised FB social interconnectedness index
output_df = to_edgelist(test,encoder,weight=True) # Parse to edgelist format
output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output
test = sample_with_pandas(df,size) # sample edges using the normalised FB social interconnectedness index
G = nx.from_pandas_edgelist(test, source="user_loc",target="fr_loc", edge_attr="weight", create_using=nx.Graph())
nx.write_gexf(G,args.output_dir + "/fb_country_country_sample_{0}_size{1}.gexf".format(i, size))
#output_df = to_edgelist(test,encoder,weight=True) # Parse to edgelist format
#output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output
# Save encoder to reverse the label transformation
dump(encoder,args.output_dir + "/encoder.joblib")
\ No newline at end of file
#dump(encoder,args.output_dir + "/encoder.joblib")
\ No newline at end of file
......@@ -49,7 +49,7 @@ def get_force_atlas(weight_influence=0, scaling_ratio=3.0, gravity=5):
return forceatlas2
def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_size=12, stroke_width=3,
def draw(G, labels_dict={}, positions = {}, iteration_force_atlase=2000, figsize=(40, 20), font_size=12, stroke_width=3,
stroke_color="black", font_color="white", edge_cmap=plt.cm.viridis, weight=True):
"""
Return a figure of a NetworkX graph
......@@ -82,10 +82,11 @@ def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_
plt.gcf() # Clean previous figure associated with the 'plt' instance
# Compute node position using the Force Atlas algorithm
force_atlas = get_force_atlas()
positions = force_atlas.forceatlas2_networkx_layout(G,
pos=None,
iterations=iteration_force_atlase)
if not positions:
force_atlas = get_force_atlas()
positions = force_atlas.forceatlas2_networkx_layout(G,
pos=None,
iterations=iteration_force_atlase)
# Initialise the figure canvas
fig, ax = plt.subplots(1, figsize=figsize)
......
......@@ -19,8 +19,7 @@ def load_country_country_data(filename, self_link=False):
pandas.Dataframe
data
"""
df = pd.read_csv(filename, sep="\t")
df = df[(~df.user_loc.isna()) & (~df.fr_loc.isna())]
df = pd.read_csv(filename, sep="\t").fillna("NA")
ign = ["CW", "XK"] # No coords for these two countries ... got to investigate!
df = df[(~df.user_loc.isin(ign)) & (~df.fr_loc.isin(ign))]
if not self_link:
......
......@@ -7,6 +7,7 @@ from lib.utils import load_edgelist
import os
import pandas as pd
from tqdm import tqdm
import networkx as nx
import argparse
......@@ -15,7 +16,7 @@ parser.add_argument("dataset_dir")
parser.add_argument("output_filename")
args = parser.parse_args()
fns = glob.glob(args.dataset_dir + "/*.txt")
fns = glob.glob(args.dataset_dir + "/*.gexf")
all_res = []
for fn in tqdm(fns):
......@@ -27,8 +28,13 @@ for fn in tqdm(fns):
continue
df_results = parse_evalne_output(open(fn + "_results_lp").read())
name = os.path.basename(fn)
edge_len = len(pd.read_csv(fn, sep="\t", header=None))
df_results["nb_edge"] = edge_len
G = nx.read_gexf(fn)
top10node = pd.DataFrame(list(G.degree()), columns="node degree".split()).sort_values("degree",ascending=False).head(10).node.values
df_results["nb_edge"] = len(list(G.edges()))
df_results["transitivity"] = nx.transitivity(G)
df_results["density"] = nx.density(G)
df_results["top10_node"] = "|".join(top10node)
df_results["size"] = len(G)
df_results["filename"] = name
all_res.append(df_results)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment