Commit b720b5e5 authored by Fize Jacques

Add graph generator for stochastic block model, configuration model that takes nb of nodes and nb of edges in parameter + Debug
parent fe63b6d3
@@ -40,7 +40,7 @@ log("Density "+ str(len(G)/len(list(G.edges()))))
log("Building link prediction dataset...")
# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1)
traintest_split.compute_splits(G,split_alg="fast",train_frac=0.6,fe_ratio=1)
nee = LPEvaluator(traintest_split)
log("Dataset Built !")
@@ -26,14 +26,16 @@ def generate_sbm_prob_matrix(nb_of_blocks,prob_btw_block=0.1):
GRAPH_SIZE = [50,75,100]
EDGE_SIZE = []
OUTPUT_DIR = args.output_dir
if not os.path.exists(OUTPUT_DIR):
    raise FileNotFoundError("Output directory does not exist!")
parameters = {
"planted_partition_graph": {
"l": [3,5,8],
"k": [10,20],
"l": [3,5,8], # nb of groups
"k": [10,20], # nb de noeud
"p_in": [0.2,0.5,0.7],
"p_out": [0.1]
},
@@ -41,26 +43,12 @@ parameters = {
"sizes": [[random.choice([10,20,30]) for k in range(i)] for i in [3,5,8]],
"p": [] # Filled later
},
"fast_gnp_random_graph": {
"dense_gnm_random_graph": {
"n": GRAPH_SIZE,
"p": [0.4,0.6]
"m": EDGE_SIZE
},
"random_powerlaw_tree_sequence": { # configuration_model
"powerlaw_graph": { # configuration_model
"n": GRAPH_SIZE,
"tries":[10000]
},
# "random_geometric_graph": {
# "n": GRAPH_SIZE,
# "radius": [0.4,0.6]
# },
"waxman_graph": {
"n": GRAPH_SIZE,
"beta": [0.4,0.6],
"alpha": [0.4,0.6]
},
"geographical_threshold_graph": {
"n": GRAPH_SIZE,
"theta": [0.2,0.3]
},
}
# Generating transition matrices for stochastic block model
# coding = utf-8
from collections.abc import Iterable
import numpy as np
import networkx as nx
import pandas as pd
import random
def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
    nb_stubs = nb_edges * 2
    # Draw an initial powerlaw degree sequence
    degs = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent))
    degs = degs[degs >= min_deg]
    # Compute the degree sum
    sum_deg = degs.sum()
    for _ in range(tries):
        # If the sum of the degree sequence equals the number of stubs, we are done
        if sum_deg == nb_stubs:
            return degs
        # Draw a new powerlaw degree sequence
        new_draw = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent))
        new_draw = new_draw[new_draw >= min_deg]
        new_sum_deg = new_draw.sum()
        # Keep the new sequence if it is closer to the objective than the previous draw
        if abs(nb_stubs - new_sum_deg) < abs(nb_stubs - sum_deg):
            degs = new_draw
            sum_deg = new_sum_deg
    # If, after the final draw, the degree sum still differs from the expected number of stubs
    if not sum_deg == nb_stubs:
        # Randomly pick degrees and increment (or decrement) their values
        diff = abs(sum_deg - nb_stubs)
        signe = -1 if (nb_stubs - sum_deg) < 0 else 1
        indexes = np.random.choice(np.arange(len(degs)), int(diff))
        for ind in indexes:
            degs[ind] = degs[ind] + signe
    return degs
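For reference, a quick sanity check of the draw-and-adjust loop above (a sketch only, assuming the function is in scope; the values are illustrative):

seq = powerlaw(nb_nodes=100, nb_edges=300, exponent=2)
assert seq.sum() == 2 * 300  # one edge consumes exactly two stubs
G_demo = nx.configuration_model(seq.astype(int))  # multigraph with exactly 300 edges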
def get_countries_coords():
    """
    Return the coordinates of each country in the world.

    Returns
    -------
    np.ndarray
        coordinates
    """
    try:
        import geopandas as gpd
    except ImportError:
        raise ImportError("Geopandas is not installed!")
    gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
    return np.asarray(gdf.centroid.apply(lambda x: [x.x, x.y]).values.tolist())
def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
    # Configuration model built from a powerlaw degree sequence whose sum is 2 * nb_edges
    return nx.configuration_model(powerlaw(nb_nodes, nb_edges, exponent, tries, min_deg).astype(int))
def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False):
    if isinstance(coords, Iterable) and not isinstance(coords, str):
        if len(coords) != nb_nodes:
            raise ValueError("number of nodes must match the size of the coords list")
    elif coords == "random":
        coords = np.random.random(nb_nodes * 2).reshape(nb_nodes, 2)
        coords[:, 0] = (coords[:, 0] * 360) - 180
        coords[:, 1] = (coords[:, 1] * 180) - 90
    else:
        coords = get_countries_coords()
        if nb_nodes > len(coords):
            raise ValueError(
                "Too many nodes for coords = \"country\". Change nb_nodes value or change coords to 'random' or your own list of coords")
        coords_index = np.random.choice(np.arange(len(coords)), nb_nodes)
        coords = coords[coords_index]
    # Compute the distance between every pair of nodes
    data = []
    for i in range(nb_nodes):
        for j in range(nb_nodes):
            if i == j and not self_link:
                continue
            data.append([i, j, dist_func(coords[i], coords[j])])
    df = pd.DataFrame(data, columns="src tar weight".split())
    # Sample nb_edges node pairs, weighted by their distance
    df = df.sample(nb_edges, weights="weight")
    G = nx.from_pandas_edgelist(df, source="src", target="tar", edge_attr="weight")
    for n in list(G.nodes()):
        G.nodes[n]["pos"] = coords[n]
    return G
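A minimal usage sketch of the spatial generator above (assuming the "random" coordinate mode, which avoids the geopandas dependency of the default "country" mode; the values are illustrative):

G_spatial = spatial_graph(nb_nodes=50, nb_edges=200, coords="random")
# Sampling is done over ordered pairs, so a few symmetric duplicates may collapse
print(G_spatial.number_of_nodes(), G_spatial.number_of_edges())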
def ER_graph(nb_nodes, nb_edges):
    # G(n, m) Erdos-Renyi variant with a fixed number of nodes and edges
    return nx.dense_gnm_random_graph(nb_nodes, nb_edges)
def stochastic_block_model_graph(nb_nodes, nb_edges, nb_com, percentage_edge_betw, verbose=False):
    percentage_edge_within = 1 - percentage_edge_betw
    if nb_edges > (1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2:
        raise ValueError("nb_edges must be lower than {0}".format((1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2))
    # Build a fully connected planted partition graph only to obtain the block assignment
    G = nx.planted_partition_graph(nb_com, int(np.round(nb_nodes / nb_com)), 1, 1)
    if verbose:
        print(G.size())
    block_assign = nx.get_node_attributes(G, "block")

    # Enumerate every node pair once and split it into inter- and intra-block pairs
    inter_edges, intra_edges = [], []
    register = set([])
    for n1 in list(G.nodes()):
        for n2 in list(G.nodes()):
            hash_ = "_".join(sorted([str(n1), str(n2)]))
            if (n1 == n2) or (hash_ in register):
                continue
            b1, b2 = block_assign[n1], block_assign[n2]
            if b1 != b2:
                inter_edges.append([n1, n2])
            else:
                intra_edges.append([n1, n2])
            register.add(hash_)

    inter_edges = np.asarray(inter_edges)
    intra_edges = np.asarray(intra_edges)
    inter_N, intra_N = len(inter_edges), len(intra_edges)
    if verbose:
        print(inter_N, intra_N)
        print(int(np.ceil(nb_edges * percentage_edge_betw)), int(np.ceil(nb_edges * percentage_edge_within)))

    # Sample the requested share of inter- and intra-block edges
    final_edges = []
    index_inter = np.random.choice(np.arange(inter_N), int(np.ceil(nb_edges * percentage_edge_betw)), replace=False)
    index_intra = np.random.choice(np.arange(intra_N), int(np.ceil(nb_edges * percentage_edge_within)), replace=False)
    final_edges.extend(inter_edges[index_inter])
    final_edges.extend(intra_edges[index_intra])
    if verbose:
        print(len(final_edges))

    G2 = nx.from_edgelist(final_edges)
    for n in list(G2.nodes()):
        G2.nodes[n]["block"] = block_assign[n]
    return G2
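For orientation, a usage sketch of the generators defined above that take a node and an edge budget (a sketch only; parameter values are illustrative, and 90 nodes with 3 communities keeps the SBM block sizes integral):

G_er = ER_graph(nb_nodes=100, nb_edges=300)
G_pl = powerlaw_graph(nb_nodes=100, nb_edges=300, exponent=2)
G_sbm = stochastic_block_model_graph(nb_nodes=90, nb_edges=300, nb_com=3, percentage_edge_betw=0.1)
# Each graph should end up with (close to) 300 edges
print(G_er.number_of_edges(), G_pl.number_of_edges(), G_sbm.number_of_edges())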
@@ -83,3 +83,5 @@ def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    df = pd.read_csv(path, header=None, names="source target".split(), sep=sep)
    G = nx.from_pandas_edgelist(df, create_using=template)
    return G
@@ -22,15 +22,14 @@ fns = sorted(glob.glob(args.dataset_dir + "/*." + args.format))
all_res = []
pbar = tqdm(fns)
for fn in pbar:
    if os.path.exists(fn + "_results_lp"):
        continue
    pbar.set_description("run eval on " + fn)
    command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
    output = subprocess.run(command, stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT)
    if not output.returncode == 0:
        print("Error! for the command :", " ".join(command))
        continue
    if not os.path.exists(fn + "_results_lp"):
        command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
        output = subprocess.run(command)
        if not output.returncode == 0:
            print("Error! for the command :", " ".join(command))
            continue
    df_results = parse_evalne_output(open(fn + "_results_lp").read())
    name = os.path.basename(fn)
    G = None