From b720b5e5936c082d58264a8287a2549ccbd7f9ff Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Thu, 28 Jan 2021 18:39:17 +0100 Subject: [PATCH] Add graph generator for stochastic block model, configuration model that takes nb of nodes and nb of edges in parameter + Debug --- evalNE_script.py | 2 +- generate_theoric_random_graph.py | 26 ++---- lib/random.py | 140 +++++++++++++++++++++++++++++++ lib/utils.py | 2 + run_eval.py | 15 ++-- 5 files changed, 157 insertions(+), 28 deletions(-) create mode 100644 lib/random.py diff --git a/evalNE_script.py b/evalNE_script.py index 29fdf12..dc79263 100644 --- a/evalNE_script.py +++ b/evalNE_script.py @@ -40,7 +40,7 @@ log("Density "+ str(len(G)/len(list(G.edges())))) log("Building link prediction dataset...") # Create an evaluator and generate train/test edge split traintest_split = LPEvalSplit() -traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1) +traintest_split.compute_splits(G,split_alg="fast",train_frac=0.6,fe_ratio=1) nee = LPEvaluator(traintest_split) log("Dataset Built !") diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py index 0709bda..1dde7cf 100644 --- a/generate_theoric_random_graph.py +++ b/generate_theoric_random_graph.py @@ -26,14 +26,16 @@ def generate_sbm_prob_matrix(nb_of_blocks,prob_btw_block=0.1): GRAPH_SIZE = [50,75,100] +EDGE_SIZE = [] OUTPUT_DIR = args.output_dir if not os.path.exists(OUTPUT_DIR): raise FileExistsError("Output directory does not exists !") +nx.waxman_graph parameters = { "planted_partition_graph": { - "l": [3,5,8], - "k": [10,20], + "l": [3,5,8], # nb of groups + "k": [10,20], # nb de noeud "p_in": [0.2,0.5,0.7], "p_out": [0.1] }, @@ -41,26 +43,12 @@ parameters = { "sizes": [[random.choice([10,20,30]) for k in range(i)] for i in [3,5,8]], "p": [] # Filled later }, - "fast_gnp_random_graph": { + "dense_gnm_random_graph": { "n": GRAPH_SIZE, - "p": [0.4,0.6] + "m": EDGE_SIZE }, - "random_powerlaw_tree_sequence": { # configuration_model + "powerlaw_graph": { # configuration_model "n": GRAPH_SIZE, - "tries":[10000] - }, - # "random_geometric_graph": { - # "n": GRAPH_SIZE, - # "radius": [0.4,0.6] - # }, - "waxman_graph": { - "n": GRAPH_SIZE, - "beta": [0.4,0.6], - "alpha": [0.4,0.6] - }, - "geographical_threshold_graph": { - "n": GRAPH_SIZE, - "theta": [0.2,0.3] }, } # Generating transition matrices for stochastic block model diff --git a/lib/random.py b/lib/random.py new file mode 100644 index 0000000..8b57697 --- /dev/null +++ b/lib/random.py @@ -0,0 +1,140 @@ +# coding = utf-8 +from collections import Iterable + +import numpy as np +import networkx as nx +import pandas as pd + +import random + + +def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1): + nb_stubs = nb_edges * 2 + # Draw a first time a powerlaw degree sequence + degs = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent)) + + degs = degs[degs >= min_deg] + # Compute de degree sum + sum_deg = degs.sum() + + for _ in range(tries): + # If the sum of the degree sequence is equal to the number of stubs, then it's good + if sum_deg == nb_stubs: + return degs + # Draw a a new powerlaw degree sequence + new_draw = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent)) + new_draw = new_draw[new_draw >= min_deg] + new_sum_deg = new_draw.sum() + + # If the new degree sequence is closer to the objective than the previously draw sequence + if abs(nb_stubs - new_sum_deg) < abs(nb_stubs - sum_deg): + degs = new_draw + sum_deg = new_sum_deg + + # Once the final draw is executed and the sequence degree sum is not equal to number of stubs expected + if not sum_deg == nb_stubs: + # We randomly pick sequence degrees and increment (or decrement) their values + diff = abs(sum_deg - nb_stubs) + signe = -1 if (nb_stubs - sum_deg) < 0 else 1 + indexes = np.random.choice(np.arange(len(degs)), int(diff)) + for ind in indexes: + degs[ind] = degs[ind] + signe + + return degs + + +def get_countries_coords(): + """ + Return the coordinates of each country in the world. + Returns + ------- + np.ndarray + coordinates + """ + try: + import geopandas as gpd + except: + raise ImportError("Geopandas is not installed !") + gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) + return np.asarray(gdf.centroid.apply(lambda x: [x.x, x.y]).values.tolist()) + + +def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1): + return nx.configuration_model(powerlaw(nb_nodes,nb_edges,exponent,tries,min_deg)) + +def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False): + if coords and isinstance(coords, Iterable) and not isinstance(coords, str): + if len(coords) != nb_nodes: + raise ValueError("number of nodes must match the size of the coords dict") + elif coords == "random": + coords = np.random.random(nb_nodes * 2).reshape(nb_nodes, 2) + coords[:, 0] = (coords[:, 0] * 360) - 180 + coords[:, 1] = (coords[:, 1] * 180) - 90 + else: + coords = get_countries_coords() + if nb_nodes > len(coords): + raise ValueError( + "Too many nodes for coords = \"country\". Change nb_nodes value or change coords to 'random' or your own list of coords") + coords_index = np.random.choice(np.arange(len(coords)), nb_nodes) + coords = coords[coords_index] + data = [] + for i in range(nb_nodes): + for j in range(nb_nodes): + if i == j and not self_link: + continue + data.append([i, j, dist_func(coords[i], coords[j])]) + df = pd.DataFrame(data, columns="src tar weight".split()) + df = df.sample(nb_edges, weights="weight") + G = nx.from_pandas_edgelist(df, source="src", target="tar", edge_attr="weight") + for n in list(G.nodes()): G.nodes[n]["pos"] = coords[n] + return G + +def ER_graph(nb_nodes,nb_edges): + return nx.dense_gnm_random_graph(nb_nodes,nb_edges) + + +def stochastic_block_model_graph(nb_nodes,nb_edges,nb_com,percentage_edge_betw,verbose=False): + + percentage_edge_within = 1 - percentage_edge_betw + if nb_edges > (1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2: + raise ValueError("nb_edges must be inferior to {0}".format((1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2)) + + G = nx.planted_partition_graph(nb_com, int(np.round(nb_nodes / nb_com)), 1, 1) + if verbose: + print(G.size()) + + block_assign = nx.get_node_attributes(G, "block") + inter_edges,intra_edges = [], [] + register = set([]) + for n1 in list(G.nodes()): + for n2 in list(G.nodes()): + hash_ = "_".join(sorted([str(n1), str(n2)])) + if (n1 == n2) or (hash_ in register): + continue + b1, b2 = block_assign[n1], block_assign[n2] + if b1 != b2: + inter_edges.append([n1, n2]) + else: + intra_edges.append([n1, n2]) + register.add(hash_) + + inter_edges = np.asarray(inter_edges) + intra_edges = np.asarray(intra_edges) + inter_N, intra_N = len(inter_edges), len(intra_edges) + + if verbose: + print(inter_N, intra_N) + print(int(np.ceil(nb_edges * percentage_edge_betw)), int(np.ceil(nb_edges * percentage_edge_within))) + + final_edges = [] + index_inter = np.random.choice(np.arange(inter_N), int(np.ceil(nb_edges * percentage_edge_betw)), replace=False) + index_intra = np.random.choice(np.arange(intra_N), int(np.ceil(nb_edges * percentage_edge_within)), replace=False) + final_edges.extend(inter_edges[index_inter]) + final_edges.extend(intra_edges[index_intra]) + if verbose: + print(len(final_edges)) + + G2 = nx.from_edgelist(final_edges) + for n in list(G2.nodes()): + G2.nodes[n]["block"] = block_assign[n] + return G2 diff --git a/lib/utils.py b/lib/utils.py index 958dce8..aa53243 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -83,3 +83,5 @@ def load_edgelist(path, weighted=False, is_directed=False, sep=","): df = pd.read_csv(path, header=None, names="source target".split(),sep=sep) G = nx.from_pandas_edgelist(df, create_using=template) return G + + diff --git a/run_eval.py b/run_eval.py index 21d3958..c4a739b 100644 --- a/run_eval.py +++ b/run_eval.py @@ -22,15 +22,14 @@ fns = sorted(glob.glob(args.dataset_dir + "/*." + args.format)) all_res = [] pbar = tqdm(fns) for fn in pbar: - if os.path.exists(fn + "_results_lp"): - continue pbar.set_description("run eval on "+ fn) - command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split() - output = subprocess.run(command,stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT) - if not output.returncode == 0: - print("Error! for the command :", " ".join(command)) - continue + + if not os.path.exists(fn + "_results_lp"): + command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split() + output = subprocess.run(command) + if not output.returncode == 0: + print("Error! for the command :", " ".join(command)) + continue df_results = parse_evalne_output(open(fn + "_results_lp").read()) name = os.path.basename(fn) G = None -- GitLab