Commit b720b5e5 authored by Fize Jacques

Add graph generator for stochastic block model, configuration model that takes nb of nodes and nb of edges in parameter + Debug
parent fe63b6d3
@@ -40,7 +40,7 @@ log("Density "+ str(len(G)/len(list(G.edges()))))
log("Building link prediction dataset...")
# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1)
traintest_split.compute_splits(G,split_alg="fast",train_frac=0.6,fe_ratio=1)
nee = LPEvaluator(traintest_split)
log("Dataset Built !")
@@ -26,14 +26,16 @@ def generate_sbm_prob_matrix(nb_of_blocks,prob_btw_block=0.1):
GRAPH_SIZE = [50,75,100]
EDGE_SIZE = []
OUTPUT_DIR = args.output_dir
if not os.path.exists(OUTPUT_DIR):
    raise FileNotFoundError("Output directory does not exist!")
parameters = {
"planted_partition_graph": {
"l": [3,5,8],
"k": [10,20],
"l": [3,5,8], # nb of groups
"k": [10,20], # nb de noeud
"p_in": [0.2,0.5,0.7],
"p_out": [0.1]
},
@@ -41,26 +43,12 @@ parameters = {
"sizes": [[random.choice([10,20,30]) for k in range(i)] for i in [3,5,8]],
"p": [] # Filled later
},
"fast_gnp_random_graph": {
"dense_gnm_random_graph": {
"n": GRAPH_SIZE,
"p": [0.4,0.6]
"m": EDGE_SIZE
},
"random_powerlaw_tree_sequence": { # configuration_model
"powerlaw_graph": { # configuration_model
"n": GRAPH_SIZE,
"tries":[10000]
},
# "random_geometric_graph": {
# "n": GRAPH_SIZE,
# "radius": [0.4,0.6]
# },
"waxman_graph": {
"n": GRAPH_SIZE,
"beta": [0.4,0.6],
"alpha": [0.4,0.6]
},
"geographical_threshold_graph": {
"n": GRAPH_SIZE,
"theta": [0.2,0.3]
},
}
# Generating transition matrices for stochastic block model
# coding = utf-8
from collections.abc import Iterable
import numpy as np
import networkx as nx
import pandas as pd
import random
def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
    nb_stubs = nb_edges * 2
    # Draw an initial powerlaw degree sequence
    degs = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent))
    degs = degs[degs >= min_deg]
    # Compute the degree sum
    sum_deg = degs.sum()
    for _ in range(tries):
        # If the sum of the degree sequence equals the number of stubs, we are done
        if sum_deg == nb_stubs:
            return degs
        # Draw a new powerlaw degree sequence
        new_draw = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent))
        new_draw = new_draw[new_draw >= min_deg]
        new_sum_deg = new_draw.sum()
        # Keep the new sequence if it is closer to the objective than the previous draw
        if abs(nb_stubs - new_sum_deg) < abs(nb_stubs - sum_deg):
            degs = new_draw
            sum_deg = new_sum_deg
    # If, after the final draw, the degree sum still differs from the expected number of stubs
    if not sum_deg == nb_stubs:
        # Randomly pick degrees and increment (or decrement) their values
        diff = abs(sum_deg - nb_stubs)
        signe = -1 if (nb_stubs - sum_deg) < 0 else 1
        indexes = np.random.choice(np.arange(len(degs)), int(diff))
        for ind in indexes:
            degs[ind] = degs[ind] + signe
    return degs
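For reference, a quick sanity check of the draw-and-adjust loop above (a sketch only, assuming the function is in scope; the values are illustrative):

seq = powerlaw(nb_nodes=100, nb_edges=300, exponent=2)
assert seq.sum() == 2 * 300  # one edge consumes exactly two stubs
G_demo = nx.configuration_model(seq.astype(int))  # multigraph with exactly 300 edges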
def get_countries_coords():
    """
    Return the coordinates of each country in the world.

    Returns
    -------
    np.ndarray
        coordinates
    """
    try:
        import geopandas as gpd
    except ImportError:
        raise ImportError("Geopandas is not installed!")
    gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
    return np.asarray(gdf.centroid.apply(lambda x: [x.x, x.y]).values.tolist())
def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
    # Configuration model built from a powerlaw degree sequence whose sum is 2 * nb_edges
    return nx.configuration_model(powerlaw(nb_nodes, nb_edges, exponent, tries, min_deg).astype(int))
def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False):
    if isinstance(coords, Iterable) and not isinstance(coords, str):
        if len(coords) != nb_nodes:
            raise ValueError("number of nodes must match the size of the coords list")
    elif coords == "random":
        coords = np.random.random(nb_nodes * 2).reshape(nb_nodes, 2)
        coords[:, 0] = (coords[:, 0] * 360) - 180
        coords[:, 1] = (coords[:, 1] * 180) - 90
    else:
        coords = get_countries_coords()
        if nb_nodes > len(coords):
            raise ValueError(
                "Too many nodes for coords = \"country\". Change nb_nodes value or change coords to 'random' or your own list of coords")
        coords_index = np.random.choice(np.arange(len(coords)), nb_nodes)
        coords = coords[coords_index]
    # Compute the distance between every pair of nodes
    data = []
    for i in range(nb_nodes):
        for j in range(nb_nodes):
            if i == j and not self_link:
                continue
            data.append([i, j, dist_func(coords[i], coords[j])])
    df = pd.DataFrame(data, columns="src tar weight".split())
    # Sample nb_edges node pairs, weighted by their distance
    df = df.sample(nb_edges, weights="weight")
    G = nx.from_pandas_edgelist(df, source="src", target="tar", edge_attr="weight")
    for n in list(G.nodes()):
        G.nodes[n]["pos"] = coords[n]
    return G
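A minimal usage sketch of the spatial generator above (assuming the "random" coordinate mode, which avoids the geopandas dependency of the default "country" mode; the values are illustrative):

G_spatial = spatial_graph(nb_nodes=50, nb_edges=200, coords="random")
# Sampling is done over ordered pairs, so a few symmetric duplicates may collapse
print(G_spatial.number_of_nodes(), G_spatial.number_of_edges())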
def ER_graph(nb_nodes, nb_edges):
    # G(n, m) Erdos-Renyi variant with a fixed number of nodes and edges
    return nx.dense_gnm_random_graph(nb_nodes, nb_edges)
def stochastic_block_model_graph(nb_nodes, nb_edges, nb_com, percentage_edge_betw, verbose=False):
    percentage_edge_within = 1 - percentage_edge_betw
    if nb_edges > (1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2:
        raise ValueError("nb_edges must be lower than {0}".format((1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2))
    # Build a fully connected planted partition graph only to obtain the block assignment
    G = nx.planted_partition_graph(nb_com, int(np.round(nb_nodes / nb_com)), 1, 1)
    if verbose:
        print(G.size())
    block_assign = nx.get_node_attributes(G, "block")

    # Enumerate every node pair once and split it into inter- and intra-block pairs
    inter_edges, intra_edges = [], []
    register = set([])
    for n1 in list(G.nodes()):
        for n2 in list(G.nodes()):
            hash_ = "_".join(sorted([str(n1), str(n2)]))
            if (n1 == n2) or (hash_ in register):
                continue
            b1, b2 = block_assign[n1], block_assign[n2]
            if b1 != b2:
                inter_edges.append([n1, n2])
            else:
                intra_edges.append([n1, n2])
            register.add(hash_)

    inter_edges = np.asarray(inter_edges)
    intra_edges = np.asarray(intra_edges)
    inter_N, intra_N = len(inter_edges), len(intra_edges)
    if verbose:
        print(inter_N, intra_N)
        print(int(np.ceil(nb_edges * percentage_edge_betw)), int(np.ceil(nb_edges * percentage_edge_within)))

    # Sample the requested share of inter- and intra-block edges
    final_edges = []
    index_inter = np.random.choice(np.arange(inter_N), int(np.ceil(nb_edges * percentage_edge_betw)), replace=False)
    index_intra = np.random.choice(np.arange(intra_N), int(np.ceil(nb_edges * percentage_edge_within)), replace=False)
    final_edges.extend(inter_edges[index_inter])
    final_edges.extend(intra_edges[index_intra])
    if verbose:
        print(len(final_edges))

    G2 = nx.from_edgelist(final_edges)
    for n in list(G2.nodes()):
        G2.nodes[n]["block"] = block_assign[n]
    return G2
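For orientation, a usage sketch of the generators defined above that take a node and an edge budget (a sketch only; parameter values are illustrative, and 90 nodes with 3 communities keeps the SBM block sizes integral):

G_er = ER_graph(nb_nodes=100, nb_edges=300)
G_pl = powerlaw_graph(nb_nodes=100, nb_edges=300, exponent=2)
G_sbm = stochastic_block_model_graph(nb_nodes=90, nb_edges=300, nb_com=3, percentage_edge_betw=0.1)
# Each graph should end up with (close to) 300 edges
print(G_er.number_of_edges(), G_pl.number_of_edges(), G_sbm.number_of_edges())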
@@ -83,3 +83,5 @@ def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    df = pd.read_csv(path, header=None, names="source target".split(), sep=sep)
    G = nx.from_pandas_edgelist(df, create_using=template)
    return G
@@ -22,15 +22,14 @@ fns = sorted(glob.glob(args.dataset_dir + "/*." + args.format))
all_res = []
pbar = tqdm(fns)
for fn in pbar:
    if os.path.exists(fn + "_results_lp"):
        continue
    pbar.set_description("run eval on " + fn)
    command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
    output = subprocess.run(command, stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT)
    if not output.returncode == 0:
        print("Error! for the command :", " ".join(command))
        continue
    if not os.path.exists(fn + "_results_lp"):
        command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
        output = subprocess.run(command)
        if not output.returncode == 0:
            print("Error! for the command :", " ".join(command))
            continue
    df_results = parse_evalne_output(open(fn + "_results_lp").read())
    name = os.path.basename(fn)
    G = None