From b720b5e5936c082d58264a8287a2549ccbd7f9ff Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Thu, 28 Jan 2021 18:39:17 +0100
Subject: [PATCH] Add graph generators (stochastic block model, configuration
 model) that take the number of nodes and of edges as parameters + debug

---
 evalNE_script.py                 |   2 +-
 generate_theoric_random_graph.py |  26 ++----
 lib/random.py                    | 140 +++++++++++++++++++++++++++++++
 lib/utils.py                     |   2 +
 run_eval.py                      |  15 ++--
 5 files changed, 157 insertions(+), 28 deletions(-)
 create mode 100644 lib/random.py

diff --git a/evalNE_script.py b/evalNE_script.py
index 29fdf12..dc79263 100644
--- a/evalNE_script.py
+++ b/evalNE_script.py
@@ -40,7 +40,7 @@ log("Density "+ str(len(G)/len(list(G.edges()))))
 log("Building link prediction dataset...")
 # Create an evaluator and generate train/test edge split
 traintest_split = LPEvalSplit()
-traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1)
+traintest_split.compute_splits(G,split_alg="fast",train_frac=0.6,fe_ratio=1)
 nee = LPEvaluator(traintest_split)
 
 log("Dataset Built !")
diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py
index 0709bda..1dde7cf 100644
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -26,14 +26,16 @@ def generate_sbm_prob_matrix(nb_of_blocks,prob_btw_block=0.1):
 
 
 GRAPH_SIZE = [50,75,100]
+EDGE_SIZE = []
 OUTPUT_DIR = args.output_dir
 if not os.path.exists(OUTPUT_DIR):
-    raise FileExistsError("Output directory does not exists !")
+    raise FileNotFoundError("Output directory does not exists !")  # the directory is missing, not already present
 
+# Parameter grid: one entry per graph generator name, each argument mapped to its candidate values
 parameters = {
     "planted_partition_graph": {
-        "l": [3,5,8],
-        "k": [10,20],
+        "l": [3,5,8], # number of groups
+        "k": [10,20], # number of nodes in each group
         "p_in": [0.2,0.5,0.7],
         "p_out": [0.1]
     },
@@ -41,26 +43,12 @@ parameters = {
         "sizes": [[random.choice([10,20,30]) for k in range(i)] for i in [3,5,8]],
         "p": [] # Filled later
     },
-    "fast_gnp_random_graph": {
+    "dense_gnm_random_graph": {
         "n": GRAPH_SIZE,
-        "p": [0.4,0.6]
+        "m": EDGE_SIZE
     },
-    "random_powerlaw_tree_sequence": {  # configuration_model
+    "powerlaw_graph": {  # configuration_model
         "n": GRAPH_SIZE,
-        "tries":[10000]
-    },
-    # "random_geometric_graph": {
-    #     "n": GRAPH_SIZE,
-    #     "radius": [0.4,0.6]
-    # },
-    "waxman_graph": {
-        "n": GRAPH_SIZE,
-        "beta": [0.4,0.6],
-        "alpha": [0.4,0.6]
-    },
-    "geographical_threshold_graph": {
-        "n": GRAPH_SIZE,
-        "theta": [0.2,0.3]
     },
 }
 # Generating transition matrices for stochastic block model
diff --git a/lib/random.py b/lib/random.py
new file mode 100644
index 0000000..8b57697
--- /dev/null
+++ b/lib/random.py
@@ -0,0 +1,140 @@
+# coding = utf-8
+from collections.abc import Iterable  # the alias in "collections" was removed in Python 3.10
+
+import numpy as np
+import networkx as nx
+import pandas as pd
+
+import random
+
+
+def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
+    """
+    Draw a power-law degree sequence whose sum is equal to ``2 * nb_edges`` (the number of stubs).
+
+    Up to ``tries`` new sequences are drawn and the one whose sum is the closest to the number of
+    stubs is kept; the remaining gap is closed by incrementing (or decrementing) random degrees.
+    Degrees lower than ``min_deg`` are dropped: the result may hold fewer than ``nb_nodes`` values.
+
+    A ValueError is raised when the gap cannot be closed because no degree can be adjusted.
+    """
+    def draw():
+        seq = np.round(nx.utils.powerlaw_sequence(nb_nodes, exponent=exponent))
+        return seq[seq >= min_deg]
+
+    nb_stubs = nb_edges * 2
+    degs = draw()
+    for _ in range(tries):
+        # If the sum of the degree sequence is equal to the number of stubs, then it's good
+        if degs.sum() == nb_stubs:
+            return degs
+        new_draw = draw()
+        # Keep the new sequence only if it is closer to the objective than the previous one
+        if abs(nb_stubs - new_draw.sum()) < abs(nb_stubs - degs.sum()):
+            degs = new_draw
+
+    # Close the remaining gap one stub at a time, never pushing a degree below min_deg
+    signe = -1 if degs.sum() > nb_stubs else 1
+    for _ in range(int(abs(degs.sum() - nb_stubs))):
+        candidates = np.flatnonzero(degs > min_deg) if signe < 0 else np.arange(len(degs))
+        if len(candidates) == 0:
+            raise ValueError("No degree can be adjusted to reach {0} stubs".format(nb_stubs))
+        degs[np.random.choice(candidates)] += signe
+    return degs
+
+
+def get_countries_coords():
+    """
+    Return the centroid coordinates (x, y) of each country in the world.
+    Returns
+    -------
+    np.ndarray
+        one [x, y] row per entry of the "naturalearth_lowres" dataset
+    """
+    try:
+        import geopandas as gpd
+    except ImportError as err:  # only a missing package is reported, any other failure propagates
+        raise ImportError("Geopandas is not installed !") from err
+    gdf = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
+    return np.asarray(gdf.centroid.apply(lambda x: [x.x, x.y]).values.tolist())
+
+def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
+    """Return a configuration-model graph built from a ``powerlaw`` degree sequence."""
+    return nx.configuration_model(powerlaw(nb_nodes, nb_edges, exponent, tries, min_deg).astype(int))  # degrees must be integers
+
+def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False):
+    """
+    Build an undirected graph with ``nb_nodes`` located nodes and ``nb_edges`` edges: node pairs are
+    sampled without replacement with a probability proportional to ``dist_func``. ``coords`` is "random"
+    (uniform in [-180, 180) x [-90, 90)), a sized collection indexed by node, or else country centroids.
+    """
+    # The type is tested before anything else: the truth value of a numpy array of coords is ambiguous
+    if not isinstance(coords, str) and isinstance(coords, Iterable) and len(coords) > 0:
+        if len(coords) != nb_nodes:
+            raise ValueError("number of nodes must match the size of the coords dict")
+    elif isinstance(coords, str) and coords == "random":
+        coords = np.random.random((nb_nodes, 2)) * [360, 180] - [180, 90]
+    else:
+        coords = get_countries_coords()
+        if nb_nodes > len(coords):
+            raise ValueError(
+                "Too many nodes for coords = \"country\". Change nb_nodes value or change coords to 'random' or your own list of coords")
+        # Without replacement, so that two nodes never share the same country centroid
+        coords = coords[np.random.choice(len(coords), nb_nodes, replace=False)]
+    # Unordered pairs only: (i, j) and (j, i) are the same undirected edge and must not be drawn twice
+    data = [[i, j, dist_func(coords[i], coords[j])] for i in range(nb_nodes) for j in range(i if self_link else i + 1, nb_nodes)]
+    df = pd.DataFrame(data, columns="src tar weight".split()).sample(nb_edges, weights="weight")
+    G = nx.from_pandas_edgelist(df, source="src", target="tar", edge_attr="weight")
+    G.add_nodes_from(range(nb_nodes))  # keep the nodes that did not receive any edge
+    for n in list(G.nodes()): G.nodes[n]["pos"] = coords[n]
+    return G
+
+def ER_graph(nb_nodes,nb_edges):
+    """Return a random graph with ``nb_nodes`` nodes and ``nb_edges`` edges (networkx ``dense_gnm_random_graph``)."""
+    return nx.dense_gnm_random_graph(nb_nodes,nb_edges)
+
+def stochastic_block_model_graph(nb_nodes,nb_edges,nb_com,percentage_edge_betw,verbose=False):
+    """
+    Generate a graph made of ``nb_com`` equally sized communities and exactly ``nb_edges`` edges.
+
+    Parameters
+    ----------
+    nb_nodes : int
+        number of nodes, each community receives ``round(nb_nodes / nb_com)`` of them
+    nb_edges : int
+        number of edges of the returned graph
+    nb_com : int
+        number of communities
+    percentage_edge_betw : float
+        share (between 0 and 1) of the edges linking two different communities
+    verbose : bool
+        print the intermediate counts
+
+    Returns
+    -------
+    nx.Graph
+        graph whose nodes store their community index in the "block" attribute
+    """
+    max_edges = (1 / nb_com) * (nb_nodes * (nb_nodes - 1)) / 2
+    if nb_edges > max_edges:
+        raise ValueError("nb_edges must be inferior to {0}".format(max_edges))
+
+    # The fully connected planted partition is only used to get the nodes and their block
+    G = nx.planted_partition_graph(nb_com, int(np.round(nb_nodes / nb_com)), 1, 1)
+    block_assign = nx.get_node_attributes(G, "block")
+    nodes = list(G.nodes())
+    # Each unordered node pair is enumerated once, then filed as inter- or intra-community
+    pairs = [(n1, n2) for i, n1 in enumerate(nodes) for n2 in nodes[i + 1:]]
+    inter_edges = [p for p in pairs if block_assign[p[0]] != block_assign[p[1]]]
+    intra_edges = [p for p in pairs if block_assign[p[0]] == block_assign[p[1]]]
+    # Rounding once and taking the complement keeps the total equal to nb_edges
+    nb_inter = int(np.round(nb_edges * percentage_edge_betw))
+    nb_intra = nb_edges - nb_inter
+    if verbose:
+        print(G.size(), len(inter_edges), len(intra_edges), nb_inter, nb_intra)
+    G2 = nx.Graph()
+    G2.add_nodes_from(nodes)  # nodes left without any edge are kept
+    G2.add_edges_from(inter_edges[i] for i in np.random.choice(len(inter_edges), nb_inter, replace=False))
+    G2.add_edges_from(intra_edges[i] for i in np.random.choice(len(intra_edges), nb_intra, replace=False))
+    nx.set_node_attributes(G2, block_assign, "block")
+    return G2
diff --git a/lib/utils.py b/lib/utils.py
index 958dce8..aa53243 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -83,3 +83,5 @@ def load_edgelist(path, weighted=False, is_directed=False, sep=","):
         df = pd.read_csv(path, header=None, names="source target".split(),sep=sep)
         G = nx.from_pandas_edgelist(df, create_using=template)
     return G
+
+
diff --git a/run_eval.py b/run_eval.py
index 21d3958..c4a739b 100644
--- a/run_eval.py
+++ b/run_eval.py
@@ -22,15 +22,14 @@ fns = sorted(glob.glob(args.dataset_dir + "/*." + args.format))
 all_res = []
 pbar = tqdm(fns)
 for fn in pbar:
-    if os.path.exists(fn + "_results_lp"):
-        continue
     pbar.set_description("run eval on "+ fn)
-    command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
-    output = subprocess.run(command,stdout=subprocess.DEVNULL,
-    stderr=subprocess.STDOUT)
-    if not output.returncode == 0:
-        print("Error! for the command :", " ".join(command))
-        continue
+
+    if not os.path.exists(fn + "_results_lp"):
+        command = "python evalNE_script.py {0} -f {1} -n".format(fn, args.format).split()
+        output = subprocess.run(command)
+        if not output.returncode == 0:
+            print("Error! for the command :", " ".join(command))
+            continue
     df_results = parse_evalne_output(open(fn + "_results_lp").read())
     name = os.path.basename(fn)
     G = None
-- 
GitLab