From 3f21589136efb04d6404028543df0eb077a71ce4 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 29 Mar 2021 16:39:38 +0200
Subject: [PATCH] Debug graph generators and add mixed spatial/SBM model

---
 generate_theoric_random_graph.py |  48 ++++----
 lib/random.py                    | 204 ++++++++++++++++++++++---------
 lib/visualisation.py             |   9 +-
 3 files changed, 180 insertions(+), 81 deletions(-)

diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py
index 65c4555..53d6b9b 100644
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -17,8 +17,8 @@ parser.add_argument("output_dir")
 
 args = parser.parse_args()
 
-GRAPH_SIZE = [80,800]
-EDGE_SIZE = [2,4,5]
+GRAPH_SIZE = [80,800,5000]
+EDGE_SIZE = [2,4,5,10]
 sample_per_params = 10
 
 OUTPUT_DIR = args.output_dir
@@ -27,30 +27,36 @@ if not os.path.exists(OUTPUT_DIR):
 
 parameters = {
-    "stochastic_block_model_graph": {
+    # "stochastic_block_model_graph": {
+    #     "nb_nodes":GRAPH_SIZE,
+    #     "nb_edges":EDGE_SIZE,
+    #     "nb_com" :[2,5,8,16,10,25],
+    #     "percentage_edge_betw":[0.1,0.01]
+    # },
+    # "ER_graph": {
+    #     "nb_nodes":GRAPH_SIZE,
+    #     "nb_edges":EDGE_SIZE
+    # },
+    # "powerlaw_graph": { # configuration_model
+    #     "nb_nodes":GRAPH_SIZE,
+    #     "nb_edges":EDGE_SIZE,
+    #     "exponent":[2,3],
+    #     "tries":[100]
+    # },
+    # "spatial_graph":{
+    #     "nb_nodes":GRAPH_SIZE,
+    #     "nb_edges":EDGE_SIZE,
+    #     "coords":["random","country"],
+    # },
+    "mixed_model_spat_sbm":{
         "nb_nodes":GRAPH_SIZE,
         "nb_edges":EDGE_SIZE,
-        "nb_com" :[2,5,8,16],
-        "percentage_edge_betw":[0.1,0.01]
-    },
-    "ER_graph": {
-        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE
-    },
-    "powerlaw_graph": { # configuration_model
-        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
-        "exponent":[2,3],
-        "tries":[100]
-    },
-    "spatial_graph":{
-        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
-        "coords":["random","country"],
+        "nb_com":[2,4,8,16],
+        "alpha":[0,0.01,0.1,0.5,0.7,1]
+    }
 }
 
-
 def get_params(inp):
     return (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
 
diff --git a/lib/random.py b/lib/random.py
index 29f164c..1d09352 100644
--- a/lib/random.py
+++ b/lib/random.py
@@ -1,10 +1,12 @@
 # coding = utf-8
+import copy
 from collections import Iterable
 
 import numpy as np
 import networkx as nx
 import pandas as pd
 from networkx.generators.degree_seq import _to_stublist
+from cdlib import algorithms
 
 import random
 
@@ -150,7 +152,8 @@ def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=1000, min_deg=0):
     return G
 
 
-def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False,weighted = False):
+def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b) ** 2,
+                  self_link=False, weighted=False):
     """
     Generate a spatial graph with a specific number of vertices and edges
     Parameters
@@ -167,6 +170,9 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
     nx.Graph
         generated graph
     """
+    if nb_nodes > nb_edges:
+        raise ValueError(
+            "To obtain the requested number of nodes, the number of edges must be greater than or equal to the number of nodes!")
     if coords and isinstance(coords, Iterable) and not isinstance(coords, str):
         if len(coords) != nb_nodes:
             raise ValueError("number of nodes must match the size of the coords dict")
@@ -182,43 +188,64 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
         coords_index = np.random.choice(np.arange(len(coords)), nb_nodes)
         coords = coords[coords_index]
     data = []
+    float_epsilon = np.finfo(float).eps
     for i in range(nb_nodes):
         for j in range(nb_nodes):
             if i == j and not self_link:
                 continue
-            data.append([i, j, 1/((1+dist_func(coords[i], coords[j])**4))])
+            data.append([i, j, 1 / (float_epsilon + dist_func(coords[i], coords[j]))])
 
     df = pd.DataFrame(data, columns="src tar weight".split()).astype({"src": int, "tar": int})
     df["hash"] = df.apply(lambda x: "_".join(sorted([str(int(x.src)), str(int(x.tar))])), axis=1)
     df = df.drop_duplicates(subset="hash")
-    df["weight"] = df.weight/df.weight.sum()
-
-    register = set([])
-
-    def add_register(hashes):
-        for hash_ in hashes:
-            register.add(hash_)
-
-    def in_register(hashes):
-        return np.array([True if hash_ in register else False for hash_ in hashes])
-
-    nodes = np.arange(nb_nodes).astype(int)
-    sizes = [len(x) for x in np.array_split(np.arange(nb_edges), nb_nodes)]
-
-    new_df = df[(df.src == nodes[0]) | (df.tar == nodes[0])].sample(n=sizes[0], weights="weight").copy()
-    add_register(new_df.hash.values)
-    df = df[~in_register(df.hash.values)]
-
-    for ix, node in enumerate(nodes[1:]):
-        sample = df[(df.src == node) | (df.tar == node)].sample(n=sizes[ix + 1], weights="weight").copy()
-        new_df = pd.concat((new_df, sample))
-        add_register(new_df.hash.values)
-        df = df[~in_register(df.hash.values)]
-
-    if weighted:
-        G = nx.from_pandas_edgelist(new_df, source="src", target="tar", edge_attr="weight")
-    else:
-        G = nx.from_pandas_edgelist(new_df, source="src", target="tar")
-    for n in list(G.nodes()): G.nodes[n]["pos"] = coords[n]
+    df["weight"] = df.weight / df.weight.sum()
+
+    iter_ = 0  # retry the weighted edge sampling up to 50 times
+    f = False
+    best_G = None
+    new_df = None
+    while iter_ < 50 and not f:
+        new_df = df.sample(n=nb_edges, weights="weight")
+        if weighted:
+            G = nx.from_pandas_edgelist(new_df, source="src", target="tar", edge_attr="weight")
+        else:
+            G = nx.from_pandas_edgelist(new_df, source="src", target="tar")
+        f = (len(G) == nb_nodes and G.size() == nb_edges)
+        if best_G is not None:  # keep the sample closest to the target node count
+            if abs(len(best_G) - nb_nodes) > abs(len(G) - nb_nodes):
+                best_G = G.copy()
+        else:
+            best_G = G.copy()
+        iter_ += 1
+    G = best_G.copy()
+
+    if len(G) != nb_nodes:  # prune surplus edges from high-degree nodes, then attach missing nodes
+        diff = abs(len(G) - nb_nodes)
+        df_deg = pd.DataFrame(nx.degree(G), columns="node degree".split())
+        df_deg = df_deg[df_deg.degree > 2].sort_values(by="degree", ascending=False)
+        i = 1
+        while True:
+            new_df = df_deg.head(i)
+            if (new_df.degree - 1).sum() > diff:
+                break
+            i += 1
+        df_deg = df_deg.head(i)
+        edges_ = []
+        for node in df_deg.node.values:
+            node_edges = list(G.edges(node))
+            random.shuffle(node_edges)
+            edges_.extend([[ed[0], ed[1]] for ed in node_edges[1:]])  # keep one edge per node
+        idx = np.random.choice(np.arange(len(edges_)), size=diff, replace=False)
+        edges_ = np.array(edges_)[idx]
+        G.remove_edges_from(edges_)
+
+        missing_nodes = list(set(range(nb_nodes)) - set(list(G.nodes())))
+        for node in missing_nodes:
+            new_df = df[(df.src == node) | (df.tar == node)].sample(n=1, weights="weight")
+            edges = new_df["src tar".split()].values
+            G.add_edges_from(edges)
+
+    for n in list(G.nodes()):
+        G.nodes[n]["pos"] = coords[n]
     return G
 
 
@@ -271,44 +298,31 @@ def stochastic_block_model_graph(nb_nodes, nb_edges, nb_com, percentage_edge_bet
     if verbose:
         print(G.size())
 
+    u_in = sum([nb_of_pair((b_assign_array==b).sum()) for b in range(nb_com)])
     u_out = nb_of_pair(len(G)) - u_in
 
-    l_out = u_out*percentage_edge_betw
+    l_out = nb_edges*percentage_edge_betw
+    p_out = l_out/u_out
 
     l_in = nb_edges - l_out
-    percentage_edge_within = l_in / u_in
+    p_in = l_in / u_in
 
     if verbose:
-        print("u_out",u_out)
         print("u_in",u_in)
+        print("u_out", u_out)
print("l_out",l_out) print("l_in", l_in) - print("p_in",percentage_edge_within) - # percentage_edge_within = 1 - percentage_edge_betw + print("p_in",p_in) + print("p_out", p_out) - inter_edges, intra_edges = [], [] - register = set([]) - for n1 in list(G.nodes()): - for n2 in list(G.nodes()): - hash_ = "_".join(sorted([str(n1), str(n2)])) - if (n1 == n2) or (hash_ in register): - continue - b1, b2 = block_assign[n1], block_assign[n2] - if b1 != b2: - inter_edges.append([n1, n2]) - else: - intra_edges.append([n1, n2]) - register.add(hash_) - + inter_edges, intra_edges = get_inter_intra_edges(G,G.is_directed()) inter_edges = np.asarray(inter_edges) intra_edges = np.asarray(intra_edges) inter_N, intra_N = len(inter_edges), len(intra_edges) - probs_inter = np.ones(inter_N) * percentage_edge_betw - probs_intra = np.ones(intra_N) * percentage_edge_within + probs_inter = np.ones(inter_N) * p_out + probs_intra = np.ones(intra_N) * p_in all_edges = np.concatenate((inter_edges, intra_edges)) - del inter_edges - del intra_edges all_probs = np.concatenate((probs_inter, probs_intra)) del probs_inter del probs_intra @@ -316,7 +330,6 @@ def stochastic_block_model_graph(nb_nodes, nb_edges, nb_com, percentage_edge_bet if verbose: print(inter_N, intra_N) - print(int(np.ceil(nb_edges * percentage_edge_betw)), int(np.ceil(nb_edges * percentage_edge_within))) final_edges = [] index_selected_pairs = np.random.choice(np.arange(len(all_edges)), nb_edges, p=all_probs, replace=False) @@ -327,7 +340,7 @@ def stochastic_block_model_graph(nb_nodes, nb_edges, nb_com, percentage_edge_bet G2 = nx.from_edgelist(final_edges) if len(G2) != nb_nodes: - equilibrate(G2, nb_nodes, percentage_edge_betw, percentage_edge_within, inter_edges, intra_edges, block_assign) + equilibrate(G2, nb_nodes, percentage_edge_betw, 1-percentage_edge_betw, inter_edges, intra_edges, block_assign) for n in list(G2.nodes()): G2.nodes[n]["block"] = block_assign[n] @@ -407,3 +420,82 @@ def equilibrate(G, nb_nodes, percentage_edge_betw, percentage_edge_within, inter for ed in new_edges: G.add_edge(*ed) return G + + +def add_partitions_G(G, nb_com=5): + blocks = np.random.choice(np.arange(nb_com), len(G), p=[1 / nb_com] * nb_com) + for ix, node in enumerate(list(G.nodes())): + try: + G.nodes[node]["block"] = blocks[ix] + except KeyError: + G.nodes[node]["block"] = 0 + return G + + +def get_inter_intra_edges(G, directed=False): + block_assign = nx.get_node_attributes(G, "block") + assert (len(block_assign) ==len(G)) + + inter_edges, intra_edges = [], [] + register = set([]) + for n1 in list(G.nodes()): + for n2 in list(G.nodes()): + if directed: + "_".join([str(n1), str(n2)]) + else: + hash_ = "_".join(sorted([str(n1), str(n2)])) + if (n1 == n2) or (hash_ in register): + continue + b1, b2 = block_assign[n1], block_assign[n2] + if b1 != b2: + inter_edges.append([n1, n2]) + else: + intra_edges.append([n1, n2]) + register.add(hash_) + return inter_edges,intra_edges + +def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw=0.01, + dist_func=lambda p1, p2: np.linalg.norm(p1 - p2) ** 2): + G = spatial_graph(nb_nodes, nb_edges, coords="random", dist_func=dist_func) + G = add_partitions_G(G, nb_com) + nb_edges = G.size() + + def nb_of_pair(N): + return (N * (N - 1)) / 2 + + block_assign = nx.get_node_attributes(G, "block") + assert (len(block_assign) == len(G)) + b_assign_array = np.asarray(list(block_assign.values())) + + u_in = sum([nb_of_pair((b_assign_array == b).sum()) for b in range(np.max(b_assign_array))]) + u_out = 
+    u_out = nb_of_pair(len(G)) - u_in
+    l_out = nb_edges * percentage_edge_betw
+    p_out = l_out / u_out
+    l_in = nb_edges - l_out
+    p_in = l_in / u_in
+
+
+    inter_edges, intra_edges = get_inter_intra_edges(G, G.is_directed())
+    inter_N, intra_N = len(inter_edges), len(intra_edges)
+    probs_sbm_inter = np.ones(inter_N) * p_out
+    probs_sbm_intra = np.ones(intra_N) * p_in
+
+    all_edges = np.concatenate((inter_edges, intra_edges))
+    all_probs_sbm = np.concatenate((probs_sbm_inter, probs_sbm_intra))
+    all_probs_sbm /= all_probs_sbm.sum()
+
+    all_probs_spa = np.asarray([1 / (np.finfo(float).eps + dist_func(G.nodes[e[0]]["pos"], G.nodes[e[1]]["pos"])) for e in all_edges])  # distance between node positions, not node ids
+    all_probs_spa /= all_probs_spa.sum()
+
+    all_probs = alpha * all_probs_sbm + (1 - alpha) * all_probs_spa
+
+    final_edges = []
+    index_selected_pairs = np.random.choice(np.arange(len(all_edges)), nb_edges, p=all_probs, replace=False)
+    final_edges.extend(all_edges[index_selected_pairs])
+    G2 = nx.from_edgelist(final_edges)
+
+    for n in list(G2.nodes()):
+        G2.nodes[n]["block"] = block_assign[n]
+        G2.nodes[n]["pos"] = G.nodes[n]["pos"]
+
+    return G2
\ No newline at end of file
diff --git a/lib/visualisation.py b/lib/visualisation.py
index aad6b85..032a64d 100644
--- a/lib/visualisation.py
+++ b/lib/visualisation.py
@@ -92,7 +92,7 @@ class DrawingResults():
             g.fig.set_size_inches(*kwargs["figsize"])
 
         [plt.setp(ax.get_xticklabels(), rotation=kwargs.get("rotation", 90)) for ax in g.axes.flat]
-        g.fig.subplots_adjust(wspace=.09, hspace=.02)
+        g.fig.subplots_adjust(wspace=.09)  # hspace=.02 disabled
 
         if kwargs.get("output_filename",None):
             save_params = {}
@@ -114,7 +114,7 @@ class DrawingResults():
         if type_graph and type_graph in new_df.type_graph.unique():
             new_df = new_df[new_df.type_graph == type_graph].copy()
 
-        g = sns.FacetGrid(new_df, row="size", col="nb_edge", margin_titles=True, height=2.5)
+        g = sns.FacetGrid(new_df, row="size", col="nb_edge", margin_titles=True)
 
         plot_func = draw_args.get('plot_func', sns.barplot)
         g.map(plot_func, "name", metric)
@@ -134,7 +134,8 @@ class DrawingResults():
         else:
             raise ValueError("Method {0} does not exists in pandas.core.groupby.generic.DataFrameGroupBy".format(agg_func))
 
-        g = sns.FacetGrid(new_df, col="type_graph", margin_titles=True, height=2.5)
+
+        g = sns.FacetGrid(new_df, col="type_graph", col_wrap=2, margin_titles=True)
 
         plot_func = draw_args.get('plot_func', sns.barplot)
         g.map(plot_func, "name", metric, palette="tab20")
@@ -153,7 +154,7 @@ class DrawingResults():
         g = sns.FacetGrid(_df, row=second_parameter, col=parameter, margin_titles=True, height=2.5)
 
         plot_func = draw_args.get('plot_func', sns.barplot)
-        g.map(plot_func, "name", metric, palette="tab20")
+        g.map(plot_func, "name", metric, palette=draw_args.get("cmap", "tab20"))
 
         return self.__draw(g,**draw_args)
-- 
GitLab
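
Usage note (kept below the signature, so it stays out of the applied diff): a
minimal sketch of how the new mixed spatial/SBM generator might be called. It
assumes lib/ is on the import path; the parameter values are illustrative,
with alpha picked from the grid in generate_theoric_random_graph.py and
nb_edges chosen to satisfy the new nb_edges >= nb_nodes check in spatial_graph.

    # Illustrative only -- not part of the patch.
    import networkx as nx
    from lib.random import mixed_model_spat_sbm

    # alpha blends the two edge-probability models:
    # alpha=0 -> purely spatial, alpha=1 -> purely SBM.
    G = mixed_model_spat_sbm(nb_nodes=80, nb_edges=400, nb_com=4, alpha=0.5)

    print(len(G), G.size())                      # target: 80 nodes, ~400 edges
    blocks = nx.get_node_attributes(G, "block")  # community label per node
    pos = nx.get_node_attributes(G, "pos")       # coordinates per node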