From 613e6a5ccec50b8739d136add9780ed5988b36af Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Fri, 12 Feb 2021 14:03:33 +0100
Subject: [PATCH] Add Visualisation tools of link prediction results+ Debug

---
Note on this copy: the patch layout was rebuilt from a capture in which
every run of whitespace (newlines included) had been collapsed to a single
space. No token was changed. The following are inferred, not recovered, so
context lines may not match the tree byte for byte:
  * all indentation, and the spacing inside the removed "# ..." lines;
  * the position of blank context lines in the hunks for
    generate_theoric_random_graph.py and lib/random.py;
  * in the lib/random.py hunk at old line 202, the single added line is
    taken to be the blank line before "new_df = ..."; the capture is also
    consistent with "new_df = ..." itself being the added line.
If a hunk is rejected, try "git apply --ignore-whitespace", or regenerate
the exact patch with "git format-patch -1 613e6a5ccec5".

 draw_visu.py                     | 148 +++++++++++++++++++++++++++++++
 generate_theoric_random_graph.py |  38 ++++----
 lib/random.py                    |   8 +-
 3 files changed, 173 insertions(+), 21 deletions(-)
 create mode 100644 draw_visu.py

diff --git a/draw_visu.py b/draw_visu.py
new file mode 100644
index 0000000..04e34c3
--- /dev/null
+++ b/draw_visu.py
@@ -0,0 +1,148 @@
+# coding = utf-8
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import re
+import os
+
+import networkx as nx
+
+
+def get_graph_attr(fn, graph_dir):
+    g_fn = os.path.join(graph_dir, fn)
+    if not os.path.exists(g_fn):
+        raise FileNotFoundError(g_fn)
+    G = nx.read_gml(g_fn).graph
+    return G
+
+
+def get_sample_id_old(ch):
+    id_graph = re.findall("\d+", ch)[0]
+    if len(id_graph) == 3:
+        return id_graph[-2:]
+    else:
+        return id_graph[-1:]
+
+
+def get_sample_id(fn, file_format="gml"):
+    return int(fn.strip(".{0}".format(file_format)).split("_")[-1])
+
+
+def load_data(fn, graph_dir):
+    df = pd.read_csv(fn, sep="\t")
+    df["type_graph"] = df.filename.apply(lambda x: x[6:]).apply(lambda x: re.sub("_[\d]+.gml", "", x).replace("_", " "))
+    df["parameters"] = df.filename.apply(lambda x: get_graph_attr(x, graph_dir))
+    df["sample"] = df.filename.apply(get_sample_id_old)
+    non_ne = {'random_prediction', 'common_neighbours', 'jaccard_coefficient', 'adamic_adar_index',
+              'preferential_attachment', 'resource_allocation_index', 'stochastic_block_model',
+              'stochastic_block_model_degree_corrected', 'spatial_link_prediction'}
+    df["type_method"] = df.name.apply(lambda x: "heuristic" if x in non_ne else "network_embedding_based")
+    return df
+
+
+def set_custom_palette(x, y, max_color='red', close_color='turquoise', other_color='lightgrey'):
+    def get_color(x, max_val, min_diff):
+        if x == max_val:
+            return max_color
+        elif x > max_val - (0.01 + min_diff) and x < max_val + (0.01 + min_diff):
+            return close_color
+        else:
+            return other_color
+
+    pal = []
+    df = pd.concat((x, y), axis=1)
+    mean_df = df.groupby(x.name, as_index=False).mean()
+    mean_per_x = dict(mean_df.values)
+    max_val = mean_df[y.name].max()
+    min_diff = (max_val - mean_df[y.name]).median()
+    col_per_method = {k: get_color(v, max_val, min_diff) for k, v in mean_per_x.items()}
+
+    for i, val in enumerate(x):
+        pal.append(col_per_method[val])
+
+    return pal
+
+def highlight_barplot(x, y, **kwargs):
+    if kwargs.get("palette", None):
+        kwargs["palette"] = set_custom_palette(x, y)
+        sns.barplot(x=x, y=y, **kwargs)
+    else:
+        sns.barplot(x=x, y=y, palette=set_custom_palette(x, y), **kwargs)
+
+class DrawingResults():
+    def __init__(self, df_results):
+        self.df = df_results
+
+    def __draw(self, g, **kwargs):
+
+        if "figsize" in kwargs:
+            g.fig.set_size_inches(*kwargs["figsize"])
+
+        [plt.setp(ax.get_xticklabels(), rotation=kwargs.get("rotation", 90)) for ax in g.axes.flat]
+        g.fig.subplots_adjust(wspace=.09, hspace=.02)
+
+        if kwargs.get("output_filename",None):
+            save_params = {}
+            if "save_param" in kwargs and type(kwargs["save_param"]) == dict:
+                save_params.update(kwargs["save_param"])
+            g.savefig(kwargs["output_filename"], **save_params)
+        else:
+            plt.show()
+
+    def metric_per_nodes_edges(self, type_graph=None, agg_func=None,metric="auroc", **draw_args):
+        new_df = self.df.copy()
+        if agg_func:
+            if agg_func in "mean max min std".split():
+                new_df = new_df.groupby("name nb_edge size type_graph type_method".split(), as_index=False)
+                new_df = getattr(new_df, agg_func)()
+            else:
+                raise ValueError("Method {0} does not exists in pandas.core.groupby.generic.DataFrameGroupBy".format(agg_func))
+
+        if type_graph and type_graph in new_df.type_graph.unique():
+            new_df = new_df[new_df.type_graph == type_graph].copy()
+
+        g = sns.FacetGrid(new_df, row="size", col="nb_edge", margin_titles=True, height=2.5)
+
+        plot_func = draw_args.get('plot_func', sns.barplot)
+        g.map(plot_func, "name", metric)
+
+        return self.__draw(g, **draw_args)
+
+    def metric_global(self, agg_func=None,metric="auroc", **draw_args):
+
+        new_df = self.df.copy()
+        if agg_func:
+            new_df = self.df.groupby("name nb_edge size type_graph type_method".split(), as_index=False)
+            if agg_func in "mean max min std".split():
+                new_df = getattr(new_df,agg_func)()
+                new_df = new_df.groupby("name type_graph type_method".split(), as_index=False)
+                new_df = getattr(new_df, agg_func)()
+
+            else:
+                raise ValueError("Method {0} does not exists in pandas.core.groupby.generic.DataFrameGroupBy".format(agg_func))
+
+        g = sns.FacetGrid(new_df, col="type_graph", margin_titles=True, height=2.5)
+
+        plot_func = draw_args.get('plot_func', sns.barplot)
+        g.map(plot_func, "name", metric, palette="tab20")
+
+        return self.__draw(g, **draw_args)
+
+    def caracteristic_distribution(self, caracteristic, **draw_args):
+        g = sns.FacetGrid(self.df, col="type_graph", col_wrap=4, )
+        g.map(sns.histplot, caracteristic)
+
+        return self.__draw(g, **draw_args)
+
+    def parameter_impact(self, type_graph, parameter, second_parameter="size", metric="auroc", **draw_args):
+        _df = self.df[self.df.type_graph == type_graph].copy()
+        _df[parameter] = _df.parameters.apply(lambda x: x[parameter])
+
+        g = sns.FacetGrid(_df, row=second_parameter, col=parameter, margin_titles=True, height=2.5)
+        plot_func = draw_args.get('plot_func', sns.barplot)
+        g.map(plot_func, "name", metric, palette="tab20")
+
+        return self.__draw(g,**draw_args)
+
diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py
index d7f5796..65c4555 100644
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -19,30 +19,30 @@ args = parser.parse_args()
 
 GRAPH_SIZE = [80,800]
 EDGE_SIZE = [2,4,5]
-sample_per_params = 4
+sample_per_params = 10
 
 OUTPUT_DIR = args.output_dir
 if not os.path.exists(OUTPUT_DIR):
-    raise FileExistsError("Output directory does not exists !")
+    os.makedirs(args.output_dir)
 
 
 parameters = {
-    # "stochastic_block_model_graph": {
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE,
-    #     "nb_com" :[2,5,8,16],
-    #     "percentage_edge_betw":[0.1,0.01]
-    # },
-    # "ER_graph": {
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE
-    # },
-    # "powerlaw_graph": { # configuration_model
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE,
-    #     "exponent":[2,3],
-    #     "tries":[100]
-    # },
+    "stochastic_block_model_graph": {
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "nb_com" :[2,5,8,16],
+        "percentage_edge_betw":[0.1,0.01]
+    },
+    "ER_graph": {
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE
+    },
+    "powerlaw_graph": { # configuration_model
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "exponent":[2,3],
+        "tries":[100]
+    },
     "spatial_graph":{
         "nb_nodes":GRAPH_SIZE,
         "nb_edges":EDGE_SIZE,
@@ -66,7 +66,7 @@ for method,args in pbar:
             try:
                 G = func(**params)
                 G.graph.update(params)
-                nx.write_gml(G, OUTPUT_DIR+"/graph_{method}_{ix}{sp_id}.gml".format(method=method,ix=ix,sp_id=sp_id),stringizer=str)
+                nx.write_gml(G, OUTPUT_DIR+"/graph_{method}_{ix}_{sp_id}.gml".format(method=method,ix=ix,sp_id=sp_id),stringizer=str)
             except Exception as e:
                 print(e)
                 print("Can't generate graphs using these parameters")
diff --git a/lib/random.py b/lib/random.py
index 9b95de8..55867e2 100644
--- a/lib/random.py
+++ b/lib/random.py
@@ -150,7 +150,7 @@ def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=1000, min_deg=0):
     return G
 
 
-def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False):
+def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b), self_link=False,weighted = False):
     """
     Generate a spatial graph with a specific number of vertices and edges
     Parameters
@@ -202,6 +202,7 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
 
     nodes = np.arange(nb_nodes).astype(int)
     sizes = [len(x) for x in np.array_split(np.arange(nb_edges), nb_nodes)]
+
     new_df = df[(df.src == nodes[0]) | (df.tar == nodes[0])].sample(n=sizes[0], weights="weight").copy()
     add_register(new_df.hash.values)
     df = df[~in_register(df.hash.values)]
@@ -212,7 +213,10 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
         add_register(new_df.hash.values)
         df = df[~in_register(df.hash.values)]
 
-    G = nx.from_pandas_edgelist(new_df, source="src", target="tar", edge_attr="weight")
+    if weighted:
+        G = nx.from_pandas_edgelist(new_df, source="src", target="tar", edge_attr="weight")
+    else:
+        G = nx.from_pandas_edgelist(new_df, source="src", target="tar")
     for n in list(G.nodes()):
         G.nodes[n]["pos"] = coords[n]
     return G
-- 
GitLab