diff --git a/eval_mixed_model.py b/eval_mixed_model.py index bfd0f758c58060cee6094c51ca014544a8662cc4..0e170e5463ed71f6da725eb9c6fc9452cdac7026 100644 --- a/eval_mixed_model.py +++ b/eval_mixed_model.py @@ -1,35 +1,8 @@ # coding = utf-8 import argparse - -from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs - -import networkx as nx -import numpy as np -import pandas as pd - -from tqdm import tqdm - -from evalne.evaluation.evaluator import LPEvaluator -from evalne.evaluation.split import EvalSplit as LPEvalSplit - -from evalne.utils import preprocess as pp - - -from sklearn.linear_model import LogisticRegression -from sklearn.tree import DecisionTreeClassifier -from sklearn.svm import SVC,LinearSVC -from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier -from sklearn.naive_bayes import GaussianNB -from sklearn.neural_network import MLPClassifier -from sklearn.linear_model import SGDClassifier - -from sklearn.model_selection import GridSearchCV -from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score -from sklearn.metrics import make_scorer - -roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, - needs_threshold=True) +from lib.random import mixed_model_spat_sbm +from lib.erosion_model import eval_erosion_model +from joblib import Parallel,delayed parser = argparse.ArgumentParser() parser.add_argument("nb_nodes",type=int) @@ -51,173 +24,12 @@ NB_COM = args.nb_com NB_ITERATION = args.nb_iterations VERBOSE = args.verbose FEATURES = set(args.features.split(",")) -TIMEOUT = args.timeout - -dist = lambda a,b : np.linalg.norm(a-b)**2 -hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])])) - -def get_aucs(G): - H, _ = pp.prep_graph(G.copy(),maincc=True) - traintest_split = LPEvalSplit() - traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1) - nee = LPEvaluator(traintest_split) - auc_spatial, auc_sbm = 0, 0 - try: - auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=TIMEOUT).test_scores.auroc() - auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=TIMEOUT).test_scores.auroc() - except: - print("Could not compute AUC ! ") - return auc_sbm,auc_spatial - -dist = lambda a,b : np.linalg.norm(a-b) -G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA) - -register = set([]) -data = [] -for n1 in list(G.nodes()): - for n2 in list(G.nodes()): - if n1 != n2 and hash_func((n1,n2)) not in register: - data.append([n1,n2]) - register.add(hash_func((n1,n2))) -df_data = pd.DataFrame(data,columns="u v".split()) -df_data["hash_"] = df_data.apply(lambda row:hash_func((int(row.u),int(row.v))), axis=1) -df_data["p_0"] = df_data.apply(lambda x:1 if G.has_edge(x.u,x.v) else 0,axis =1) - - -pos = nx.get_node_attributes(G,"pos") -block_assign = nx.get_node_attributes(G,"block") - -H = G.copy() -float_epsilon = np.finfo(float).eps - - -for i in range(1,NB_ITERATION+1): - if H.size() < 30: - df_data["p_{0}".format(i)] = df_data["p_{0}".format(i-1)] - continue - old_probs = dict(df_data["hash_ p_{0}".format(i-1).split()].values) - auc_sbm,auc_spatial = get_aucs(H) - if VERBOSE : print("SBM: ",auc_sbm,"SPATIAL: ",auc_spatial) - if auc_sbm> auc_spatial: - edges,probs = get_sbm_probs(H,0.01) - else: - edges,probs = get_spat_probs(H) - - edges = np.asarray(edges) - - probs_dom = np.asarray(probs) - probs_dom /= probs_dom.sum() - - edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom)) - df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1) - - h_probs = np.asarray([(1 / H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if H.has_edge(*ed)]) - new_nb_edges = h_probs.sum() * H.size() - if VERBOSE:print("new NB of Edges",new_nb_edges) - - - probs_erosion = np.asarray([old_probs[hash_func(ed)]-probs_dom[ix] for ix,ed in enumerate(edges)]) - probs_erosion[probs_erosion <0] = float_epsilon - probs_erosion /= probs_erosion.sum() - - final_edges = [] - index_selected_pairs = np.random.choice(np.arange(len(edges)),round(new_nb_edges) , p=probs_erosion, replace=False)#round(0.7*H.size()) - final_edges.extend(edges[index_selected_pairs]) - G2 = nx.from_edgelist(final_edges) - for n in list(G2.nodes()): - G2.nodes[n]["block"] = block_assign[n] - G2.nodes[n]["pos"] = pos[n] - H=G2.copy() - -if VERBOSE:print(df_data) - -edge_feature= {hash_func([int(row.u),int(row.v)]):[row["p_{0}".format(i)] for i in range(1,NB_ITERATION+1)] for ix,row in df_data.iterrows()} - -G, _ = pp.prep_graph(G,maincc=True,relabel=False) -traintest_split = LPEvalSplit() -traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1) - -X_train = traintest_split.train_edges -y_train = traintest_split.train_labels -X_test = traintest_split.test_edges -y_test = traintest_split.test_labels - -if "pos" in FEATURES: - pos = nx.get_node_attributes(G,"pos") - dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train[:,:2]]).reshape(-1,1) - dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test[:,:2]]).reshape(-1,1) - - X_train = np.concatenate((X_train, dist_X_train), axis=1) - X_test = np.concatenate((X_test, dist_X_test), axis=1) - -if "centrality" in FEATURES: - centrality = nx.degree_centrality(G) - centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train[:,:2]]) - centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test[:,:2]]) - - X_train = np.concatenate((X_train, centrality_X_train), axis=1) - X_test = np.concatenate((X_test, centrality_X_test), axis=1) - +TIMEOUT = 10#args.timeout -if "it_probs": - if_not =[0 for i in range(NB_ITERATION)] - feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:,:2]]) - feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:,:2]]) - X_train = np.concatenate((X_train, feature_X_train), axis=1) - X_test = np.concatenate((X_test, feature_X_test ), axis=1) -X_train = X_train[:,2:] -X_test = X_test[:,2:] -classifier_dict = { - "naive-bayes":GaussianNB(), - #"svm":SVC(), - "sgd":SGDClassifier(), - "knn":KNeighborsClassifier(), - "decision-tree": DecisionTreeClassifier(), - "random-forest":RandomForestClassifier(), - "mlp":MLPClassifier(), - "logistic_reg":LogisticRegression(), - "linear_svm":LinearSVC() -} +G = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA) -parameters = { - "naive-bayes":[], - #"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}], - "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}], - "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}], - "decision-tree": [{"criterion":["gini","entropy"]}], - "random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}], - "mlp":[], - "logistic_reg":[], - "linear_svm":[] -} -auc_sbm, auc_spa = get_aucs(G) -if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa) -data = [] -pbar = tqdm(parameters) -for classi_ in classifier_dict: - pbar.set_description(classi_) - if len(parameters[classi_])>0: - clf = GridSearchCV( - classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1 - ) - clf.fit(X_train,y_train) - y_pred = clf.best_estimator_.predict(X_test) - else: - classifier_dict[classi_].fit(X_train,y_train) - y_pred = classifier_dict[classi_].predict(X_test) - data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)]) +print(eval_erosion_model(G,NB_ITERATION)) -df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split()) -df["auc_sbm"] = auc_sbm -df["auc_spatial"] = auc_spa -df["alpha"] = ALPHA -df["nb_nodes"] = GRAPH_NODE_NB -df["nb_edges"] = GRAPH_EDGE_NB -df["nb_com"] = NB_COM -df["nb_iterations"] = NB_ITERATION -df["features"] = "_".join(FEATURES) -if VERBOSE : print(df) -df.to_csv("{0}_{1}_{2}_{3}_{4}_{5}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA,NB_ITERATION,"_".join(FEATURES)),sep="\t",index=None) \ No newline at end of file diff --git a/generate_mixed_model_graph.py b/generate_mixed_model_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7646a4749979c5c35b39e8458ef330c0196f8a --- /dev/null +++ b/generate_mixed_model_graph.py @@ -0,0 +1,58 @@ +# coding = utf-8 +import itertools +import os + +import networkx as nx +import argparse +import numpy as np +import pandas as pd +import random +import copy +from tqdm import tqdm +import lib.random as ra + +# COMMAND PARSING +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +args = parser.parse_args() + + +GRAPH_SIZE = [300,1000] +EDGE_SIZE = [2] +sample_per_params = 10 + +OUTPUT_DIR = args.output_dir +if not os.path.exists(OUTPUT_DIR): + os.makedirs(args.output_dir) + +parameters = { + "mixed_model_spat_sbm":{ + "nb_nodes":GRAPH_SIZE, + "nb_edges":EDGE_SIZE, + "nb_com":[2,3,4,5], + "alpha":[0,0.2,0.5,0.7,1], + + } +} + +def get_params(inp): + return (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values())) + +pbar = tqdm(parameters.items(),total=len(parameters)) +for method,args in pbar: + pbar.set_description("Generating graphs using : " + method) + list_of_params = get_params(parameters[method]) + func = getattr(ra,method) + for ix,params in enumerate(list_of_params): + params["nb_edges"] = params["nb_edges"] * params["nb_nodes"] + print("Gen graph using the following parameters : ",params) + for sp_id in range(sample_per_params): + try: + G = func(**params) + G.graph.update(params) + nx.write_gml(G, OUTPUT_DIR+"/graph_{method}_{ix}_{sp_id}.gml".format(method=method,ix=ix,sp_id=sp_id),stringizer=str) + except Exception as e: + print(e) + print("Can't generate graphs using these parameters") + + diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py index 53d6b9b76cf25d725949534342046b862f6d5ea6..a413ef69e15fe90217815a8db7abd0958b497b0f 100644 --- a/generate_theoric_random_graph.py +++ b/generate_theoric_random_graph.py @@ -27,27 +27,27 @@ if not os.path.exists(OUTPUT_DIR): parameters = { - # "stochastic_block_model_graph": { - # "nb_nodes":GRAPH_SIZE, - # "nb_edges":EDGE_SIZE, - # "nb_com" :[2,5,8,16,10,25], - # "percentage_edge_betw":[0.1,0.01] - # }, - # "ER_graph": { - # "nb_nodes":GRAPH_SIZE, - # "nb_edges":EDGE_SIZE - # }, - # "powerlaw_graph": { # configuration_model - # "nb_nodes":GRAPH_SIZE, - # "nb_edges":EDGE_SIZE, - # "exponent":[2,3], - # "tries":[100] - # }, - # "spatial_graph":{ - # "nb_nodes":GRAPH_SIZE, - # "nb_edges":EDGE_SIZE, - # "coords":["random","country"], - # }, + "stochastic_block_model_graph": { + "nb_nodes":GRAPH_SIZE, + "nb_edges":EDGE_SIZE, + "nb_com" :[2,5,8,16,10,25], + "percentage_edge_betw":[0.1,0.01] + }, + "ER_graph": { + "nb_nodes":GRAPH_SIZE, + "nb_edges":EDGE_SIZE + }, + "powerlaw_graph": { # configuration_model + "nb_nodes":GRAPH_SIZE, + "nb_edges":EDGE_SIZE, + "exponent":[2,3], + "tries":[100] + }, + "spatial_graph":{ + "nb_nodes":GRAPH_SIZE, + "nb_edges":EDGE_SIZE, + "coords":["random","country"], + }, "mixed_model_spat_sbm":{ "nb_nodes":GRAPH_SIZE, "nb_edges":EDGE_SIZE, diff --git a/lib/erosion_model.py b/lib/erosion_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5f503b1ad33d0c48fb066e2d94d9bd3c528c1d7c --- /dev/null +++ b/lib/erosion_model.py @@ -0,0 +1,140 @@ +# coding = utf-8 +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score + +from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges +from .random import get_spat_probs, get_sbm_probs +from .lambda_func import euclid_dist as dist +from .lambda_func import hash_func + +from evalne.methods.similarity import stochastic_block_model,spatial_link_prediction + +import pandas as pd +import networkx as nx +import numpy as np +float_epsilon = np.finfo(float).eps + + +class ErosionModel(): + def __init__(self, G): + self.G = G + self.coordinates = nx.get_node_attributes(G, "pos") + self.block_assign = nx.get_node_attributes(G, "block") + self.probs_df = pd.DataFrame(get_all_possible_edges(G), columns="u v".split()) + self.initialize() + self.H = G.copy() + + self.nb_of_erosion = 0 + + def erode(self): + self.nb_of_erosion += 1 + + if self.H.size() < 30: + self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df["p_{0}".format(self.nb_of_erosion - 1)] + return + old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values) + + auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60) + edges = get_all_possible_edges(self.H) + if auc_sbm > auc_spatial: + probs = stochastic_block_model(self.H, edges) + else: + probs = spatial_link_prediction(self.H, edges) + edges = np.asarray(edges) + probs_dom = np.asarray(probs) + probs_dom /= probs_dom.sum() + + edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom)) + self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df.apply( + lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0, + axis=1) + + new_nb_edges = (np.asarray( + [(1 / self.H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])).sum() * self.H.size() + + probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)]) + probs_erosion[probs_erosion < 0] = float_epsilon + probs_erosion /= probs_erosion.sum() + + final_edges = [] + index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion, + replace=False) # round(0.7*H.size()) + final_edges.extend(edges[index_selected_pairs]) + + G2 = nx.from_edgelist(final_edges) + for n in list(G2.nodes()): + G2.nodes[n]["block"] = self.block_assign[n] + G2.nodes[n]["pos"] = self.coordinates[n] + self.H = G2.copy() + + def erode_n_times(self,n): + if self.nb_of_erosion >0: + for i in range(1,self.nb_of_erosion+1): + if "p_{0}".format(i) in self.probs_df: + del self.probs_df["p_{0}".format(i)] + self.nb_of_erosion = 0 + self.H = self.G.copy() + for i in range(n): + self.erode() + + + def initialize(self): + self.probs_df["hash_"] = self.probs_df.apply(lambda row: hash_func((int(row.u), int(row.v))), axis=1) + self.probs_df["p_0"] = self.probs_df.apply(lambda x: 1 if self.G.has_edge(x.u, x.v) else 0, axis=1) + + + def get_features(self,probs_erosion =True,centrality=False,distance=False): + if self.nb_of_erosion <1: + raise ValueError("You must erode your graph before access to computed features") + + if not probs_erosion and not centrality and not distance: + raise ValueError("One feature (probs_erosion, centrality, distance) must be selected !") + + edge_feature = { + hash_func([int(row.u), int(row.v)]): [row["p_{0}".format(i)] for i in range(1, self.nb_of_erosion + 1)] for + ix, row in self.probs_df.iterrows()} + + X_train, X_test, y_train, y_test = split_train_test(self.G) + if distance: + pos = nx.get_node_attributes(self.G, "pos") + dist_X_train = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_train[:, :2]]).reshape(-1, 1) + dist_X_test = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_test[:, :2]]).reshape(-1, 1) + + X_train = np.concatenate((X_train, dist_X_train), axis=1) + X_test = np.concatenate((X_test, dist_X_test), axis=1) + + if centrality: + centrality = nx.degree_centrality(self.G) + centrality_X_train = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_train[:, :2]]) + centrality_X_test = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_test[:, :2]]) + + X_train = np.concatenate((X_train, centrality_X_train), axis=1) + X_test = np.concatenate((X_test, centrality_X_test), axis=1) + + if probs_erosion: + if_not = [0 for i in range(self.nb_of_erosion)] + feature_X_train = np.asarray( + [(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:, :2]]) + feature_X_test = np.asarray( + [(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:, :2]]) + X_train = np.concatenate((X_train, feature_X_train), axis=1) + X_test = np.concatenate((X_test, feature_X_test), axis=1) + + X_train = X_train[:, 2:] + X_test = X_test[:, 2:] + + return X_train,X_test,y_train,y_test + + +def eval_erosion_model(G,nb_iter=1,verbose=False): + erod_mod = ErosionModel(G) + erod_mod.erode_n_times(nb_iter) + X_train, X_test, y_train, y_test = erod_mod.get_features() + + auc_sbm, auc_spa = get_auc_heuristics(G, 60) + if verbose:print("SBM: ", auc_sbm, "SPATIAL: ", auc_spa) + + clf = LogisticRegression() + clf.fit(X_train, y_train) + y_pred = clf.predict_proba(X_test)[:, 1] + return auc_sbm,auc_spa,roc_auc_score(y_test, y_pred) \ No newline at end of file diff --git a/lib/lambda_func.py b/lib/lambda_func.py new file mode 100644 index 0000000000000000000000000000000000000000..6e8139594c137f77ceb324f40cd8bfce5219e6a0 --- /dev/null +++ b/lib/lambda_func.py @@ -0,0 +1,5 @@ +# coding = utf-8 +import numpy as np + +euclid_dist = lambda a,b : np.linalg.norm(a-b)**2 +hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])])) \ No newline at end of file diff --git a/lib/link_prediction_eval.py b/lib/link_prediction_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..30a8abf1cabfa88df223ddaea7f9675c4ce80ec5 --- /dev/null +++ b/lib/link_prediction_eval.py @@ -0,0 +1,43 @@ +# coding = utf-8 + +from evalne.evaluation.evaluator import LPEvaluator +from evalne.evaluation.split import EvalSplit as LPEvalSplit +from evalne.utils import preprocess as pp + +from .lambda_func import hash_func + +def get_auc_heuristics(G,timeout=60): + H, _ = pp.prep_graph(G.copy(),maincc=True) + traintest_split = LPEvalSplit() + traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1) + nee = LPEvaluator(traintest_split) + auc_spatial, auc_sbm = 0, 0 + try: + auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=timeout).test_scores.auroc() + auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=timeout).test_scores.auroc() + except: + print("Could not compute AUC ! ") + return auc_sbm,auc_spatial + + +def split_train_test(G,train_frac=0.90,fe_ratio=1): + H, _ = pp.prep_graph(G.copy(), maincc=True,relabel=False) + traintest_split = LPEvalSplit() + traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=train_frac, fe_ratio=fe_ratio) + + X_train = traintest_split.train_edges + y_train = traintest_split.train_labels + X_test = traintest_split.test_edges + y_test = traintest_split.test_labels + + return X_train,X_test,y_train,y_test + +def get_all_possible_edges(G): + register = set([]) + data = [] + for n1 in list(G.nodes()): + for n2 in list(G.nodes()): + if n1 != n2 and hash_func((n1, n2)) not in register: + data.append([n1, n2]) + register.add(hash_func((n1, n2))) + return data \ No newline at end of file diff --git a/lib/random.py b/lib/random.py index 3253f2eb63d97db40a442d7f9379ffb6f4c92547..207bcba41e76763b80485567fccdd5bd9b42baef 100644 --- a/lib/random.py +++ b/lib/random.py @@ -503,7 +503,7 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw G2.nodes[n]["block"] = block_assign[n] G2.nodes[n]["pos"] = G.nodes[n]["pos"] - return G2,all_probs_sbm,all_probs_spa + return G2#,all_probs_sbm,all_probs_spa