diff --git a/eval_mixed_model.py b/eval_mixed_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ba303bbb23e7c87f6a3c15cc7b52d2550d6b10cc --- /dev/null +++ b/eval_mixed_model.py @@ -0,0 +1,184 @@ +# coding = utf-8 +import argparse + +from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs + +import networkx as nx +import numpy as np +import pandas as pd + +from tqdm import tqdm + +from evalne.evaluation.evaluator import LPEvaluator +from evalne.evaluation.split import EvalSplit as LPEvalSplit + +from evalne.utils import preprocess as pp + + +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.svm import SVC,LinearSVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neural_network import MLPClassifier +from sklearn.linear_model import SGDClassifier + +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score +from sklearn.metrics import make_scorer + +roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, + needs_threshold=True) + +parser = argparse.ArgumentParser() +parser.add_argument("nb_nodes",type=int) +parser.add_argument("nb_edges",type=int) +parser.add_argument("nb_com",type=int) +parser.add_argument("alpha",type=float) +parser.add_argument("-v","--verbose",action="store_true") + +args= parser.parse_args() + +GRAPH_NODE_NB = args.nb_nodes +GRAPH_EDGE_NB = args.nb_edges +ALPHA = args.alpha +NB_COM = args.nb_com +NB_ITERATION = 3 +VERBOSE = args.verbose + +dist = lambda a,b : np.linalg.norm(a-b)**2 +hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])])) + +def get_aucs(G): + H, _ = pp.prep_graph(G.copy(),maincc=True) + traintest_split = LPEvalSplit() + traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1) + nee = LPEvaluator(traintest_split) + + auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction").test_scores.auroc() + auc_sbm = nee.evaluate_baseline(method="stochastic_block_model").test_scores.auroc() + return auc_sbm,auc_spatial + +dist = lambda a,b : np.linalg.norm(a-b) +G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA) + +register = set([]) +data = [] +for n1 in list(G.nodes()): + for n2 in list(G.nodes()): + if n1 != n2 and hash_func((n1,n2)) not in register: + data.append([n1,n2]) + register.add(hash_func((n1,n2))) +df_data = pd.DataFrame(data,columns="u v".split()) + + +pos = nx.get_node_attributes(G,"pos") +block_assign = nx.get_node_attributes(G,"block") +H = G.copy() +float_epsilon = np.finfo(float).eps +for i in range(NB_ITERATION): + auc_sbm,auc_spatial = get_aucs(H) + if VERBOSE : print(auc_sbm,auc_spatial) + if auc_sbm> auc_spatial: + edges,probs = get_sbm_probs(H,ALPHA) + else: + edges,probs = get_spat_probs(H) + probs = np.asarray(probs) + edges = np.asarray(edges) + edge_prob = dict(zip([hash_func(ed) for ed in edges],probs)) + probs = np.asarray([(1 if H.has_edge(*ed) else 0)-probs[ix] for ix,ed in enumerate(edges)]) + probs = np.asarray([ float_epsilon if p<=0 else p for p in probs]) + probs /= probs.sum() + df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1) + final_edges = [] + index_selected_pairs = np.random.choice(np.arange(len(edges)), round((H.size()*0.7)), p=probs, replace=False) + final_edges.extend(edges[index_selected_pairs]) + G2 = nx.from_edgelist(final_edges) + for n in list(G2.nodes()): + G2.nodes[n]["block"] = block_assign[n] + G2.nodes[n]["pos"] = pos[n] + H=G2.copy() + + +edge_feature= {hash_func([int(row.u),int(row.v)]):[row.p_0,row.p_1] for ix,row in df_data.iterrows()} + +G, _ = pp.prep_graph(G,maincc=True) +traintest_split = LPEvalSplit() +traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1) +nee = LPEvaluator(traintest_split) + +X_train = traintest_split.train_edges +y_train = traintest_split.train_labels +X_test = traintest_split.test_edges +y_test = traintest_split.test_labels + + +pos = nx.get_node_attributes(G,"pos") +dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train]).reshape(-1,1) +dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test]).reshape(-1,1) + + +centrality = nx.degree_centrality(G) +centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train]) +centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test]) + +if_not =[0 for i in range(NB_ITERATION-1)] +feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train]) +feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test]) + +##ADD centrality and distance to X train +X_train = np.concatenate((X_train,dist_X_train,centrality_X_train),axis=1) +X_test = np.concatenate((X_test,dist_X_test,centrality_X_test),axis=1) + + +classifier_dict = { + "naive-bayes":GaussianNB(), + "svm":SVC(), + "sgd":SGDClassifier(), + "knn":KNeighborsClassifier(), + "decision-tree": DecisionTreeClassifier(), + "random-forest":RandomForestClassifier(), + "mlp":MLPClassifier(), + "logistic_reg":LogisticRegression(), + "linear_svm":LinearSVC() +} + +parameters = { + "naive-bayes":[], + #"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}], + "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}], + "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}], + "decision-tree": [{"criterion":["gini","entropy"]}], + "random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}], + "mlp":[], + "logistic_reg":[], + "linear_svm":[] +} +auc_sbm, auc_spa = get_aucs(G) +if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa) +data = [] +pbar = tqdm(parameters) +for classi_ in parameters: + pbar.set_description(classi_) + if len(parameters[classi_])>0: + clf = GridSearchCV( + classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1 + ) + clf.fit(X_train,y_train) + y_pred = clf.best_estimator_.predict(X_test) + else: + classifier_dict[classi_].fit(X_train,y_train) + y_pred = classifier_dict[classi_].predict(X_test) + data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)]) + + +df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split()) +df["auc_sbm"] = auc_sbm +df["auc_spatial"] = auc_spa +df["alpha"] = ALPHA +df["nb_nodes"] = GRAPH_NODE_NB +df["nb_edges"] = GRAPH_EDGE_NB +df["nb_com"] = NB_COM +if VERBOSE : print(df) +df.to_csv("{0}_{1}_{2}_{3}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA),sep="\t",index=None) \ No newline at end of file diff --git a/lib/random.py b/lib/random.py index c6f8898887b622f38ee9f0d01219b6e4dd3714df..7980f43d2c9cc0184326cb362af196f5b7c08672 100644 --- a/lib/random.py +++ b/lib/random.py @@ -8,6 +8,9 @@ import pandas as pd from networkx.generators.degree_seq import _to_stublist from cdlib import algorithms import random +float_epsilon = np.finfo(float).eps + + def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1): @@ -441,7 +444,7 @@ def get_inter_intra_edges(G, directed=False): for n1 in list(G.nodes()): for n2 in list(G.nodes()): if directed: - "_".join([str(n1), str(n2)]) + hash_ = "_".join([str(n1), str(n2)]) else: hash_ = "_".join(sorted([str(n1), str(n2)])) if (n1 == n2) or (hash_ in register): @@ -487,6 +490,8 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw pos = nx.get_node_attributes(G,"pos") all_probs_spa = np.asarray([1 / (float_epsilon +dist_func(pos[edge[0]], pos[edge[1]])) for edge in all_edges]) all_probs_spa /= all_probs_spa.sum() + + all_probs = alpha * (all_probs_sbm) + (1 - alpha) * all_probs_spa final_edges = [] @@ -498,4 +503,57 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw G2.nodes[n]["block"] = block_assign[n] G2.nodes[n]["pos"] = G.nodes[n]["pos"] - return G2 \ No newline at end of file + return G2,all_probs_sbm,all_probs_spa + + + +def get_sbm_probs(G, percentage_edge_betw, verbose=False): + hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])])) + def nb_of_pair(N): + return (N*(N-1))/2 + + block_assign = nx.get_node_attributes(G, "block") + nb_com = len(set(block_assign.values())) + nb_nodes=len(G) + nb_edges = G.size() + b_assign_array = np.asarray(list(nx.get_node_attributes(G,"block").values())) + + + + u_in = sum([nb_of_pair((b_assign_array==b).sum()) for b in range(nb_com)]) + u_out = nb_of_pair(len(G)) - u_in + l_out = nb_edges*percentage_edge_betw + p_out = l_out/u_out + l_in = nb_edges - l_out + + p_in = l_in / u_in + + inter_edges, intra_edges = get_inter_intra_edges(G,G.is_directed()) + inter_edges = np.asarray(inter_edges) + intra_edges = np.asarray(intra_edges) + inter_N, intra_N = len(inter_edges), len(intra_edges) + probs_inter = np.ones(inter_N) * p_out + probs_intra = np.ones(intra_N) * p_in + + all_edges = np.concatenate((inter_edges, intra_edges)) + all_probs = np.concatenate((probs_inter, probs_intra)) + del probs_inter + del probs_intra + all_probs /= all_probs.sum() + return all_edges,all_probs + + +def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2): + hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])])) + pos = nx.get_node_attributes(G, "pos") + spat_model = lambda u, v: 1 / (float_epsilon + dist(pos[u], pos[v])) + register = set([]) + edges, probs = [], [] + for n1 in list(G.nodes()): + for n2 in list(G.nodes()): + if n1 != n2 and hash_func((n1, n2)) not in register: + edges.append([n1, n2]) + probs.append(spat_model(n1, n2)) + register.add(hash_func((n1, n2))) + + return edges, probs \ No newline at end of file diff --git a/run_eval_mixed_model.sh b/run_eval_mixed_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..fed67e7e3c480265ac30d3b54db9ce6a967a4fbc --- /dev/null +++ b/run_eval_mixed_model.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +for alpha in 0 0.2 0.5 0.7 1 +do + for nbcom in 2 3 4 5 + do + echo "alpha= "$alpha", nb_com= "$nbcom + python eval_mixed_model.py 100 200 $nbcom $alpha + python eval_mixed_model.py 300 600 $nbcom $alpha + done +done \ No newline at end of file