From 299c9dcdb73d5888c0799ae76dad4cb1fc4d15ea Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 12 Apr 2021 14:10:12 +0200
Subject: [PATCH] add evaluation script for mixed model
---
eval_mixed_model.py | 184 ++++++++++++++++++++++++++++++++++++++++
lib/random.py | 62 +++++++++++++-
run_eval_mixed_model.sh | 11 +++
3 files changed, 255 insertions(+), 2 deletions(-)
create mode 100644 eval_mixed_model.py
create mode 100644 run_eval_mixed_model.sh
diff --git a/eval_mixed_model.py b/eval_mixed_model.py
new file mode 100644
index 0000000..ba303bb
--- /dev/null
+++ b/eval_mixed_model.py
@@ -0,0 +1,184 @@
+# coding = utf-8
+import argparse
+
+from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+
+from tqdm import tqdm
+
+from evalne.evaluation.evaluator import LPEvaluator
+from evalne.evaluation.split import EvalSplit as LPEvalSplit
+
+from evalne.utils import preprocess as pp
+
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC,LinearSVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.linear_model import SGDClassifier
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
+from sklearn.metrics import make_scorer
+
+roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
+ needs_threshold=True)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("nb_nodes",type=int)
+parser.add_argument("nb_edges",type=int)
+parser.add_argument("nb_com",type=int)
+parser.add_argument("alpha",type=float)
+parser.add_argument("-v","--verbose",action="store_true")
+
+args= parser.parse_args()
+
+GRAPH_NODE_NB = args.nb_nodes
+GRAPH_EDGE_NB = args.nb_edges
+ALPHA = args.alpha
+NB_COM = args.nb_com
+NB_ITERATION = 3
+VERBOSE = args.verbose
+
+dist = lambda a,b : np.linalg.norm(a-b)**2
+hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
+
+def get_aucs(G):
+ H, _ = pp.prep_graph(G.copy(),maincc=True)
+ traintest_split = LPEvalSplit()
+ traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
+ nee = LPEvaluator(traintest_split)
+
+ auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction").test_scores.auroc()
+ auc_sbm = nee.evaluate_baseline(method="stochastic_block_model").test_scores.auroc()
+ return auc_sbm,auc_spatial
+
+dist = lambda a,b : np.linalg.norm(a-b)
+G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
+
+register = set([])
+data = []
+for n1 in list(G.nodes()):
+ for n2 in list(G.nodes()):
+ if n1 != n2 and hash_func((n1,n2)) not in register:
+ data.append([n1,n2])
+ register.add(hash_func((n1,n2)))
+df_data = pd.DataFrame(data,columns="u v".split())
+
+
+pos = nx.get_node_attributes(G,"pos")
+block_assign = nx.get_node_attributes(G,"block")
+H = G.copy()
+float_epsilon = np.finfo(float).eps
+for i in range(NB_ITERATION):
+ auc_sbm,auc_spatial = get_aucs(H)
+ if VERBOSE : print(auc_sbm,auc_spatial)
+ if auc_sbm> auc_spatial:
+ edges,probs = get_sbm_probs(H,ALPHA)
+ else:
+ edges,probs = get_spat_probs(H)
+ probs = np.asarray(probs)
+ edges = np.asarray(edges)
+ edge_prob = dict(zip([hash_func(ed) for ed in edges],probs))
+ probs = np.asarray([(1 if H.has_edge(*ed) else 0)-probs[ix] for ix,ed in enumerate(edges)])
+ probs = np.asarray([ float_epsilon if p<=0 else p for p in probs])
+ probs /= probs.sum()
+ df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1)
+ final_edges = []
+ index_selected_pairs = np.random.choice(np.arange(len(edges)), round((H.size()*0.7)), p=probs, replace=False)
+ final_edges.extend(edges[index_selected_pairs])
+ G2 = nx.from_edgelist(final_edges)
+ for n in list(G2.nodes()):
+ G2.nodes[n]["block"] = block_assign[n]
+ G2.nodes[n]["pos"] = pos[n]
+ H=G2.copy()
+
+
+edge_feature= {hash_func([int(row.u),int(row.v)]):[row.p_0,row.p_1] for ix,row in df_data.iterrows()}
+
+G, _ = pp.prep_graph(G,maincc=True)
+traintest_split = LPEvalSplit()
+traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
+nee = LPEvaluator(traintest_split)
+
+X_train = traintest_split.train_edges
+y_train = traintest_split.train_labels
+X_test = traintest_split.test_edges
+y_test = traintest_split.test_labels
+
+
+pos = nx.get_node_attributes(G,"pos")
+dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train]).reshape(-1,1)
+dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test]).reshape(-1,1)
+
+
+centrality = nx.degree_centrality(G)
+centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train])
+centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test])
+
+if_not =[0 for i in range(NB_ITERATION-1)]
+feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train])
+feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test])
+
+##ADD centrality and distance to X train
+X_train = np.concatenate((X_train,dist_X_train,centrality_X_train),axis=1)
+X_test = np.concatenate((X_test,dist_X_test,centrality_X_test),axis=1)
+
+
+classifier_dict = {
+ "naive-bayes":GaussianNB(),
+ "svm":SVC(),
+ "sgd":SGDClassifier(),
+ "knn":KNeighborsClassifier(),
+ "decision-tree": DecisionTreeClassifier(),
+ "random-forest":RandomForestClassifier(),
+ "mlp":MLPClassifier(),
+ "logistic_reg":LogisticRegression(),
+ "linear_svm":LinearSVC()
+}
+
+parameters = {
+ "naive-bayes":[],
+ #"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}],
+ "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}],
+ "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}],
+ "decision-tree": [{"criterion":["gini","entropy"]}],
+ "random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}],
+ "mlp":[],
+ "logistic_reg":[],
+ "linear_svm":[]
+}
+auc_sbm, auc_spa = get_aucs(G)
+if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa)
+data = []
+pbar = tqdm(parameters)
+for classi_ in parameters:
+ pbar.set_description(classi_)
+ if len(parameters[classi_])>0:
+ clf = GridSearchCV(
+ classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1
+ )
+ clf.fit(X_train,y_train)
+ y_pred = clf.best_estimator_.predict(X_test)
+ else:
+ classifier_dict[classi_].fit(X_train,y_train)
+ y_pred = classifier_dict[classi_].predict(X_test)
+ data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)])
+
+
+df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split())
+df["auc_sbm"] = auc_sbm
+df["auc_spatial"] = auc_spa
+df["alpha"] = ALPHA
+df["nb_nodes"] = GRAPH_NODE_NB
+df["nb_edges"] = GRAPH_EDGE_NB
+df["nb_com"] = NB_COM
+if VERBOSE : print(df)
+df.to_csv("{0}_{1}_{2}_{3}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA),sep="\t",index=None)
\ No newline at end of file
diff --git a/lib/random.py b/lib/random.py
index c6f8898..7980f43 100644
--- a/lib/random.py
+++ b/lib/random.py
@@ -8,6 +8,9 @@ import pandas as pd
from networkx.generators.degree_seq import _to_stublist
from cdlib import algorithms
import random
+float_epsilon = np.finfo(float).eps
+
+
def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
@@ -441,7 +444,7 @@ def get_inter_intra_edges(G, directed=False):
for n1 in list(G.nodes()):
for n2 in list(G.nodes()):
if directed:
- "_".join([str(n1), str(n2)])
+ hash_ = "_".join([str(n1), str(n2)])
else:
hash_ = "_".join(sorted([str(n1), str(n2)]))
if (n1 == n2) or (hash_ in register):
@@ -487,6 +490,8 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
pos = nx.get_node_attributes(G,"pos")
all_probs_spa = np.asarray([1 / (float_epsilon +dist_func(pos[edge[0]], pos[edge[1]])) for edge in all_edges])
all_probs_spa /= all_probs_spa.sum()
+
+
all_probs = alpha * (all_probs_sbm) + (1 - alpha) * all_probs_spa
final_edges = []
@@ -498,4 +503,57 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
G2.nodes[n]["block"] = block_assign[n]
G2.nodes[n]["pos"] = G.nodes[n]["pos"]
- return G2
\ No newline at end of file
+ return G2,all_probs_sbm,all_probs_spa
+
+
+
+def get_sbm_probs(G, percentage_edge_betw, verbose=False):
+ hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])]))
+ def nb_of_pair(N):
+ return (N*(N-1))/2
+
+ block_assign = nx.get_node_attributes(G, "block")
+ nb_com = len(set(block_assign.values()))
+ nb_nodes=len(G)
+ nb_edges = G.size()
+ b_assign_array = np.asarray(list(nx.get_node_attributes(G,"block").values()))
+
+
+
+ u_in = sum([nb_of_pair((b_assign_array==b).sum()) for b in range(nb_com)])
+ u_out = nb_of_pair(len(G)) - u_in
+ l_out = nb_edges*percentage_edge_betw
+ p_out = l_out/u_out
+ l_in = nb_edges - l_out
+
+ p_in = l_in / u_in
+
+ inter_edges, intra_edges = get_inter_intra_edges(G,G.is_directed())
+ inter_edges = np.asarray(inter_edges)
+ intra_edges = np.asarray(intra_edges)
+ inter_N, intra_N = len(inter_edges), len(intra_edges)
+ probs_inter = np.ones(inter_N) * p_out
+ probs_intra = np.ones(intra_N) * p_in
+
+ all_edges = np.concatenate((inter_edges, intra_edges))
+ all_probs = np.concatenate((probs_inter, probs_intra))
+ del probs_inter
+ del probs_intra
+ all_probs /= all_probs.sum()
+ return all_edges,all_probs
+
+
+def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
+ hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])]))
+ pos = nx.get_node_attributes(G, "pos")
+ spat_model = lambda u, v: 1 / (float_epsilon + dist(pos[u], pos[v]))
+ register = set([])
+ edges, probs = [], []
+ for n1 in list(G.nodes()):
+ for n2 in list(G.nodes()):
+ if n1 != n2 and hash_func((n1, n2)) not in register:
+ edges.append([n1, n2])
+ probs.append(spat_model(n1, n2))
+ register.add(hash_func((n1, n2)))
+
+ return edges, probs
\ No newline at end of file
diff --git a/run_eval_mixed_model.sh b/run_eval_mixed_model.sh
new file mode 100644
index 0000000..fed67e7
--- /dev/null
+++ b/run_eval_mixed_model.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+for alpha in 0 0.2 0.5 0.7 1
+do
+ for nbcom in 2 3 4 5
+ do
+ echo "alpha= "$alpha", nb_com= "$nbcom
+ python eval_mixed_model.py 100 200 $nbcom $alpha
+ python eval_mixed_model.py 300 600 $nbcom $alpha
+ done
+done
\ No newline at end of file
--
GitLab