Skip to content
Snippets Groups Projects
Commit 299c9dcd authored by Fize Jacques's avatar Fize Jacques
Browse files

add evaluation script for mixed model

parent 0f3317b7
No related branches found
No related tags found
No related merge requests found
# coding = utf-8
import argparse
from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit as LPEvalSplit
from evalne.utils import preprocess as pp
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
from sklearn.metrics import make_scorer
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
needs_threshold=True)
parser = argparse.ArgumentParser()
parser.add_argument("nb_nodes",type=int)
parser.add_argument("nb_edges",type=int)
parser.add_argument("nb_com",type=int)
parser.add_argument("alpha",type=float)
parser.add_argument("-v","--verbose",action="store_true")
args= parser.parse_args()
GRAPH_NODE_NB = args.nb_nodes
GRAPH_EDGE_NB = args.nb_edges
ALPHA = args.alpha
NB_COM = args.nb_com
NB_ITERATION = 3
VERBOSE = args.verbose
dist = lambda a,b : np.linalg.norm(a-b)**2
hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
def get_aucs(G):
H, _ = pp.prep_graph(G.copy(),maincc=True)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
nee = LPEvaluator(traintest_split)
auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction").test_scores.auroc()
auc_sbm = nee.evaluate_baseline(method="stochastic_block_model").test_scores.auroc()
return auc_sbm,auc_spatial
dist = lambda a,b : np.linalg.norm(a-b)
G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
register = set([])
data = []
for n1 in list(G.nodes()):
for n2 in list(G.nodes()):
if n1 != n2 and hash_func((n1,n2)) not in register:
data.append([n1,n2])
register.add(hash_func((n1,n2)))
df_data = pd.DataFrame(data,columns="u v".split())
pos = nx.get_node_attributes(G,"pos")
block_assign = nx.get_node_attributes(G,"block")
H = G.copy()
float_epsilon = np.finfo(float).eps
for i in range(NB_ITERATION):
auc_sbm,auc_spatial = get_aucs(H)
if VERBOSE : print(auc_sbm,auc_spatial)
if auc_sbm> auc_spatial:
edges,probs = get_sbm_probs(H,ALPHA)
else:
edges,probs = get_spat_probs(H)
probs = np.asarray(probs)
edges = np.asarray(edges)
edge_prob = dict(zip([hash_func(ed) for ed in edges],probs))
probs = np.asarray([(1 if H.has_edge(*ed) else 0)-probs[ix] for ix,ed in enumerate(edges)])
probs = np.asarray([ float_epsilon if p<=0 else p for p in probs])
probs /= probs.sum()
df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1)
final_edges = []
index_selected_pairs = np.random.choice(np.arange(len(edges)), round((H.size()*0.7)), p=probs, replace=False)
final_edges.extend(edges[index_selected_pairs])
G2 = nx.from_edgelist(final_edges)
for n in list(G2.nodes()):
G2.nodes[n]["block"] = block_assign[n]
G2.nodes[n]["pos"] = pos[n]
H=G2.copy()
edge_feature= {hash_func([int(row.u),int(row.v)]):[row.p_0,row.p_1] for ix,row in df_data.iterrows()}
G, _ = pp.prep_graph(G,maincc=True)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
nee = LPEvaluator(traintest_split)
X_train = traintest_split.train_edges
y_train = traintest_split.train_labels
X_test = traintest_split.test_edges
y_test = traintest_split.test_labels
pos = nx.get_node_attributes(G,"pos")
dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train]).reshape(-1,1)
dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test]).reshape(-1,1)
centrality = nx.degree_centrality(G)
centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train])
centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test])
if_not =[0 for i in range(NB_ITERATION-1)]
feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train])
feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test])
##ADD centrality and distance to X train
X_train = np.concatenate((X_train,dist_X_train,centrality_X_train),axis=1)
X_test = np.concatenate((X_test,dist_X_test,centrality_X_test),axis=1)
classifier_dict = {
"naive-bayes":GaussianNB(),
"svm":SVC(),
"sgd":SGDClassifier(),
"knn":KNeighborsClassifier(),
"decision-tree": DecisionTreeClassifier(),
"random-forest":RandomForestClassifier(),
"mlp":MLPClassifier(),
"logistic_reg":LogisticRegression(),
"linear_svm":LinearSVC()
}
parameters = {
"naive-bayes":[],
#"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}],
"sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}],
"knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}],
"decision-tree": [{"criterion":["gini","entropy"]}],
"random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}],
"mlp":[],
"logistic_reg":[],
"linear_svm":[]
}
auc_sbm, auc_spa = get_aucs(G)
if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa)
data = []
pbar = tqdm(parameters)
for classi_ in parameters:
pbar.set_description(classi_)
if len(parameters[classi_])>0:
clf = GridSearchCV(
classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1
)
clf.fit(X_train,y_train)
y_pred = clf.best_estimator_.predict(X_test)
else:
classifier_dict[classi_].fit(X_train,y_train)
y_pred = classifier_dict[classi_].predict(X_test)
data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)])
df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split())
df["auc_sbm"] = auc_sbm
df["auc_spatial"] = auc_spa
df["alpha"] = ALPHA
df["nb_nodes"] = GRAPH_NODE_NB
df["nb_edges"] = GRAPH_EDGE_NB
df["nb_com"] = NB_COM
if VERBOSE : print(df)
df.to_csv("{0}_{1}_{2}_{3}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA),sep="\t",index=None)
\ No newline at end of file
...@@ -8,6 +8,9 @@ import pandas as pd ...@@ -8,6 +8,9 @@ import pandas as pd
from networkx.generators.degree_seq import _to_stublist from networkx.generators.degree_seq import _to_stublist
from cdlib import algorithms from cdlib import algorithms
import random import random
float_epsilon = np.finfo(float).eps
def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1): def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
...@@ -441,7 +444,7 @@ def get_inter_intra_edges(G, directed=False): ...@@ -441,7 +444,7 @@ def get_inter_intra_edges(G, directed=False):
for n1 in list(G.nodes()): for n1 in list(G.nodes()):
for n2 in list(G.nodes()): for n2 in list(G.nodes()):
if directed: if directed:
"_".join([str(n1), str(n2)]) hash_ = "_".join([str(n1), str(n2)])
else: else:
hash_ = "_".join(sorted([str(n1), str(n2)])) hash_ = "_".join(sorted([str(n1), str(n2)]))
if (n1 == n2) or (hash_ in register): if (n1 == n2) or (hash_ in register):
...@@ -487,6 +490,8 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw ...@@ -487,6 +490,8 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
pos = nx.get_node_attributes(G,"pos") pos = nx.get_node_attributes(G,"pos")
all_probs_spa = np.asarray([1 / (float_epsilon +dist_func(pos[edge[0]], pos[edge[1]])) for edge in all_edges]) all_probs_spa = np.asarray([1 / (float_epsilon +dist_func(pos[edge[0]], pos[edge[1]])) for edge in all_edges])
all_probs_spa /= all_probs_spa.sum() all_probs_spa /= all_probs_spa.sum()
all_probs = alpha * (all_probs_sbm) + (1 - alpha) * all_probs_spa all_probs = alpha * (all_probs_sbm) + (1 - alpha) * all_probs_spa
final_edges = [] final_edges = []
...@@ -498,4 +503,57 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw ...@@ -498,4 +503,57 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
G2.nodes[n]["block"] = block_assign[n] G2.nodes[n]["block"] = block_assign[n]
G2.nodes[n]["pos"] = G.nodes[n]["pos"] G2.nodes[n]["pos"] = G.nodes[n]["pos"]
return G2 return G2,all_probs_sbm,all_probs_spa
\ No newline at end of file
def get_sbm_probs(G, percentage_edge_betw, verbose=False):
hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])]))
def nb_of_pair(N):
return (N*(N-1))/2
block_assign = nx.get_node_attributes(G, "block")
nb_com = len(set(block_assign.values()))
nb_nodes=len(G)
nb_edges = G.size()
b_assign_array = np.asarray(list(nx.get_node_attributes(G,"block").values()))
u_in = sum([nb_of_pair((b_assign_array==b).sum()) for b in range(nb_com)])
u_out = nb_of_pair(len(G)) - u_in
l_out = nb_edges*percentage_edge_betw
p_out = l_out/u_out
l_in = nb_edges - l_out
p_in = l_in / u_in
inter_edges, intra_edges = get_inter_intra_edges(G,G.is_directed())
inter_edges = np.asarray(inter_edges)
intra_edges = np.asarray(intra_edges)
inter_N, intra_N = len(inter_edges), len(intra_edges)
probs_inter = np.ones(inter_N) * p_out
probs_intra = np.ones(intra_N) * p_in
all_edges = np.concatenate((inter_edges, intra_edges))
all_probs = np.concatenate((probs_inter, probs_intra))
del probs_inter
del probs_intra
all_probs /= all_probs.sum()
return all_edges,all_probs
def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
hash_func = lambda x: "_".join(sorted([str(x[0]), str(x[1])]))
pos = nx.get_node_attributes(G, "pos")
spat_model = lambda u, v: 1 / (float_epsilon + dist(pos[u], pos[v]))
register = set([])
edges, probs = [], []
for n1 in list(G.nodes()):
for n2 in list(G.nodes()):
if n1 != n2 and hash_func((n1, n2)) not in register:
edges.append([n1, n2])
probs.append(spat_model(n1, n2))
register.add(hash_func((n1, n2)))
return edges, probs
\ No newline at end of file
#!/bin/bash
for alpha in 0 0.2 0.5 0.7 1
do
for nbcom in 2 3 4 5
do
echo "alpha= "$alpha", nb_com= "$nbcom
python eval_mixed_model.py 100 200 $nbcom $alpha
python eval_mixed_model.py 300 600 $nbcom $alpha
done
done
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment