From 299c9dcdb73d5888c0799ae76dad4cb1fc4d15ea Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 12 Apr 2021 14:10:12 +0200
Subject: [PATCH] add evaluation script for mixed model

---
 eval_mixed_model.py     | 184 ++++++++++++++++++++++++++++++++++++++++
 lib/random.py           |  62 +++++++++++++-
 run_eval_mixed_model.sh |  11 +++
 3 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100644 eval_mixed_model.py
 create mode 100644 run_eval_mixed_model.sh

diff --git a/eval_mixed_model.py b/eval_mixed_model.py
new file mode 100644
index 0000000..ba303bb
--- /dev/null
+++ b/eval_mixed_model.py
@@ -0,0 +1,184 @@
+# coding = utf-8
+import argparse
+
+from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+
+from tqdm import tqdm
+
+from evalne.evaluation.evaluator import LPEvaluator
+from evalne.evaluation.split import EvalSplit as LPEvalSplit
+
+from evalne.utils import preprocess as pp
+
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC,LinearSVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.linear_model import SGDClassifier
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
+from sklearn.metrics import make_scorer
+
+# AUROC scorer handed to GridSearchCV in the evaluation loop.
+roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
+                             needs_threshold=True)
+# Command line: graph size, community count and the SBM/spatial mixing weight.
+parser = argparse.ArgumentParser()
+parser.add_argument("nb_nodes",type=int)
+parser.add_argument("nb_edges",type=int)
+parser.add_argument("nb_com",type=int)
+parser.add_argument("alpha",type=float)
+parser.add_argument("-v","--verbose",action="store_true")
+
+args= parser.parse_args()
+GRAPH_NODE_NB = args.nb_nodes
+GRAPH_EDGE_NB = args.nb_edges
+ALPHA = args.alpha
+NB_COM = args.nb_com
+NB_ITERATION = 3
+VERBOSE = args.verbose
+
+# Order-independent string key for a node pair, e.g. (2, 1) -> "1_2".
+hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
+
+def get_aucs(G):
+    """Return (SBM AUROC, spatial AUROC): test scores of both baselines on a 90% train split of a copy of G."""
+    prepared, _ = pp.prep_graph(G.copy(), maincc=True)
+    split = LPEvalSplit()
+    split.compute_splits(prepared, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
+    evaluator = LPEvaluator(split)
+    spatial_auc = evaluator.evaluate_baseline(method="spatial_link_prediction").test_scores.auroc()
+    sbm_auc = evaluator.evaluate_baseline(method="stochastic_block_model").test_scores.auroc()
+    return sbm_auc, spatial_auc
+
+# Euclidean distance between two node positions (used for the distance feature).
+dist = lambda a,b : np.linalg.norm(a-b)
+# Generate the mixed graph; the two probability arrays are not used afterwards in this script.
+G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
+
+# Enumerate every unordered node pair exactly once, in node order; slicing
+# past the current index replaces the quadratic string-hash de-duplication.
+node_list = list(G.nodes())
+data = [[n1, n2] for ix, n1 in enumerate(node_list) for n2 in node_list[ix + 1:]]
+# One row per candidate pair; the "p_<i>" columns are added by the iteration loop.
+df_data = pd.DataFrame(data,columns="u v".split())
+
+
+pos = nx.get_node_attributes(G,"pos")
+block_assign = nx.get_node_attributes(G,"block")
+H = G.copy()
+float_epsilon = np.finfo(float).eps
+for i in range(NB_ITERATION):
+    # Score the current graph with whichever baseline reaches the higher AUROC.
+    auc_sbm,auc_spatial = get_aucs(H)
+    if VERBOSE : print(auc_sbm,auc_spatial)
+    if auc_sbm> auc_spatial:
+        # NOTE(review): ALPHA is passed as `percentage_edge_betw` -- confirm this is intended.
+        edges,probs = get_sbm_probs(H,ALPHA)
+    else:
+        edges,probs = get_spat_probs(H)
+    edges, probs = np.asarray(edges), np.asarray(probs)
+    edge_prob = dict(zip([hash_func(ed) for ed in edges],probs))
+    # Residual weight per pair: (1 if the edge exists else 0) - model score, floored at epsilon.
+    residual = np.asarray([(1 if H.has_edge(*ed) else 0)-probs[ix] for ix,ed in enumerate(edges)])
+    residual = np.asarray([ float_epsilon if p<=0 else p for p in residual])
+    residual /= residual.sum()
+    df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob.get(hash_func([int(x.u),int(x.v)]), 0),axis=1)
+    # Draw 70% of the current edge count, without replacement, to build the next graph.
+    index_selected_pairs = np.random.choice(np.arange(len(edges)), round((H.size()*0.7)), p=residual, replace=False)
+    G2 = nx.from_edgelist(edges[index_selected_pairs])
+    for n in list(G2.nodes()):
+        G2.nodes[n]["block"] = block_assign[n]
+        G2.nodes[n]["pos"] = pos[n]
+    H=G2.copy()
+
+# NOTE(review): only p_0 and p_1 are read although the loop writes NB_ITERATION "p_<i>" columns -- confirm.
+edge_feature= {hash_func([int(row.u),int(row.v)]):[row.p_0,row.p_1] for ix,row in df_data.iterrows()}
+# Split the graph returned by prep_graph(maincc=True): 90% train, spanning-tree split. `nee` is not used below.
+G, _ = pp.prep_graph(G,maincc=True)
+traintest_split = LPEvalSplit()
+traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
+nee = LPEvaluator(traintest_split)
+X_train = traintest_split.train_edges
+y_train = traintest_split.train_labels
+X_test = traintest_split.test_edges
+y_test = traintest_split.test_labels
+
+# Distance between the positions of the two endpoints, one column per edge.
+pos = nx.get_node_attributes(G,"pos")
+dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train]).reshape(-1,1)
+dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test]).reshape(-1,1)
+
+# Degree centrality of both endpoints, two columns per edge.
+centrality = nx.degree_centrality(G)
+centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train])
+centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test])
+# NOTE(review): feature_X_train / feature_X_test are built here but never concatenated below -- confirm.
+if_not =[0 for i in range(NB_ITERATION-1)]
+feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train])
+feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test])
+
+## Append the distance and centrality columns to the edge endpoint columns.
+X_train = np.concatenate((X_train,dist_X_train,centrality_X_train),axis=1)
+X_test = np.concatenate((X_test,dist_X_test,centrality_X_test),axis=1)
+
+
+classifier_dict = {  # classifier name -> unfitted scikit-learn estimator
+    "naive-bayes":GaussianNB(),
+    "svm":SVC(),  # never evaluated: its grid below is commented out and the loop walks `parameters`
+    "sgd":SGDClassifier(),
+    "knn":KNeighborsClassifier(),
+    "decision-tree": DecisionTreeClassifier(),
+    "random-forest":RandomForestClassifier(),
+    "mlp":MLPClassifier(),
+    "logistic_reg":LogisticRegression(),
+    "linear_svm":LinearSVC()
+}
+# Grid-search grids per classifier; an empty list means the estimator is fitted directly, without a search.
+parameters = {
+    "naive-bayes":[],
+    #"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}],
+    "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}],
+    "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}],
+    "decision-tree": [{"criterion":["gini","entropy"]}],
+    "random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}],
+    "mlp":[],
+    "logistic_reg":[],
+    "linear_svm":[]
+}
+auc_sbm, auc_spa = get_aucs(G)
+if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa)
+data = []
+pbar = tqdm(parameters)
+for classi_ in pbar:  # iterate the bar itself so it advances once per classifier
+    pbar.set_description(classi_)
+    if parameters[classi_]:
+        clf = GridSearchCV(
+            classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1
+        )
+        clf.fit(X_train,y_train)
+        y_pred = clf.best_estimator_.predict(X_test)
+    else:
+        classifier_dict[classi_].fit(X_train,y_train)
+        y_pred = classifier_dict[classi_].predict(X_test)
+    # All metrics, AUROC included, are computed from the hard label predictions.
+    data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)])
+
+df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split())
+df["auc_sbm"] = auc_sbm
+df["auc_spatial"] = auc_spa
+df["alpha"] = ALPHA
+df["nb_nodes"] = GRAPH_NODE_NB
+df["nb_edges"] = GRAPH_EDGE_NB
+df["nb_com"] = NB_COM
+if VERBOSE : print(df)
+df.to_csv("{0}_{1}_{2}_{3}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA),sep="\t",index=False)
\ No newline at end of file
diff --git a/lib/random.py b/lib/random.py
index c6f8898..7980f43 100644
--- a/lib/random.py
+++ b/lib/random.py
@@ -8,6 +8,9 @@ import pandas as pd
 from networkx.generators.degree_seq import _to_stublist
 from cdlib import algorithms
 import random
+# Machine epsilon for floats; added to distances so that a zero distance does not divide by zero.
+float_epsilon = np.finfo(float).eps
+
 
 
 def powerlaw(nb_nodes, nb_edges, exponent=2, tries=100, min_deg=1):
@@ -441,7 +444,7 @@ def get_inter_intra_edges(G, directed=False):
     for n1 in list(G.nodes()):
         for n2 in list(G.nodes()):
             if directed:
-                "_".join([str(n1), str(n2)])
+                hash_ = "_".join([str(n1), str(n2)])
             else:
                 hash_ = "_".join(sorted([str(n1), str(n2)]))
             if (n1 == n2) or (hash_ in register):
@@ -487,6 +490,8 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
     pos = nx.get_node_attributes(G,"pos")
     all_probs_spa = np.asarray([1 / (float_epsilon +dist_func(pos[edge[0]], pos[edge[1]])) for edge in all_edges])
     all_probs_spa /= all_probs_spa.sum()
+
+
     all_probs = alpha * (all_probs_sbm) + (1 - alpha) * all_probs_spa
 
     final_edges = []
@@ -498,4 +503,57 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
         G2.nodes[n]["block"] = block_assign[n]
         G2.nodes[n]["pos"] = G.nodes[n]["pos"]
 
-    return G2
\ No newline at end of file
+    return G2,all_probs_sbm,all_probs_spa
+
+
+
+def get_sbm_probs(G, percentage_edge_betw, verbose=False):
+    """
+    Return every node pair of `G` with a normalised block-model probability.
+
+    Pairs that `get_inter_intra_edges` reports as inter-block all receive
+    `p_out` and intra-block pairs all receive `p_in`; both are derived from the
+    edge count of `G` and `percentage_edge_betw`, the fraction of edges placed
+    between blocks. `verbose` is unused and kept for backward compatibility.
+
+    Returns `(all_edges, all_probs)`: inter-block pairs first, then intra-block
+    pairs, with `all_probs` scaled to sum to 1.
+    """
+    def nb_of_pair(N):
+        return (N*(N-1))/2
+
+    nb_edges = G.size()
+    b_assign_array = np.asarray(list(nx.get_node_attributes(G,"block").values()))
+    # Size each block from the labels actually present: `range(nb_com)` miscounted
+    # whenever labels were not exactly 0..nb_com-1 (e.g. after a block vanished).
+    _, block_sizes = np.unique(b_assign_array, return_counts=True)
+    u_in = sum([nb_of_pair(size) for size in block_sizes])
+    u_out = nb_of_pair(len(G)) - u_in
+    l_out = nb_edges*percentage_edge_betw
+    l_in = nb_edges - l_out
+    p_out = l_out/u_out
+    p_in = l_in / u_in
+
+    inter_edges, intra_edges = get_inter_intra_edges(G,G.is_directed())
+    inter_edges = np.asarray(inter_edges)
+    intra_edges = np.asarray(intra_edges)
+    all_edges = np.concatenate((inter_edges, intra_edges))
+    all_probs = np.concatenate((np.full(len(inter_edges), p_out), np.full(len(intra_edges), p_in)))
+    all_probs /= all_probs.sum()
+    return all_edges,all_probs
+
+
+def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
+    """
+    Score every unordered node pair of `G` by inverse distance between positions.
+
+    `dist` maps two "pos" attribute values to a distance (default: squared
+    Euclidean norm). Returns `(edges, probs)`: `[n1, n2]` pairs in node order
+    and the matching `1 / (float_epsilon + dist)` scores, which are NOT normalised.
+    """
+    pos = nx.get_node_attributes(G, "pos")
+    nodes = list(G.nodes())
+    # Slicing past the current index yields each pair once, with no string-key collisions.
+    edges = [[n1, n2] for ix, n1 in enumerate(nodes) for n2 in nodes[ix + 1:]]
+    probs = [1 / (float_epsilon + dist(pos[n1], pos[n2])) for n1, n2 in edges]
+    return edges, probs
\ No newline at end of file
diff --git a/run_eval_mixed_model.sh b/run_eval_mixed_model.sh
new file mode 100644
index 0000000..fed67e7
--- /dev/null
+++ b/run_eval_mixed_model.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Run the mixed-model evaluation for every (alpha, nb_com) pair on two graph sizes.
+for alpha in 0 0.2 0.5 0.7 1
+do
+  for nbcom in 2 3 4 5
+  do
+    echo "alpha= $alpha, nb_com= $nbcom"
+    python eval_mixed_model.py 100 200 "$nbcom" "$alpha"
+    python eval_mixed_model.py 300 600 "$nbcom" "$alpha"
+  done
+done
\ No newline at end of file
-- 
GitLab