erosionn model v1 done

5235a480 · Fize Jacques · f743ec77 · 5235a480 · 5235a480 · 5235a480
Commit 5235a480 authored 4 years ago by Fize Jacques
--- a/eval_mixed_model.py
+++ b/eval_mixed_model.py
 # coding = utf-8
 import argparse
-
-from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs
-
-import networkx as nx
-import numpy as np
-import pandas as pd
-
-from tqdm import tqdm
-
-from evalne.evaluation.evaluator import LPEvaluator
-from evalne.evaluation.split import EvalSplit as LPEvalSplit
-
-from evalne.utils import preprocess as pp
-
-
-from sklearn.linear_model import LogisticRegression
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.svm import SVC,LinearSVC
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.neural_network import MLPClassifier
-from sklearn.linear_model import SGDClassifier
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
-from sklearn.metrics import make_scorer
-
-roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
-                             needs_threshold=True)
+from lib.random import mixed_model_spat_sbm
+from lib.erosion_model import  eval_erosion_model
+from joblib import Parallel,delayed

 parser = argparse.ArgumentParser()
 parser.add_argument("nb_nodes",type=int)
@@ -51,173 +24,12 @@ NB_COM = args.nb_com
 NB_ITERATION = args.nb_iterations
 VERBOSE = args.verbose
 FEATURES = set(args.features.split(","))
-TIMEOUT = args.timeout
-
-dist = lambda a,b : np.linalg.norm(a-b)**2
-hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
-
-def get_aucs(G):
-    H, _ = pp.prep_graph(G.copy(),maincc=True)
-    traintest_split = LPEvalSplit()
-    traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
-    nee = LPEvaluator(traintest_split)
-    auc_spatial, auc_sbm = 0, 0
-    try:
-        auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=TIMEOUT).test_scores.auroc()
-        auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=TIMEOUT).test_scores.auroc()
-    except:
-        print("Could not compute AUC ! ")
-    return auc_sbm,auc_spatial
-
-dist = lambda a,b : np.linalg.norm(a-b)
-G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
-
-register = set([])
-data = []
-for n1 in list(G.nodes()):
-    for n2 in list(G.nodes()):
-        if n1 != n2 and hash_func((n1,n2)) not in register:
-                data.append([n1,n2])
-                register.add(hash_func((n1,n2)))
-df_data = pd.DataFrame(data,columns="u v".split())
-df_data["hash_"] = df_data.apply(lambda row:hash_func((int(row.u),int(row.v))), axis=1)
-df_data["p_0"] = df_data.apply(lambda x:1 if G.has_edge(x.u,x.v) else 0,axis =1)
-
-
-pos = nx.get_node_attributes(G,"pos")
-block_assign = nx.get_node_attributes(G,"block")
-
-H = G.copy()
-float_epsilon = np.finfo(float).eps
-
-
-for i in range(1,NB_ITERATION+1):
-    if H.size() < 30:
-        df_data["p_{0}".format(i)] = df_data["p_{0}".format(i-1)]
-        continue
-    old_probs = dict(df_data["hash_ p_{0}".format(i-1).split()].values)
-    auc_sbm,auc_spatial = get_aucs(H)
-    if VERBOSE : print("SBM: ",auc_sbm,"SPATIAL: ",auc_spatial)
-    if auc_sbm> auc_spatial:
-        edges,probs = get_sbm_probs(H,0.01)
-    else:
-        edges,probs = get_spat_probs(H)
-
-    edges = np.asarray(edges)
-
-    probs_dom = np.asarray(probs)
-    probs_dom /= probs_dom.sum()
-
-    edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
-    df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1)
-
-    h_probs = np.asarray([(1 / H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if H.has_edge(*ed)])
-    new_nb_edges = h_probs.sum() * H.size()
-    if VERBOSE:print("new NB of Edges",new_nb_edges)
-
-
-    probs_erosion = np.asarray([old_probs[hash_func(ed)]-probs_dom[ix] for ix,ed in enumerate(edges)])
-    probs_erosion[probs_erosion <0] = float_epsilon
-    probs_erosion /= probs_erosion.sum()
-
-    final_edges = []
-    index_selected_pairs = np.random.choice(np.arange(len(edges)),round(new_nb_edges) , p=probs_erosion, replace=False)#round(0.7*H.size())
-    final_edges.extend(edges[index_selected_pairs])
-    G2 = nx.from_edgelist(final_edges)
-    for n in list(G2.nodes()):
-        G2.nodes[n]["block"] = block_assign[n]
-        G2.nodes[n]["pos"] = pos[n]
-    H=G2.copy()
-
-if VERBOSE:print(df_data)
-
-edge_feature= {hash_func([int(row.u),int(row.v)]):[row["p_{0}".format(i)] for i in range(1,NB_ITERATION+1)] for ix,row in df_data.iterrows()}
-
-G, _ = pp.prep_graph(G,maincc=True,relabel=False)
-traintest_split = LPEvalSplit()
-traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
-
-X_train = traintest_split.train_edges
-y_train = traintest_split.train_labels
-X_test = traintest_split.test_edges
-y_test = traintest_split.test_labels
-
-if "pos" in FEATURES:
-    pos = nx.get_node_attributes(G,"pos")
-    dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train[:,:2]]).reshape(-1,1)
-    dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test[:,:2]]).reshape(-1,1)
-
-    X_train = np.concatenate((X_train, dist_X_train), axis=1)
-    X_test = np.concatenate((X_test, dist_X_test), axis=1)
-
-if "centrality" in FEATURES:
-    centrality = nx.degree_centrality(G)
-    centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train[:,:2]])
-    centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test[:,:2]])
-
-    X_train = np.concatenate((X_train, centrality_X_train), axis=1)
-    X_test = np.concatenate((X_test, centrality_X_test), axis=1)
-
+TIMEOUT = 10#args.timeout

-if "it_probs":
-    if_not =[0 for i in range(NB_ITERATION)]
-    feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:,:2]])
-    feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:,:2]])
-    X_train = np.concatenate((X_train, feature_X_train), axis=1)
-    X_test = np.concatenate((X_test, feature_X_test ), axis=1)

-X_train = X_train[:,2:]
-X_test = X_test[:,2:]

-classifier_dict = {
-    "naive-bayes":GaussianNB(),
-    #"svm":SVC(),
-    "sgd":SGDClassifier(),
-    "knn":KNeighborsClassifier(),
-    "decision-tree": DecisionTreeClassifier(),
-    "random-forest":RandomForestClassifier(),
-    "mlp":MLPClassifier(),
-    "logistic_reg":LogisticRegression(),
-    "linear_svm":LinearSVC()
-}
+G = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)

-parameters = {
-    "naive-bayes":[],
-    #"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}],
-    "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}],
-    "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}],
-    "decision-tree": [{"criterion":["gini","entropy"]}],
-    "random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}],
-    "mlp":[],
-    "logistic_reg":[],
-    "linear_svm":[]
-}
-auc_sbm, auc_spa = get_aucs(G)
-if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa)
-data = []
-pbar = tqdm(parameters)
-for classi_ in classifier_dict:
-    pbar.set_description(classi_)
-    if len(parameters[classi_])>0:
-        clf = GridSearchCV(
-            classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1
-        )
-        clf.fit(X_train,y_train)
-        y_pred = clf.best_estimator_.predict(X_test)
-    else:
-        classifier_dict[classi_].fit(X_train,y_train)
-        y_pred = classifier_dict[classi_].predict(X_test)
-    data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)])
+print(eval_erosion_model(G,NB_ITERATION))


-df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split())
-df["auc_sbm"] = auc_sbm
-df["auc_spatial"] = auc_spa
-df["alpha"] = ALPHA
-df["nb_nodes"] = GRAPH_NODE_NB
-df["nb_edges"] = GRAPH_EDGE_NB
-df["nb_com"] = NB_COM
-df["nb_iterations"] = NB_ITERATION
-df["features"] = "_".join(FEATURES)
-if VERBOSE : print(df)
-df.to_csv("{0}_{1}_{2}_{3}_{4}_{5}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA,NB_ITERATION,"_".join(FEATURES)),sep="\t",index=None)
\ No newline at end of file
--- a/generate_mixed_model_graph.py
+++ b/generate_mixed_model_graph.py
+# coding = utf-8
+import itertools
+import os
+
+import networkx as nx
+import argparse
+import numpy as np
+import pandas as pd
+import random
+import copy
+from tqdm import tqdm
+import lib.random as ra
+
+# COMMAND PARSING
+parser = argparse.ArgumentParser()
+parser.add_argument("output_dir")
+args = parser.parse_args()
+
+
+GRAPH_SIZE = [300,1000]
+EDGE_SIZE = [2]
+sample_per_params  = 10
+
+OUTPUT_DIR = args.output_dir
+if not os.path.exists(OUTPUT_DIR):
+    os.makedirs(args.output_dir)
+
+parameters = {
+    "mixed_model_spat_sbm":{
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "nb_com":[2,3,4,5],
+        "alpha":[0,0.2,0.5,0.7,1],
+
+    }
+}
+
+def get_params(inp):
+    return (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
+
+pbar = tqdm(parameters.items(),total=len(parameters))
+for method,args in pbar:
+    pbar.set_description("Generating graphs using : " + method)
+    list_of_params = get_params(parameters[method])
+    func = getattr(ra,method)
+    for ix,params in enumerate(list_of_params):
+        params["nb_edges"] = params["nb_edges"] * params["nb_nodes"]
+        print("Gen graph using the following parameters : ",params)
+        for sp_id in range(sample_per_params):
+            try:
+                G = func(**params)
+                G.graph.update(params)
+                nx.write_gml(G, OUTPUT_DIR+"/graph_{method}_{ix}_{sp_id}.gml".format(method=method,ix=ix,sp_id=sp_id),stringizer=str)
+            except Exception as e:
+                print(e)
+                print("Can't generate graphs using these parameters")
+
+
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -27,27 +27,27 @@ if not os.path.exists(OUTPUT_DIR):


 parameters = {
-    # "stochastic_block_model_graph": {
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE,
-    #     "nb_com" :[2,5,8,16,10,25],
-    #     "percentage_edge_betw":[0.1,0.01]
-    # },
-    # "ER_graph": {
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE
-    # },
-    # "powerlaw_graph": {  # configuration_model
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE,
-    #     "exponent":[2,3],
-    #     "tries":[100]
-    # },
-    # "spatial_graph":{
-    #     "nb_nodes":GRAPH_SIZE,
-    #     "nb_edges":EDGE_SIZE,
-    #     "coords":["random","country"],
-    # },
+    "stochastic_block_model_graph": {
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "nb_com" :[2,5,8,16,10,25],
+        "percentage_edge_betw":[0.1,0.01]
+    },
+    "ER_graph": {
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE
+    },
+    "powerlaw_graph": {  # configuration_model
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "exponent":[2,3],
+        "tries":[100]
+    },
+    "spatial_graph":{
+        "nb_nodes":GRAPH_SIZE,
+        "nb_edges":EDGE_SIZE,
+        "coords":["random","country"],
+    },
    "mixed_model_spat_sbm":{
        "nb_nodes":GRAPH_SIZE,
        "nb_edges":EDGE_SIZE,

--- a/lib/erosion_model.py
+++ b/lib/erosion_model.py
+# coding = utf-8
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import roc_auc_score
+
+from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
+from .random import get_spat_probs, get_sbm_probs
+from .lambda_func import euclid_dist as dist
+from .lambda_func import  hash_func
+
+from evalne.methods.similarity import stochastic_block_model,spatial_link_prediction
+
+import pandas as pd
+import networkx as nx
+import numpy as np
+float_epsilon = np.finfo(float).eps
+
+
+class ErosionModel():
+    def __init__(self, G):
+        self.G = G
+        self.coordinates = nx.get_node_attributes(G, "pos")
+        self.block_assign = nx.get_node_attributes(G, "block")
+        self.probs_df = pd.DataFrame(get_all_possible_edges(G), columns="u v".split())
+        self.initialize()
+        self.H = G.copy()
+
+        self.nb_of_erosion = 0
+
+    def erode(self):
+        self.nb_of_erosion += 1
+
+        if self.H.size() < 30:
+            self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df["p_{0}".format(self.nb_of_erosion - 1)]
+            return
+        old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)
+
+        auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
+        edges = get_all_possible_edges(self.H)
+        if auc_sbm > auc_spatial:
+            probs = stochastic_block_model(self.H, edges)
+        else:
+            probs = spatial_link_prediction(self.H, edges)
+        edges = np.asarray(edges)
+        probs_dom = np.asarray(probs)
+        probs_dom /= probs_dom.sum()
+
+        edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
+        self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df.apply(
+            lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0,
+            axis=1)
+
+        new_nb_edges = (np.asarray(
+            [(1 / self.H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])).sum() * self.H.size()
+
+        probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
+        probs_erosion[probs_erosion < 0] = float_epsilon
+        probs_erosion /= probs_erosion.sum()
+
+        final_edges = []
+        index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
+                                                replace=False)  # round(0.7*H.size())
+        final_edges.extend(edges[index_selected_pairs])
+
+        G2 = nx.from_edgelist(final_edges)
+        for n in list(G2.nodes()):
+            G2.nodes[n]["block"] = self.block_assign[n]
+            G2.nodes[n]["pos"] = self.coordinates[n]
+        self.H = G2.copy()
+
+    def erode_n_times(self,n):
+        if self.nb_of_erosion >0:
+            for i in range(1,self.nb_of_erosion+1):
+                if "p_{0}".format(i) in self.probs_df:
+                    del self.probs_df["p_{0}".format(i)]
+        self.nb_of_erosion = 0
+        self.H = self.G.copy()
+        for i in range(n):
+            self.erode()
+
+
+    def initialize(self):
+        self.probs_df["hash_"] = self.probs_df.apply(lambda row: hash_func((int(row.u), int(row.v))), axis=1)
+        self.probs_df["p_0"] = self.probs_df.apply(lambda x: 1 if self.G.has_edge(x.u, x.v) else 0, axis=1)
+
+
+    def get_features(self,probs_erosion =True,centrality=False,distance=False):
+        if  self.nb_of_erosion <1:
+            raise ValueError("You must erode your graph before access to computed features")
+
+        if not probs_erosion and not centrality and not distance:
+            raise ValueError("One feature (probs_erosion, centrality, distance) must be selected !")
+
+        edge_feature = {
+            hash_func([int(row.u), int(row.v)]): [row["p_{0}".format(i)] for i in range(1, self.nb_of_erosion + 1)] for
+            ix, row in self.probs_df.iterrows()}
+
+        X_train, X_test, y_train, y_test = split_train_test(self.G)
+        if distance:
+            pos = nx.get_node_attributes(self.G, "pos")
+            dist_X_train = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_train[:, :2]]).reshape(-1, 1)
+            dist_X_test = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_test[:, :2]]).reshape(-1, 1)
+
+            X_train = np.concatenate((X_train, dist_X_train), axis=1)
+            X_test = np.concatenate((X_test, dist_X_test), axis=1)
+
+        if centrality:
+            centrality = nx.degree_centrality(self.G)
+            centrality_X_train = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_train[:, :2]])
+            centrality_X_test = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_test[:, :2]])
+
+            X_train = np.concatenate((X_train, centrality_X_train), axis=1)
+            X_test = np.concatenate((X_test, centrality_X_test), axis=1)
+
+        if probs_erosion:
+            if_not = [0 for i in range(self.nb_of_erosion)]
+            feature_X_train = np.asarray(
+                [(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:, :2]])
+            feature_X_test = np.asarray(
+                [(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:, :2]])
+            X_train = np.concatenate((X_train, feature_X_train), axis=1)
+            X_test = np.concatenate((X_test, feature_X_test), axis=1)
+
+        X_train = X_train[:, 2:]
+        X_test = X_test[:, 2:]
+
+        return X_train,X_test,y_train,y_test
+
+
+def eval_erosion_model(G,nb_iter=1,verbose=False):
+    erod_mod = ErosionModel(G)
+    erod_mod.erode_n_times(nb_iter)
+    X_train, X_test, y_train, y_test = erod_mod.get_features()
+
+    auc_sbm, auc_spa = get_auc_heuristics(G, 60)
+    if verbose:print("SBM: ", auc_sbm, "SPATIAL: ", auc_spa)
+
+    clf = LogisticRegression()
+    clf.fit(X_train, y_train)
+    y_pred = clf.predict_proba(X_test)[:, 1]
+    return auc_sbm,auc_spa,roc_auc_score(y_test, y_pred)
\ No newline at end of file
--- a/lib/lambda_func.py
+++ b/lib/lambda_func.py
+# coding = utf-8
+import numpy as np
+
+euclid_dist = lambda a,b : np.linalg.norm(a-b)**2
+hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
\ No newline at end of file
--- a/lib/link_prediction_eval.py
+++ b/lib/link_prediction_eval.py
+# coding = utf-8
+
+from evalne.evaluation.evaluator import LPEvaluator
+from evalne.evaluation.split import EvalSplit as LPEvalSplit
+from evalne.utils import preprocess as pp
+
+from .lambda_func import hash_func
+
+def get_auc_heuristics(G,timeout=60):
+    H, _ = pp.prep_graph(G.copy(),maincc=True)
+    traintest_split = LPEvalSplit()
+    traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
+    nee = LPEvaluator(traintest_split)
+    auc_spatial, auc_sbm = 0, 0
+    try:
+        auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=timeout).test_scores.auroc()
+        auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=timeout).test_scores.auroc()
+    except:
+        print("Could not compute AUC ! ")
+    return auc_sbm,auc_spatial
+
+
+def split_train_test(G,train_frac=0.90,fe_ratio=1):
+    H, _ = pp.prep_graph(G.copy(), maincc=True,relabel=False)
+    traintest_split = LPEvalSplit()
+    traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=train_frac, fe_ratio=fe_ratio)
+
+    X_train = traintest_split.train_edges
+    y_train = traintest_split.train_labels
+    X_test = traintest_split.test_edges
+    y_test = traintest_split.test_labels
+
+    return X_train,X_test,y_train,y_test
+
+def get_all_possible_edges(G):
+    register = set([])
+    data = []
+    for n1 in list(G.nodes()):
+        for n2 in list(G.nodes()):
+            if n1 != n2 and hash_func((n1, n2)) not in register:
+                data.append([n1, n2])
+                register.add(hash_func((n1, n2)))
+    return data
\ No newline at end of file
--- a/lib/random.py
+++ b/lib/random.py
@@ -503,7 +503,7 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
        G2.nodes[n]["block"] = block_assign[n]
        G2.nodes[n]["pos"] = G.nodes[n]["pos"]

-    return G2,all_probs_sbm,all_probs_spa
+    return G2#,all_probs_sbm,all_probs_spa