Skip to content
Snippets Groups Projects
Commit 5235a480 authored by Fize Jacques's avatar Fize Jacques
Browse files

erosionn model v1 done

parent f743ec77
No related branches found
No related tags found
No related merge requests found
# coding = utf-8
import argparse
from lib.random import mixed_model_spat_sbm, get_spat_probs, get_sbm_probs
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit as LPEvalSplit
from evalne.utils import preprocess as pp
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
from sklearn.metrics import make_scorer
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
needs_threshold=True)
from lib.random import mixed_model_spat_sbm
from lib.erosion_model import eval_erosion_model
from joblib import Parallel,delayed
parser = argparse.ArgumentParser()
parser.add_argument("nb_nodes",type=int)
......@@ -51,173 +24,12 @@ NB_COM = args.nb_com
NB_ITERATION = args.nb_iterations
VERBOSE = args.verbose
FEATURES = set(args.features.split(","))
TIMEOUT = args.timeout
dist = lambda a,b : np.linalg.norm(a-b)**2
hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
def get_aucs(G):
H, _ = pp.prep_graph(G.copy(),maincc=True)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
nee = LPEvaluator(traintest_split)
auc_spatial, auc_sbm = 0, 0
try:
auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=TIMEOUT).test_scores.auroc()
auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=TIMEOUT).test_scores.auroc()
except:
print("Could not compute AUC ! ")
return auc_sbm,auc_spatial
dist = lambda a,b : np.linalg.norm(a-b)
G,all_probs_sbm,all_probs_spa = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
register = set([])
data = []
for n1 in list(G.nodes()):
for n2 in list(G.nodes()):
if n1 != n2 and hash_func((n1,n2)) not in register:
data.append([n1,n2])
register.add(hash_func((n1,n2)))
df_data = pd.DataFrame(data,columns="u v".split())
df_data["hash_"] = df_data.apply(lambda row:hash_func((int(row.u),int(row.v))), axis=1)
df_data["p_0"] = df_data.apply(lambda x:1 if G.has_edge(x.u,x.v) else 0,axis =1)
pos = nx.get_node_attributes(G,"pos")
block_assign = nx.get_node_attributes(G,"block")
H = G.copy()
float_epsilon = np.finfo(float).eps
for i in range(1,NB_ITERATION+1):
if H.size() < 30:
df_data["p_{0}".format(i)] = df_data["p_{0}".format(i-1)]
continue
old_probs = dict(df_data["hash_ p_{0}".format(i-1).split()].values)
auc_sbm,auc_spatial = get_aucs(H)
if VERBOSE : print("SBM: ",auc_sbm,"SPATIAL: ",auc_spatial)
if auc_sbm> auc_spatial:
edges,probs = get_sbm_probs(H,0.01)
else:
edges,probs = get_spat_probs(H)
edges = np.asarray(edges)
probs_dom = np.asarray(probs)
probs_dom /= probs_dom.sum()
edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
df_data["p_{0}".format(i)] = df_data.apply(lambda x: edge_prob[hash_func([int(x.u),int(x.v)])] if hash_func([int(x.u),int(x.v)]) in edge_prob else 0,axis=1)
h_probs = np.asarray([(1 / H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if H.has_edge(*ed)])
new_nb_edges = h_probs.sum() * H.size()
if VERBOSE:print("new NB of Edges",new_nb_edges)
probs_erosion = np.asarray([old_probs[hash_func(ed)]-probs_dom[ix] for ix,ed in enumerate(edges)])
probs_erosion[probs_erosion <0] = float_epsilon
probs_erosion /= probs_erosion.sum()
final_edges = []
index_selected_pairs = np.random.choice(np.arange(len(edges)),round(new_nb_edges) , p=probs_erosion, replace=False)#round(0.7*H.size())
final_edges.extend(edges[index_selected_pairs])
G2 = nx.from_edgelist(final_edges)
for n in list(G2.nodes()):
G2.nodes[n]["block"] = block_assign[n]
G2.nodes[n]["pos"] = pos[n]
H=G2.copy()
if VERBOSE:print(df_data)
edge_feature= {hash_func([int(row.u),int(row.v)]):[row["p_{0}".format(i)] for i in range(1,NB_ITERATION+1)] for ix,row in df_data.iterrows()}
G, _ = pp.prep_graph(G,maincc=True,relabel=False)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
X_train = traintest_split.train_edges
y_train = traintest_split.train_labels
X_test = traintest_split.test_edges
y_test = traintest_split.test_labels
if "pos" in FEATURES:
pos = nx.get_node_attributes(G,"pos")
dist_X_train = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_train[:,:2]]).reshape(-1,1)
dist_X_test = np.asarray([dist(pos[ed[0]],pos[ed[1]]) for ed in X_test[:,:2]]).reshape(-1,1)
X_train = np.concatenate((X_train, dist_X_train), axis=1)
X_test = np.concatenate((X_test, dist_X_test), axis=1)
if "centrality" in FEATURES:
centrality = nx.degree_centrality(G)
centrality_X_train = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_train[:,:2]])
centrality_X_test = np.asarray([[centrality[ed[0]],centrality[ed[1]]] for ed in X_test[:,:2]])
X_train = np.concatenate((X_train, centrality_X_train), axis=1)
X_test = np.concatenate((X_test, centrality_X_test), axis=1)
TIMEOUT = 10#args.timeout
if "it_probs":
if_not =[0 for i in range(NB_ITERATION)]
feature_X_train = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:,:2]])
feature_X_test = np.asarray([ (edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:,:2]])
X_train = np.concatenate((X_train, feature_X_train), axis=1)
X_test = np.concatenate((X_test, feature_X_test ), axis=1)
X_train = X_train[:,2:]
X_test = X_test[:,2:]
classifier_dict = {
"naive-bayes":GaussianNB(),
#"svm":SVC(),
"sgd":SGDClassifier(),
"knn":KNeighborsClassifier(),
"decision-tree": DecisionTreeClassifier(),
"random-forest":RandomForestClassifier(),
"mlp":MLPClassifier(),
"logistic_reg":LogisticRegression(),
"linear_svm":LinearSVC()
}
G = mixed_model_spat_sbm(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,alpha=ALPHA)
parameters = {
"naive-bayes":[],
#"svm":[{"kernel":["rbf","linear"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}],
"sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}],
"knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}],
"decision-tree": [{"criterion":["gini","entropy"]}],
"random-forest":[{"criterion":["gini","entropy"],"n_estimators":[10,50,100]}],
"mlp":[],
"logistic_reg":[],
"linear_svm":[]
}
auc_sbm, auc_spa = get_aucs(G)
if VERBOSE: print("SBM AUUROC",auc_sbm,"SPATIAL AUROC",auc_spa)
data = []
pbar = tqdm(parameters)
for classi_ in classifier_dict:
pbar.set_description(classi_)
if len(parameters[classi_])>0:
clf = GridSearchCV(
classifier_dict[classi_], parameters[classi_], scoring=roc_auc_scorer, n_jobs=-1
)
clf.fit(X_train,y_train)
y_pred = clf.best_estimator_.predict(X_test)
else:
classifier_dict[classi_].fit(X_train,y_train)
y_pred = classifier_dict[classi_].predict(X_test)
data.append([classifier_dict[classi_].__class__.__name__,precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,y_pred)])
print(eval_erosion_model(G,NB_ITERATION))
df = pd.DataFrame(data,columns="method precision recall f1-score auroc".split())
df["auc_sbm"] = auc_sbm
df["auc_spatial"] = auc_spa
df["alpha"] = ALPHA
df["nb_nodes"] = GRAPH_NODE_NB
df["nb_edges"] = GRAPH_EDGE_NB
df["nb_com"] = NB_COM
df["nb_iterations"] = NB_ITERATION
df["features"] = "_".join(FEATURES)
if VERBOSE : print(df)
df.to_csv("{0}_{1}_{2}_{3}_{4}_{5}.csv".format(GRAPH_NODE_NB,GRAPH_EDGE_NB,NB_COM,ALPHA,NB_ITERATION,"_".join(FEATURES)),sep="\t",index=None)
\ No newline at end of file
# coding = utf-8
import itertools
import os
import networkx as nx
import argparse
import numpy as np
import pandas as pd
import random
import copy
from tqdm import tqdm
import lib.random as ra
# COMMAND PARSING
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
args = parser.parse_args()
GRAPH_SIZE = [300,1000]
EDGE_SIZE = [2]
sample_per_params = 10
OUTPUT_DIR = args.output_dir
if not os.path.exists(OUTPUT_DIR):
os.makedirs(args.output_dir)
parameters = {
"mixed_model_spat_sbm":{
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_com":[2,3,4,5],
"alpha":[0,0.2,0.5,0.7,1],
}
}
def get_params(inp):
return (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
pbar = tqdm(parameters.items(),total=len(parameters))
for method,args in pbar:
pbar.set_description("Generating graphs using : " + method)
list_of_params = get_params(parameters[method])
func = getattr(ra,method)
for ix,params in enumerate(list_of_params):
params["nb_edges"] = params["nb_edges"] * params["nb_nodes"]
print("Gen graph using the following parameters : ",params)
for sp_id in range(sample_per_params):
try:
G = func(**params)
G.graph.update(params)
nx.write_gml(G, OUTPUT_DIR+"/graph_{method}_{ix}_{sp_id}.gml".format(method=method,ix=ix,sp_id=sp_id),stringizer=str)
except Exception as e:
print(e)
print("Can't generate graphs using these parameters")
......@@ -27,27 +27,27 @@ if not os.path.exists(OUTPUT_DIR):
parameters = {
# "stochastic_block_model_graph": {
# "nb_nodes":GRAPH_SIZE,
# "nb_edges":EDGE_SIZE,
# "nb_com" :[2,5,8,16,10,25],
# "percentage_edge_betw":[0.1,0.01]
# },
# "ER_graph": {
# "nb_nodes":GRAPH_SIZE,
# "nb_edges":EDGE_SIZE
# },
# "powerlaw_graph": { # configuration_model
# "nb_nodes":GRAPH_SIZE,
# "nb_edges":EDGE_SIZE,
# "exponent":[2,3],
# "tries":[100]
# },
# "spatial_graph":{
# "nb_nodes":GRAPH_SIZE,
# "nb_edges":EDGE_SIZE,
# "coords":["random","country"],
# },
"stochastic_block_model_graph": {
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_com" :[2,5,8,16,10,25],
"percentage_edge_betw":[0.1,0.01]
},
"ER_graph": {
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE
},
"powerlaw_graph": { # configuration_model
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"exponent":[2,3],
"tries":[100]
},
"spatial_graph":{
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"coords":["random","country"],
},
"mixed_model_spat_sbm":{
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
......
# coding = utf-8
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
from .random import get_spat_probs, get_sbm_probs
from .lambda_func import euclid_dist as dist
from .lambda_func import hash_func
from evalne.methods.similarity import stochastic_block_model,spatial_link_prediction
import pandas as pd
import networkx as nx
import numpy as np
float_epsilon = np.finfo(float).eps
class ErosionModel():
def __init__(self, G):
self.G = G
self.coordinates = nx.get_node_attributes(G, "pos")
self.block_assign = nx.get_node_attributes(G, "block")
self.probs_df = pd.DataFrame(get_all_possible_edges(G), columns="u v".split())
self.initialize()
self.H = G.copy()
self.nb_of_erosion = 0
def erode(self):
self.nb_of_erosion += 1
if self.H.size() < 30:
self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df["p_{0}".format(self.nb_of_erosion - 1)]
return
old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)
auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
edges = get_all_possible_edges(self.H)
if auc_sbm > auc_spatial:
probs = stochastic_block_model(self.H, edges)
else:
probs = spatial_link_prediction(self.H, edges)
edges = np.asarray(edges)
probs_dom = np.asarray(probs)
probs_dom /= probs_dom.sum()
edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df.apply(
lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0,
axis=1)
new_nb_edges = (np.asarray(
[(1 / self.H.size()) - probs_dom[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])).sum() * self.H.size()
probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
probs_erosion[probs_erosion < 0] = float_epsilon
probs_erosion /= probs_erosion.sum()
final_edges = []
index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
replace=False) # round(0.7*H.size())
final_edges.extend(edges[index_selected_pairs])
G2 = nx.from_edgelist(final_edges)
for n in list(G2.nodes()):
G2.nodes[n]["block"] = self.block_assign[n]
G2.nodes[n]["pos"] = self.coordinates[n]
self.H = G2.copy()
def erode_n_times(self,n):
if self.nb_of_erosion >0:
for i in range(1,self.nb_of_erosion+1):
if "p_{0}".format(i) in self.probs_df:
del self.probs_df["p_{0}".format(i)]
self.nb_of_erosion = 0
self.H = self.G.copy()
for i in range(n):
self.erode()
def initialize(self):
self.probs_df["hash_"] = self.probs_df.apply(lambda row: hash_func((int(row.u), int(row.v))), axis=1)
self.probs_df["p_0"] = self.probs_df.apply(lambda x: 1 if self.G.has_edge(x.u, x.v) else 0, axis=1)
def get_features(self,probs_erosion =True,centrality=False,distance=False):
if self.nb_of_erosion <1:
raise ValueError("You must erode your graph before access to computed features")
if not probs_erosion and not centrality and not distance:
raise ValueError("One feature (probs_erosion, centrality, distance) must be selected !")
edge_feature = {
hash_func([int(row.u), int(row.v)]): [row["p_{0}".format(i)] for i in range(1, self.nb_of_erosion + 1)] for
ix, row in self.probs_df.iterrows()}
X_train, X_test, y_train, y_test = split_train_test(self.G)
if distance:
pos = nx.get_node_attributes(self.G, "pos")
dist_X_train = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_train[:, :2]]).reshape(-1, 1)
dist_X_test = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_test[:, :2]]).reshape(-1, 1)
X_train = np.concatenate((X_train, dist_X_train), axis=1)
X_test = np.concatenate((X_test, dist_X_test), axis=1)
if centrality:
centrality = nx.degree_centrality(self.G)
centrality_X_train = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_train[:, :2]])
centrality_X_test = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_test[:, :2]])
X_train = np.concatenate((X_train, centrality_X_train), axis=1)
X_test = np.concatenate((X_test, centrality_X_test), axis=1)
if probs_erosion:
if_not = [0 for i in range(self.nb_of_erosion)]
feature_X_train = np.asarray(
[(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:, :2]])
feature_X_test = np.asarray(
[(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:, :2]])
X_train = np.concatenate((X_train, feature_X_train), axis=1)
X_test = np.concatenate((X_test, feature_X_test), axis=1)
X_train = X_train[:, 2:]
X_test = X_test[:, 2:]
return X_train,X_test,y_train,y_test
def eval_erosion_model(G,nb_iter=1,verbose=False):
erod_mod = ErosionModel(G)
erod_mod.erode_n_times(nb_iter)
X_train, X_test, y_train, y_test = erod_mod.get_features()
auc_sbm, auc_spa = get_auc_heuristics(G, 60)
if verbose:print("SBM: ", auc_sbm, "SPATIAL: ", auc_spa)
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]
return auc_sbm,auc_spa,roc_auc_score(y_test, y_pred)
\ No newline at end of file
# coding = utf-8
import numpy as np
euclid_dist = lambda a,b : np.linalg.norm(a-b)**2
hash_func = lambda x:"_".join(sorted([str(x[0]),str(x[1])]))
\ No newline at end of file
# coding = utf-8
from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit as LPEvalSplit
from evalne.utils import preprocess as pp
from .lambda_func import hash_func
def get_auc_heuristics(G,timeout=60):
H, _ = pp.prep_graph(G.copy(),maincc=True)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=0.90, fe_ratio=1)
nee = LPEvaluator(traintest_split)
auc_spatial, auc_sbm = 0, 0
try:
auc_spatial = nee.evaluate_baseline(method="spatial_link_prediction",timeout=timeout).test_scores.auroc()
auc_sbm = nee.evaluate_baseline(method="stochastic_block_model",timeout=timeout).test_scores.auroc()
except:
print("Could not compute AUC ! ")
return auc_sbm,auc_spatial
def split_train_test(G,train_frac=0.90,fe_ratio=1):
H, _ = pp.prep_graph(G.copy(), maincc=True,relabel=False)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(H, split_alg="spanning_tree", train_frac=train_frac, fe_ratio=fe_ratio)
X_train = traintest_split.train_edges
y_train = traintest_split.train_labels
X_test = traintest_split.test_edges
y_test = traintest_split.test_labels
return X_train,X_test,y_train,y_test
def get_all_possible_edges(G):
register = set([])
data = []
for n1 in list(G.nodes()):
for n2 in list(G.nodes()):
if n1 != n2 and hash_func((n1, n2)) not in register:
data.append([n1, n2])
register.add(hash_func((n1, n2)))
return data
\ No newline at end of file
......@@ -503,7 +503,7 @@ def mixed_model_spat_sbm(nb_nodes, nb_edges, nb_com, alpha, percentage_edge_betw
G2.nodes[n]["block"] = block_assign[n]
G2.nodes[n]["pos"] = G.nodes[n]["pos"]
return G2,all_probs_sbm,all_probs_spa
return G2#,all_probs_sbm,all_probs_spa
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment