Newer
Older
# coding = utf-8
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
from .random import get_spat_probs, get_sbm_probs
from .lambda_func import euclid_dist as dist
from .lambda_func import hash_func
from evalne.utils import preprocess as pp
from evalne.methods.similarity import stochastic_block_model,spatial_link_prediction
import pandas as pd
import networkx as nx
import numpy as np
float_epsilon = np.finfo(float).eps
def log(x):
if VERBOSE:
print(x)
class ErosionModel():
def __init__(self, G):
self.G = G
self.coordinates = nx.get_node_attributes(G, "pos")
self.block_assign = nx.get_node_attributes(G, "block")
self.probs_df = pd.DataFrame(get_all_possible_edges(G), columns="u v".split())
self.initialize()
self.H = G.copy()
self.nb_of_erosion = 0
def erode(self):
test_H, _ = pp.prep_graph(self.H.copy(), maincc=True, relabel=False)
if len(test_H) < 30:
return False
return False
self.nb_of_erosion += 1
old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)
auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
edges = get_all_possible_edges(self.H)
if auc_sbm > auc_spatial:
probs = stochastic_block_model(self.H, edges)
else:
probs = spatial_link_prediction(self.H, edges)
edges = np.asarray(edges)
probs_dom = np.asarray(probs)
sum_prob_dom = probs_dom.sum()
sum_prob_dom_H = sum([probs[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
probs_dom /= sum_prob_dom
edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
self.probs_df["p_{0}".format(self.nb_of_erosion)] = self.probs_df.apply(
lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0,
axis=1)
hhh = np.asarray(
[(1 / self.H.size()) - ((probs_dom[ix]*sum_prob_dom)/sum_prob_dom_H) for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
hhh[hhh < 0] = 0
new_nb_edges = hhh.sum() * self.H.size()
probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
probs_erosion[probs_erosion <= 0] = float_epsilon
final_edges = []
index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
replace=False) # round(0.7*H.size())
final_edges.extend(edges[index_selected_pairs])
G2 = nx.from_edgelist(final_edges)
for n in list(G2.nodes()):
G2.nodes[n]["block"] = self.block_assign[n]
G2.nodes[n]["pos"] = self.coordinates[n]
self.H = G2.copy()
def erode_n_times(self,n):
if self.nb_of_erosion >0:
for i in range(1,self.nb_of_erosion+1):
if "p_{0}".format(i) in self.probs_df:
del self.probs_df["p_{0}".format(i)]
self.nb_of_erosion = 0
self.H = self.G.copy()
for i in range(n):
log(i)
log(self.H.size())
r = self.erode()
if r == False: # we cannot erode further
log("Cannot erode further")
break
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def initialize(self):
self.probs_df["hash_"] = self.probs_df.apply(lambda row: hash_func((int(row.u), int(row.v))), axis=1)
self.probs_df["p_0"] = self.probs_df.apply(lambda x: 1 if self.G.has_edge(x.u, x.v) else 0, axis=1)
def get_features(self,probs_erosion =True,centrality=False,distance=False):
if self.nb_of_erosion <1:
raise ValueError("You must erode your graph before access to computed features")
if not probs_erosion and not centrality and not distance:
raise ValueError("One feature (probs_erosion, centrality, distance) must be selected !")
edge_feature = {
hash_func([int(row.u), int(row.v)]): [row["p_{0}".format(i)] for i in range(1, self.nb_of_erosion + 1)] for
ix, row in self.probs_df.iterrows()}
X_train, X_test, y_train, y_test = split_train_test(self.G)
if distance:
pos = nx.get_node_attributes(self.G, "pos")
dist_X_train = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_train[:, :2]]).reshape(-1, 1)
dist_X_test = np.asarray([dist(pos[ed[0]], pos[ed[1]]) for ed in X_test[:, :2]]).reshape(-1, 1)
X_train = np.concatenate((X_train, dist_X_train), axis=1)
X_test = np.concatenate((X_test, dist_X_test), axis=1)
if centrality:
centrality = nx.degree_centrality(self.G)
centrality_X_train = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_train[:, :2]])
centrality_X_test = np.asarray([[centrality[ed[0]], centrality[ed[1]]] for ed in X_test[:, :2]])
X_train = np.concatenate((X_train, centrality_X_train), axis=1)
X_test = np.concatenate((X_test, centrality_X_test), axis=1)
if probs_erosion:
if_not = [0 for i in range(self.nb_of_erosion)]
feature_X_train = np.asarray(
[(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_train[:, :2]])
feature_X_test = np.asarray(
[(edge_feature[hash_func(ed)] if hash_func(ed) in edge_feature else if_not) for ed in X_test[:, :2]])
X_train = np.concatenate((X_train, feature_X_train), axis=1)
X_test = np.concatenate((X_test, feature_X_test), axis=1)
X_train = X_train[:, 2:]
X_test = X_test[:, 2:]
return X_train,X_test,y_train,y_test
def position_str_process(G):
def foo(x):
return [eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)]
is_pos=True
H = G.copy()
for n in list(H.nodes()):
if not "pos" in H.nodes[n]:
is_pos=False
break
if is_pos:
import re
for node in list(G.nodes()):
try:
H.nodes[node]["pos"] = foo(H.nodes[node]["pos"])
except TypeError:
pass
return H
def eval_erosion_model(G,nb_iter=1,verbose=False):
erod_mod = ErosionModel(G)
erod_mod.erode_n_times(nb_iter)
X_train, X_test, y_train, y_test = erod_mod.get_features(True,True,True)
auc_sbm, auc_spa = get_auc_heuristics(G, 60)
if verbose:print("SBM: ", auc_sbm, "SPATIAL: ", auc_spa)
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]
return auc_sbm,auc_spa,roc_auc_score(y_test, y_pred)