# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Tang J, Qu M, Wang M, et al. Line: Large-scale information network embedding[C]//Proceedings of the 24th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2015: 1067-1077.(https://arxiv.org/pdf/1503.03578.pdf)
"""
import math
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Embedding, Input, Lambda
from tensorflow.python.keras.models import Model
import alias
import utils
import time
def line_loss(y_true, y_pred):
    return -K.mean(K.log(K.sigmoid(y_true * y_pred)))
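# Hedged illustration (not part of the original file): the same loss written in
# plain NumPy. y_true carries the sample sign (+1 for an observed edge, -1 for a
# negative sample) and y_pred is the dot product produced by the model, so the
# loss is -log(sigmoid(sign * score)) averaged over the batch.
def line_loss_np(sign, score):
    return float(np.mean(-np.log(1.0 / (1.0 + np.exp(-sign * score)))))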
def create_model(numNodes, embedding_size, order='second'):
    v_i = Input(shape=(1,))
    v_j = Input(shape=(1,))

    first_emb = Embedding(numNodes, embedding_size, name='first_emb')
    second_emb = Embedding(numNodes, embedding_size, name='second_emb')
    context_emb = Embedding(numNodes, embedding_size, name='context_emb')

    v_i_emb = first_emb(v_i)
    v_j_emb = first_emb(v_j)

    v_i_emb_second = second_emb(v_i)
    v_j_context_emb = context_emb(v_j)

    first = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keepdims=False), name='first_order')([v_i_emb, v_j_emb])
    second = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keepdims=False), name='second_order')([v_i_emb_second, v_j_context_emb])

    if order == 'first':
        output_list = [first]
    elif order == 'second':
        output_list = [second]
    else:
        output_list = [first, second]

    model = Model(inputs=[v_i, v_j], outputs=output_list)

    return model, {'first': first_emb, 'second': second_emb}
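# Hedged sketch (not part of the original file): build a tiny second-order model
# and score two (v_i, v_j) pairs. Each output row is the dot product of the node
# embedding of v_i with the context embedding of v_j; model/layer names here are
# purely illustrative.
def _create_model_demo():
    demo_model, demo_emb_layers = create_model(numNodes=5, embedding_size=4, order='second')
    scores = demo_model.predict([np.array([0, 1]), np.array([2, 3])])
    print(scores.shape)  # one score per input pair
    return demo_emb_layers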
class LINE:
    def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',):
        """
        :param graph:
        :param embedding_size:
        :param negative_ratio:
        :param order: 'first','second','all'
        """
        if order not in ['first', 'second', 'all']:
            raise ValueError("order must be 'first', 'second' or 'all'")

        self.graph = graph
        self.idx2node, self.node2idx = utils.preprocess_nxgraph(graph)
        self.use_alias = True

        self.rep_size = embedding_size
        self.order = order

        self._embeddings = {}
        self.negative_ratio = negative_ratio

        self.node_size = graph.number_of_nodes()
        self.edge_size = graph.number_of_edges()
        self.samples_per_epoch = self.edge_size * (1 + negative_ratio)

        self._gen_sampling_table()
        self.reset_model()
    def reset_training_config(self, batch_size, times):
        self.batch_size = batch_size
        self.steps_per_epoch = (
            (self.samples_per_epoch - 1) // self.batch_size + 1) * times

    def reset_model(self, opt='adam'):
        self.model, self.embedding_dict = create_model(
            self.node_size, self.rep_size, self.order)
        self.model.compile(opt, line_loss)
        self.batch_it = self.batch_iter(self.node2idx)
    def _gen_sampling_table(self):
        # create sampling table for vertices
        power = 0.75
        numNodes = self.node_size
        node_degree = np.zeros(numNodes)  # out degree
        node2idx = self.node2idx

        for edge in self.graph.edges():
            node_degree[node2idx[edge[0]]] += self.graph[edge[0]][edge[1]].get('weight', 1.0)

        total_sum = sum([math.pow(node_degree[i], power)
                         for i in range(numNodes)])
        norm_prob = [float(math.pow(node_degree[j], power)) /
                     total_sum for j in range(numNodes)]

        self.node_accept, self.node_alias = alias.create_alias_table(norm_prob)

        # create sampling table for edges
        numEdges = self.graph.number_of_edges()
        total_sum = sum([self.graph[edge[0]][edge[1]].get('weight', 1.0)
                         for edge in self.graph.edges()])
        norm_prob = [self.graph[edge[0]][edge[1]].get('weight', 1.0) *
                     numEdges / total_sum for edge in self.graph.edges()]

        self.edge_accept, self.edge_alias = alias.create_alias_table(norm_prob)
    def batch_iter(self, node2idx):
        edges = [(node2idx[x[0]], node2idx[x[1]]) for x in self.graph.edges()]

        data_size = self.graph.number_of_edges()
        shuffle_indices = np.random.permutation(np.arange(data_size))

        # positive or negative mod
        mod = 0
        mod_size = 1 + self.negative_ratio
        h = []
        t = []
        sign = 0
        count = 0
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while True:
            if mod == 0:
                h = []
                t = []
                for i in range(start_index, end_index):
                    if random.random() >= self.edge_accept[shuffle_indices[i]]:
                        shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
                    cur_h = edges[shuffle_indices[i]][0]
                    cur_t = edges[shuffle_indices[i]][1]
                    h.append(cur_h)
                    t.append(cur_t)
                sign = np.ones(len(h))
            else:
                sign = np.ones(len(h)) * -1
                t = []
                for i in range(len(h)):
                    t.append(alias.alias_sample(
                        self.node_accept, self.node_alias))

            if self.order == 'all':
                yield ([np.array(h), np.array(t)], [sign, sign])
            else:
                yield ([np.array(h), np.array(t)], [sign])
            mod += 1
            mod %= mod_size
            if mod == 0:
                start_index = end_index
                end_index = min(start_index + self.batch_size, data_size)

            if start_index >= data_size:
                count += 1
                mod = 0
                h = []
                shuffle_indices = np.random.permutation(np.arange(data_size))
                start_index = 0
                end_index = min(start_index + self.batch_size, data_size)
    def get_embeddings(self,):
        self._embeddings = {}
        # NOTE: `result` assumes integer node labels below 100 and a
        # 64-dimensional embedding (the settings used by the accompanying
        # read_all() script); `self._embeddings` is the general output.
        result = np.zeros((100, 64))
        if self.order == 'first':
            embeddings = self.embedding_dict['first'].get_weights()[0]
        elif self.order == 'second':
            embeddings = self.embedding_dict['second'].get_weights()[0]
        else:
            embeddings = np.hstack((self.embedding_dict['first'].get_weights()[0],
                                    self.embedding_dict['second'].get_weights()[0]))
        idx2node = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[idx2node[i]] = embedding
            result[int(idx2node[i])] = embedding

        return self._embeddings, result
    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
        self.reset_training_config(batch_size, times)
        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch,
                                        steps_per_epoch=self.steps_per_epoch,
                                        verbose=verbose)
        return hist
import numpy as np
import classify
from sklearn.linear_model import LogisticRegression

import line
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE


def evaluate_embeddings(embeddings):
    X, Y = classify.read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = classify.Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = classify.read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


def read(arr):
    G = nx.Graph()
    for a, b in arr:
        if not G.has_node(a):
            G.add_node(a)
        if not G.has_node(b):
            G.add_node(b)
        G.add_edge(a, b, weight=1)
    return G
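# Illustrative example (not part of the original script): read() turns a list of
# node pairs into an undirected, unit-weight networkx graph, e.g.
#   read([(0, 1), (1, 2)])  ->  Graph with nodes {0, 1, 2} and edges (0, 1), (1, 2)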
def read_all():
    data = np.load('graph/data_val.npy', allow_pickle=True)
    id = 0
    for x in data:
        G = read(x)
        model = line.LINE(G, embedding_size=64, order='second')
        model.train(batch_size=120, epochs=100, verbose=2)
        embeddings, result = model.get_embeddings()
        result = np.asarray(result)
        name = str('graph/data_val.npy')
        name = name[:name.index('.')]
        np.save(name + "\\transformed_" + str(id), result)
        print(id, "DONE")
        id += 1
    return model
if __name__ == "__main__":
    '''G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    model = line.LINE(G, embedding_size=64, order='second')
    model.train(batch_size=1024, epochs=50, verbose=2)
    print("here", model)
    embeddings, result = model.get_embeddings()
    # print(embeddings)
    result = np.asarray(embeddings)
    # print(result)
    # evaluate_embeddings(embeddings)
    # plot_embeddings(embeddings)'''
    model = read_all()
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)
"""
from gensim.models import Word2Vec
import pandas as pd
from ..walker import RandomWalker
class Node2Vec:

    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):

        self.graph = graph
        self._embeddings = {}
        self.w2v_model = None
        self.walker = RandomWalker(
            graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling)

        print("Preprocess transition probs...")
        self.walker.preprocess_transition_probs()

        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1
        kwargs["hs"] = 0  # node2vec does not use Hierarchical Softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter

        print("Learning embedding vectors...")
        model = Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model
        return model

    def get_embeddings(self,):
        if self.w2v_model is None:
            print("model not trained")
            return {}

        self._embeddings = {}
        for word in self.graph.nodes():
            self._embeddings[word] = self.w2v_model.wv[word]

        return self._embeddings
import numpy as np
from ge.classify import read_node_label,Classifier
from ge import Node2Vec
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])
    model = Node2Vec(G, 10, 80, workers=1, p=0.25, q=2, use_rejection_sampling=0)
    model.train()
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
import numpy as np
from ge.classify import read_node_label, Classifier
from ge import Node2Vec
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    model = Node2Vec(G, walk_length=10, num_walks=80,
                     p=0.25, q=4, workers=1, use_rejection_sampling=0)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Wang D, Cui P, Zhu W. Structural deep network embedding[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 1225-1234.(https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf)
"""
import time
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import History
from tensorflow.python.keras.layers import Dense, Input
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.regularizers import l1_l2
from ..utils import preprocess_nxgraph
def l_2nd(beta):
    def loss_2nd(y_true, y_pred):
        # Weight the reconstruction error on the non-zero entries of the
        # adjacency row by beta. Written with backend ops so the loss stays
        # symbolic; the original in-place NumPy assignment
        # (b_[y_true != 0] = beta) does not work on Keras tensors.
        b_ = K.ones_like(y_true) + (beta - 1) * K.cast(K.not_equal(y_true, 0), K.floatx())
        x = K.square((y_true - y_pred) * b_)
        t = K.sum(x, axis=-1, )
        return K.mean(t)
    return loss_2nd
def l_1st(alpha):
    def loss_1st(y_true, y_pred):
        L = y_true
        Y = y_pred
        batch_size = tf.to_float(K.shape(L)[0])
        return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size
    return loss_1st
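# Hedged illustration (not part of the original file): the first-order loss is
# alpha * 2 * tr(Y^T L Y) / batch_size, and for the unnormalized Laplacian
# L = D - A this trace equals (1/2) * sum_{i,j} a_ij * ||y_i - y_j||^2, i.e.
# connected nodes are pulled together in embedding space. The toy check below
# verifies that identity with plain NumPy on a 3-node path graph.
def _first_order_trace_identity_demo():
    A = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
    D = np.diag(A.sum(axis=1))
    L_mat = D - A
    Y = np.array([[0.0, 1.0],
                  [2.0, 0.5],
                  [1.0, 3.0]])
    trace_form = np.trace(Y.T @ L_mat @ Y)
    pairwise_form = 0.5 * sum(A[i, j] * np.sum((Y[i] - Y[j]) ** 2)
                              for i in range(3) for j in range(3))
    assert np.isclose(trace_form, pairwise_form)  # both equal 11.5 for this toy example
    return trace_form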
def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4):
    A = Input(shape=(node_size,))
    L = Input(shape=(None,))
    fc = A
    for i in range(len(hidden_size)):
        if i == len(hidden_size) - 1:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2), name='1st')(fc)
        else:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2))(fc)
    Y = fc
    for i in reversed(range(len(hidden_size) - 1)):
        fc = Dense(hidden_size[i], activation='relu',
                   kernel_regularizer=l1_l2(l1, l2))(fc)

    A_ = Dense(node_size, 'relu', name='2nd')(fc)
    model = Model(inputs=[A, L], outputs=[A_, Y])
    emb = Model(inputs=A, outputs=Y)
    return model, emb
class SDNE(object):
    def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4, ):
        self.graph = graph
        # self.g.remove_edges_from(self.g.selfloop_edges())
        self.idx2node, self.node2idx = preprocess_nxgraph(self.graph)

        self.node_size = self.graph.number_of_nodes()
        self.hidden_size = hidden_size
        self.alpha = alpha
        self.beta = beta
        self.nu1 = nu1
        self.nu2 = nu2

        self.A, self.L = self._create_A_L(
            self.graph, self.node2idx)  # Adj Matrix, L Matrix
        self.reset_model()
        self.inputs = [self.A, self.L]
        self._embeddings = {}

    def reset_model(self, opt='adam'):
        self.model, self.emb_model = create_model(self.node_size, hidden_size=self.hidden_size, l1=self.nu1,
                                                  l2=self.nu2)
        self.model.compile(opt, [l_2nd(self.beta), l_1st(self.alpha)])
        self.get_embeddings()

    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1):
        if batch_size >= self.node_size:
            if batch_size > self.node_size:
                print('batch_size({0}) > node_size({1}), set batch_size = {1}'.format(
                    batch_size, self.node_size))
                batch_size = self.node_size
            return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()],
                                  batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch,
                                  verbose=verbose, shuffle=False, )
        else:
            steps_per_epoch = (self.node_size - 1) // batch_size + 1
            hist = History()
            hist.on_train_begin()
            logs = {}
            for epoch in range(initial_epoch, epochs):
                start_time = time.time()
                losses = np.zeros(3)
                for i in range(steps_per_epoch):
                    index = np.arange(
                        i * batch_size, min((i + 1) * batch_size, self.node_size))
                    A_train = self.A[index, :].todense()
                    L_mat_train = self.L[index][:, index].todense()
                    inp = [A_train, L_mat_train]
                    batch_losses = self.model.train_on_batch(inp, inp)
                    losses += batch_losses
                losses = losses / steps_per_epoch

                logs['loss'] = losses[0]
                logs['2nd_loss'] = losses[1]
                logs['1st_loss'] = losses[2]
                epoch_time = int(time.time() - start_time)
                hist.on_epoch_end(epoch, logs)
                if verbose > 0:
                    print('Epoch {0}/{1}'.format(epoch + 1, epochs))
                    print('{0}s - loss: {1: .4f} - 2nd_loss: {2: .4f} - 1st_loss: {3: .4f}'.format(
                        epoch_time, losses[0], losses[1], losses[2]))
            return hist

    def evaluate(self, ):
        return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size)

    def get_embeddings(self):
        self._embeddings = {}
        embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size)
        look_back = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[look_back[i]] = embedding

        return self._embeddings

    def _create_A_L(self, graph, node2idx):
        node_size = graph.number_of_nodes()
        A_data = []
        A_row_index = []
        A_col_index = []

        for edge in graph.edges():
            v1, v2 = edge
            edge_weight = graph[v1][v2].get('weight', 1)

            A_data.append(edge_weight)
            A_row_index.append(node2idx[v1])
            A_col_index.append(node2idx[v2])

        A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size))
        A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)),
                           shape=(node_size, node_size))

        D = sp.diags(A_.sum(axis=1).flatten().tolist()[0])
        L = D - A_
        return A, L
import numpy as np
from ge.classify import read_node_label, Classifier
from ge import SDNE
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1],
                    label=c)  # c=node_colors
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = SDNE(G, hidden_size=[256, 128],)
    model.train(batch_size=3000, epochs=40, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Ribeiro L F R, Saverese P H P, Figueiredo D R. struc2vec: Learning node representations from structural identity[C]//Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2017: 385-394.(https://arxiv.org/pdf/1704.03165.pdf)
"""
import math
import os
import shutil
from collections import ChainMap, deque
import numpy as np
import pandas as pd
from fastdtw import fastdtw
from gensim.models import Word2Vec
from joblib import Parallel, delayed
from tqdm import tqdm
from ..alias import create_alias_table
from ..utils import partition_dict, preprocess_nxgraph
from ..walker import BiasedWalker
class Struc2Vec():
    def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0, stay_prob=0.3,
                 opt1_reduce_len=True, opt2_reduce_sim_calc=True, opt3_num_layers=None,
                 temp_path='./temp_struc2vec/', reuse=False):
        self.graph = graph
        self.idx2node, self.node2idx = preprocess_nxgraph(graph)
        self.idx = list(range(len(self.idx2node)))

        self.opt1_reduce_len = opt1_reduce_len
        self.opt2_reduce_sim_calc = opt2_reduce_sim_calc
        self.opt3_num_layers = opt3_num_layers

        self.reuse = reuse
        self.temp_path = temp_path

        if not os.path.exists(self.temp_path):
            os.mkdir(self.temp_path)
        if not reuse:
            shutil.rmtree(self.temp_path)
            os.mkdir(self.temp_path)

        self.create_context_graph(self.opt3_num_layers, workers, verbose)
        self.prepare_biased_walk()
        self.walker = BiasedWalker(self.idx2node, self.temp_path)
        self.sentences = self.walker.simulate_walks(
            num_walks, walk_length, stay_prob, workers, verbose)

        self._embeddings = {}
    def create_context_graph(self, max_num_layers, workers=1, verbose=0,):

        pair_distances = self._compute_structural_distance(
            max_num_layers, workers, verbose,)
        layers_adj, layers_distances = self._get_layer_rep(pair_distances)
        pd.to_pickle(layers_adj, self.temp_path + 'layers_adj.pkl')

        layers_accept, layers_alias = self._get_transition_probs(
            layers_adj, layers_distances)
        pd.to_pickle(layers_alias, self.temp_path + 'layers_alias.pkl')
        pd.to_pickle(layers_accept, self.temp_path + 'layers_accept.pkl')

    def prepare_biased_walk(self,):

        sum_weights = {}
        sum_edges = {}
        average_weight = {}
        gamma = {}
        layer = 0
        while (os.path.exists(self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')):
            probs = pd.read_pickle(
                self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')
            for v, list_weights in probs.items():
                sum_weights.setdefault(layer, 0)
                sum_edges.setdefault(layer, 0)
                sum_weights[layer] += sum(list_weights)
                sum_edges[layer] += len(list_weights)

            average_weight[layer] = sum_weights[layer] / sum_edges[layer]

            gamma.setdefault(layer, {})

            for v, list_weights in probs.items():
                num_neighbours = 0
                for w in list_weights:
                    if (w > average_weight[layer]):
                        num_neighbours += 1
                gamma[layer][v] = num_neighbours

            layer += 1

        pd.to_pickle(average_weight, self.temp_path + 'average_weight')
        pd.to_pickle(gamma, self.temp_path + 'gamma.pkl')
    def train(self, embed_size=128, window_size=5, workers=3, iter=5):

        # pd.read_pickle(self.temp_path + 'walks.pkl')
        sentences = self.sentences

        print("Learning representation...")
        model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1,
                         workers=workers, iter=iter)
        print("Learning representation done!")
        self.w2v_model = model

        return model

    def get_embeddings(self,):
        if self.w2v_model is None:
            print("model not trained")
            return {}

        self._embeddings = {}
        for word in self.graph.nodes():
            self._embeddings[word] = self.w2v_model.wv[word]

        return self._embeddings
    def _compute_ordered_degreelist(self, max_num_layers):

        degreeList = {}
        vertices = self.idx  # self.g.nodes()
        for v in vertices:
            degreeList[v] = self._get_order_degreelist_node(v, max_num_layers)
        return degreeList

    def _get_order_degreelist_node(self, root, max_num_layers=None):
        if max_num_layers is None:
            max_num_layers = float('inf')

        ordered_degree_sequence_dict = {}
        visited = [False] * len(self.graph.nodes())
        queue = deque()
        level = 0
        queue.append(root)
        visited[root] = True

        while (len(queue) > 0 and level <= max_num_layers):

            count = len(queue)
            if self.opt1_reduce_len:
                degree_list = {}
            else:
                degree_list = []
            while (count > 0):

                top = queue.popleft()
                node = self.idx2node[top]
                degree = len(self.graph[node])

                if self.opt1_reduce_len:
                    degree_list[degree] = degree_list.get(degree, 0) + 1
                else:
                    degree_list.append(degree)

                for nei in self.graph[node]:
                    nei_idx = self.node2idx[nei]
                    if not visited[nei_idx]:
                        visited[nei_idx] = True
                        queue.append(nei_idx)
                count -= 1

            if self.opt1_reduce_len:
                ordered_degree_list = [(degree, freq)
                                       for degree, freq in degree_list.items()]
                ordered_degree_list.sort(key=lambda x: x[0])
            else:
                ordered_degree_list = sorted(degree_list)
            ordered_degree_sequence_dict[level] = ordered_degree_list
            level += 1

        return ordered_degree_sequence_dict
    def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,):

        if os.path.exists(self.temp_path + 'structural_dist.pkl'):
            structural_dist = pd.read_pickle(
                self.temp_path + 'structural_dist.pkl')
        else:
            if self.opt1_reduce_len:
                dist_func = cost_max
            else:
                dist_func = cost

            if os.path.exists(self.temp_path + 'degreelist.pkl'):
                degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl')
            else:
                degreeList = self._compute_ordered_degreelist(max_num_layers)
                pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl')

            if self.opt2_reduce_sim_calc:
                degrees = self._create_vectors()
                degreeListsSelected = {}
                vertices = {}
                n_nodes = len(self.idx)
                for v in self.idx:  # for each vertex
                    nbs = get_vertices(
                        v, len(self.graph[self.idx2node[v]]), degrees, n_nodes)
                    vertices[v] = nbs  # store nbs
                    degreeListsSelected[v] = degreeList[v]  # store dist
                    for n in nbs:
                        # store dist of nbs
                        degreeListsSelected[n] = degreeList[n]
            else:
                vertices = {}
                for v in degreeList:
                    vertices[v] = [vd for vd in degreeList.keys() if vd > v]

            results = Parallel(n_jobs=workers, verbose=verbose,)(
                delayed(compute_dtw_dist)(part_list, degreeList, dist_func)
                for part_list in partition_dict(vertices, workers))
            dtw_dist = dict(ChainMap(*results))

            structural_dist = convert_dtw_struc_dist(dtw_dist)
            pd.to_pickle(structural_dist, self.temp_path +
                         'structural_dist.pkl')

        return structural_dist
    def _create_vectors(self):
        degrees = {}  # store, for each degree, the list of vertices with that degree
        degrees_sorted = set()  # store the distinct degrees
        G = self.graph
        for v in self.idx:
            degree = len(G[self.idx2node[v]])
            degrees_sorted.add(degree)
            if (degree not in degrees):
                degrees[degree] = {}
                degrees[degree]['vertices'] = []
            degrees[degree]['vertices'].append(v)
        degrees_sorted = np.array(list(degrees_sorted), dtype='int')
        degrees_sorted = np.sort(degrees_sorted)

        l = len(degrees_sorted)
        for index, degree in enumerate(degrees_sorted):
            if (index > 0):
                degrees[degree]['before'] = degrees_sorted[index - 1]
            if (index < (l - 1)):
                degrees[degree]['after'] = degrees_sorted[index + 1]

        return degrees
    def _get_layer_rep(self, pair_distances):
        layer_distances = {}
        layer_adj = {}
        for v_pair, layer_dist in pair_distances.items():
            for layer, distance in layer_dist.items():
                vx = v_pair[0]
                vy = v_pair[1]

                layer_distances.setdefault(layer, {})
                layer_distances[layer][vx, vy] = distance

                layer_adj.setdefault(layer, {})
                layer_adj[layer].setdefault(vx, [])
                layer_adj[layer].setdefault(vy, [])
                layer_adj[layer][vx].append(vy)
                layer_adj[layer][vy].append(vx)

        return layer_adj, layer_distances

    def _get_transition_probs(self, layers_adj, layers_distances):
        layers_alias = {}
        layers_accept = {}

        for layer in layers_adj:

            neighbors = layers_adj[layer]
            layer_distances = layers_distances[layer]
            node_alias_dict = {}
            node_accept_dict = {}
            norm_weights = {}

            for v, v_neighbors in neighbors.items():
                e_list = []
                sum_w = 0.0

                for n in v_neighbors:
                    if (v, n) in layer_distances:
                        wd = layer_distances[v, n]
                    else:
                        wd = layer_distances[n, v]
                    w = np.exp(-float(wd))
                    e_list.append(w)
                    sum_w += w

                e_list = [x / sum_w for x in e_list]
                norm_weights[v] = e_list
                accept, alias = create_alias_table(e_list)
                node_alias_dict[v] = alias
                node_accept_dict[v] = accept

            pd.to_pickle(
                norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')

            layers_alias[layer] = node_alias_dict
            layers_accept[layer] = node_accept_dict

        return layers_accept, layers_alias
def cost(a, b):
    ep = 0.5
    m = max(a, b) + ep
    mi = min(a, b) + ep
    return ((m / mi) - 1)


def cost_min(a, b):
    ep = 0.5
    m = max(a[0], b[0]) + ep
    mi = min(a[0], b[0]) + ep
    return ((m / mi) - 1) * min(a[1], b[1])


def cost_max(a, b):
    ep = 0.5
    m = max(a[0], b[0]) + ep
    mi = min(a[0], b[0]) + ep
    return ((m / mi) - 1) * max(a[1], b[1])
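# Illustrative numbers (not in the original file): these are the element costs
# fed to fastdtw when comparing degree sequences. With degrees 1 and 3:
#   cost(1, 3) = (3.5 / 1.5) - 1 ≈ 1.333
# With the opt1 compressed representation, where each element is a
# (degree, count) pair, e.g. (1, 2) vs (3, 4):
#   cost_max((1, 2), (3, 4)) ≈ 1.333 * max(2, 4) ≈ 5.333
# so frequent degrees weigh more heavily in the structural distance.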
def convert_dtw_struc_dist(distances, startLayer=1):
    """
    Make the layer-wise DTW distances cumulative, starting from `startLayer`.
    :param distances: dict of dict
    :param startLayer:
    :return:
    """
    for vertices, layers in distances.items():
        keys_layers = sorted(layers.keys())
        startLayer = min(len(keys_layers), startLayer)
        for layer in range(0, startLayer):
            keys_layers.pop(0)

        for layer in keys_layers:
            layers[layer] += layers[layer - 1]
    return distances
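# Illustrative example (not in the original file): with startLayer=1, the
# per-layer DTW distances for one vertex pair
#   {(u, v): {0: 1.0, 1: 0.5, 2: 2.0}}
# become cumulative from layer 1 onward:
#   {(u, v): {0: 1.0, 1: 1.5, 2: 3.5}}
# so deeper layers always carry at least the distance of the shallower ones.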
def get_vertices(v, degree_v, degrees, n_nodes):
    a_vertices_selected = 2 * math.log(n_nodes, 2)
    vertices = []
    try:
        c_v = 0

        for v2 in degrees[degree_v]['vertices']:
            if (v != v2):
                vertices.append(v2)  # same degree
                c_v += 1
                if (c_v > a_vertices_selected):
                    raise StopIteration

        if ('before' not in degrees[degree_v]):
            degree_b = -1
        else:
            degree_b = degrees[degree_v]['before']
        if ('after' not in degrees[degree_v]):
            degree_a = -1
        else:
            degree_a = degrees[degree_v]['after']
        if (degree_b == -1 and degree_a == -1):
            raise StopIteration  # no more degrees to explore

        degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b)
        # nearest valid degree
        while True:
            for v2 in degrees[degree_now]['vertices']:
                if (v != v2):
                    vertices.append(v2)
                    c_v += 1
                    if (c_v > a_vertices_selected):
                        raise StopIteration

            if (degree_now == degree_b):
                if ('before' not in degrees[degree_b]):
                    degree_b = -1
                else:
                    degree_b = degrees[degree_b]['before']
            else:
                if ('after' not in degrees[degree_a]):
                    degree_a = -1
                else:
                    degree_a = degrees[degree_a]['after']

            if (degree_b == -1 and degree_a == -1):
                raise StopIteration

            degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b)

    except StopIteration:
        return list(vertices)

    return list(vertices)
def verifyDegrees(degrees, degree_v_root, degree_a, degree_b):

    if (degree_b == -1):
        degree_now = degree_a
    elif (degree_a == -1):
        degree_now = degree_b
    elif (abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)):
        degree_now = degree_b
    else:
        degree_now = degree_a

    return degree_now
def compute_dtw_dist(part_list, degreeList, dist_func):
    dtw_dist = {}
    for v1, nbs in part_list:
        lists_v1 = degreeList[v1]  # lists_v1: ordered degree list of v1
        for v2 in nbs:
            lists_v2 = degreeList[v2]  # lists_v2: ordered degree list of v2
            max_layer = min(len(lists_v1), len(lists_v2))  # number of valid layers
            dtw_dist[v1, v2] = {}
            for layer in range(0, max_layer):
                dist, path = fastdtw(
                    lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func)
                dtw_dist[v1, v2][layer] = dist
    return dtw_dist
import numpy as np
from ge.classify import read_node_label,Classifier
from ge import Struc2Vec
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])

    model = Struc2Vec(G, 10, 80, workers=4, verbose=40, )
    model.train()
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
def preprocess_nxgraph(graph):
    node2idx = {}
    idx2node = []
    node_size = 0
    for node in graph.nodes():
        node2idx[node] = node_size
        idx2node.append(node)
        node_size += 1
    return idx2node, node2idx
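# Illustrative example (not in the original file): preprocess_nxgraph assigns
# each node a dense integer index in iteration order, e.g. for a graph whose
# nodes iterate as ['a', 'b', 'c'] it returns
#   (['a', 'b', 'c'], {'a': 0, 'b': 1, 'c': 2})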
def partition_dict(vertices, workers):
    batch_size = (len(vertices) - 1) // workers + 1
    part_list = []
    part = []
    count = 0
    for v1, nbs in vertices.items():
        part.append((v1, nbs))
        count += 1
        if count % batch_size == 0:
            part_list.append(part)
            part = []
    if len(part) > 0:
        part_list.append(part)
    return part_list


def partition_list(vertices, workers):
    batch_size = (len(vertices) - 1) // workers + 1
    part_list = []
    part = []
    count = 0
    for v1, nbs in enumerate(vertices):
        part.append((v1, nbs))
        count += 1
        if count % batch_size == 0:
            part_list.append(part)
            part = []
    if len(part) > 0:
        part_list.append(part)
    return part_list


def partition_num(num, workers):
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]
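# Illustrative examples (not in the original file) of how the walk count is
# split across workers:
#   partition_num(6, 3) -> [2, 2, 2]
#   partition_num(7, 3) -> [2, 2, 2, 1]   (floor division per worker, remainder in one extra chunk)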
import itertools
import math
import random
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import trange
import alias
import utils
class RandomWalker:
    def __init__(self, G, p=1, q=1, use_rejection_sampling=0):
        """
        :param G:
        :param p: Return parameter, controls the likelihood of immediately revisiting a node in the walk.
        :param q: In-out parameter, allows the search to differentiate between “inward” and “outward” nodes.
        :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec.
        """
        self.G = G
        self.p = p
        self.q = q
        self.use_rejection_sampling = use_rejection_sampling
        # print("entered here")

    def deepwalk_walk(self, walk_length, start_node):

        walk = [start_node]
        # print("walk", start_node)

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk
    def node2vec_walk(self, walk_length, start_node):

        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(
                        cur_nbrs[alias.alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    prev = walk[-2]
                    edge = (prev, cur)
                    next_node = cur_nbrs[alias.alias_sample(alias_edges[edge][0],
                                                            alias_edges[edge][1])]
                    walk.append(next_node)
            else:
                break

        return walk
    def node2vec_walk2(self, walk_length, start_node):
        """
        Reference:
        KnightKing: A Fast Distributed Graph Random Walk Engine
        http://madsys.cs.tsinghua.edu.cn/publications/SOSP19-yang.pdf
        """
        def rejection_sample(inv_p, inv_q, nbrs_num):
            upper_bound = max(1.0, max(inv_p, inv_q))
            lower_bound = min(1.0, min(inv_p, inv_q))
            shatter = 0
            second_upper_bound = max(1.0, inv_q)
            if (inv_p > second_upper_bound):
                shatter = second_upper_bound / nbrs_num
                upper_bound = second_upper_bound + shatter
            return upper_bound, lower_bound, shatter

        G = self.G
        alias_nodes = self.alias_nodes
        inv_p = 1.0 / self.p
        inv_q = 1.0 / self.q
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(
                        cur_nbrs[alias.alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    upper_bound, lower_bound, shatter = rejection_sample(
                        inv_p, inv_q, len(cur_nbrs))
                    prev = walk[-2]
                    prev_nbrs = set(G.neighbors(prev))
                    while True:
                        prob = random.random() * upper_bound
                        if (prob + shatter >= upper_bound):
                            next_node = prev
                            break
                        next_node = cur_nbrs[alias.alias_sample(
                            alias_nodes[cur][0], alias_nodes[cur][1])]
                        if (prob < lower_bound):
                            break
                        if (prob < inv_p and next_node == prev):
                            break
                        _prob = 1.0 if next_node in prev_nbrs else inv_q
                        if (prob < _prob):
                            break
                    walk.append(next_node)
            else:
                break
        return walk
    def simulate_walks(self, num_walks, walk_length, workers=16, verbose=0):

        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            utils.partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length,):
        walks = []
        for _ in range(num_walks):
            print("num_walks", num_walks, len(nodes))
            random.shuffle(nodes)
            for v in nodes:
                # NOTE: the node2vec branches below are commented out in this
                # copy, so every walk is a plain DeepWalk walk.
                # if self.p == 1 and self.q == 1:
                walks.append(self.deepwalk_walk(
                    walk_length=walk_length, start_node=v))
                # elif self.use_rejection_sampling:
                #     walks.append(self.node2vec_walk2(
                #         walk_length=walk_length, start_node=v))
                # else:
                #     walks.append(self.node2vec_walk(
                #         walk_length=walk_length, start_node=v))
        return walks
    def get_alias_edge(self, t, v):
        """
        Compute the unnormalized transition probabilities between node v and its
        neighbors, given the previously visited node t.
        :param t:
        :param v:
        :return:
        """
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for x in G.neighbors(v):
            weight = G[v][x].get('weight', 1.0)  # w_vx
            if x == t:  # d_tx == 0
                unnormalized_probs.append(weight / p)
            elif G.has_edge(x, t):  # d_tx == 1
                unnormalized_probs.append(weight)
            else:  # d_tx > 1
                unnormalized_probs.append(weight / q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs]

        return alias.create_alias_table(normalized_probs)

    def preprocess_transition_probs(self):
        """
        Preprocessing of transition probabilities for guiding the random walks.
        """
        G = self.G
        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr].get('weight', 1.0)
                                  for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [
                float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = alias.create_alias_table(normalized_probs)

        if not self.use_rejection_sampling:
            alias_edges = {}

            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                if not G.is_directed():
                    alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
            self.alias_edges = alias_edges

        self.alias_nodes = alias_nodes
        return
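# Hedged worked example (not in the original file) of get_alias_edge(): suppose
# the walk just moved t -> v, node v has neighbors {t, a, b}, all edge weights
# are 1.0, p = 0.25, q = 2.0, and only `a` is also a neighbor of t. Then the
# unnormalized weights are:
#   x == t  (d_tx == 0) -> 1.0 / p = 4.0   (return to the previous node)
#   x == a  (d_tx == 1) -> 1.0            (stay close to t)
#   x == b  (d_tx  > 1) -> 1.0 / q = 0.5  (move outward)
# After normalization these become 4/5.5, 1/5.5 and 0.5/5.5, which are fed to
# alias.create_alias_table for O(1) sampling during the walk.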
class BiasedWalker:
    def __init__(self, idx2node, temp_path):

        self.idx2node = idx2node
        self.idx = list(range(len(self.idx2node)))
        self.temp_path = temp_path

    def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):

        layers_adj = pd.read_pickle(self.temp_path + 'layers_adj.pkl')
        layers_alias = pd.read_pickle(self.temp_path + 'layers_alias.pkl')
        layers_accept = pd.read_pickle(self.temp_path + 'layers_accept.pkl')
        gamma = pd.read_pickle(self.temp_path + 'gamma.pkl')
        walks = []
        initialLayer = 0

        nodes = self.idx  # list(self.g.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj,
                                          layers_accept, layers_alias, gamma) for num in
            utils.partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))
        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                walks.append(self._exec_random_walk(layers_adj, layers_accept, layers_alias,
                                                    v, walk_length, gamma, stay_prob))
        return walks

    def _exec_random_walk(self, graphs, layers_accept, layers_alias, v, walk_length, gamma, stay_prob=0.3):
        initialLayer = 0
        layer = initialLayer

        path = []
        path.append(self.idx2node[v])

        while len(path) < walk_length:
            r = random.random()

            if (r < stay_prob):  # walk within the same layer
                v = chooseNeighbor(v, graphs, layers_alias,
                                   layers_accept, layer)
                path.append(self.idx2node[v])
            else:  # try to move to a different layer
                r = random.random()
                try:
                    x = math.log(gamma[layer][v] + math.e)
                    p_moveup = (x / (x + 1))
                except:
                    print(layer, v)
                    raise ValueError()

                if (r > p_moveup):
                    if (layer > initialLayer):
                        layer = layer - 1
                else:
                    if ((layer + 1) in graphs and v in graphs[layer + 1]):
                        layer = layer + 1

        return path


def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer):
    v_list = graphs[layer][v]

    idx = alias.alias_sample(layers_accept[layer][v], layers_alias[layer][v])
    v = v_list[idx]

    return v
from .deepwalk import DeepWalk
from .node2vec import Node2Vec
from .line import LINE
from .sdne import SDNE
from .struc2vec import Struc2Vec
__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"]
# Binary image assets added in this revision (content not shown in the diff):
#   GraphEmbedding-master/pics/code.png (28.7 KiB)
#   GraphEmbedding-master/pics/deepctrbot.png (20.8 KiB)
#   GraphEmbedding-master/pics/edge_list.png (25.6 KiB)
#   GraphEmbedding-master/pics/weichennote.png (68.6 KiB)
import setuptools

with open("README.md", "r", encoding="utf8") as fh:
    long_description = fh.read()

REQUIRED_PACKAGES = [
    # 'tensorflow>=1.4.0,<=1.12.0',
    'gensim==3.6.0',
    'networkx==2.1',
    'joblib==0.13.0',
    'fastdtw==0.3.2',
    'tqdm',
    'numpy',
    'scikit-learn',
    'pandas',
    'matplotlib',
]

setuptools.setup(
    name="ge",
    version="0.0.0",
    author="Weichen Shen",
    author_email="wcshen1994@163.com",
    url="https://github.com/shenweichen/GraphEmbedding",
    packages=setuptools.find_packages(exclude=[]),
    python_requires='>=3.4',  # 3.4.6
    install_requires=REQUIRED_PACKAGES,
    extras_require={
        "cpu": ['tensorflow>=1.4.0,!=1.7.*,!=1.8.*'],
        "gpu": ['tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*'],
    },
    entry_points={
    },
    license="MIT license",
)