Showing with 1641 additions and 0 deletions
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Tang J, Qu M, Wang M, et al. Line: Large-scale information network embedding[C]//Proceedings of the 24th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2015: 1067-1077.(https://arxiv.org/pdf/1503.03578.pdf)
"""
import math
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Embedding, Input, Lambda
from tensorflow.python.keras.models import Model
import alias
import utils
import time
def line_loss(y_true, y_pred):
    return -K.mean(K.log(K.sigmoid(y_true * y_pred)))
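# A short note on the objective (following the LINE paper cited above): y_true
# carries the sample sign (+1 for an observed edge, -1 for a negative sample
# drawn from the noise distribution) and y_pred is the dot product of the two
# embedding vectors, so the loss is -E[log sigmoid(sign * u_i . u_j)], i.e. the
# standard negative-sampling objective. For example, a positive pair whose dot
# product is 2.0 contributes -log(sigmoid(2.0)) ~= 0.127 to the loss.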
def create_model(numNodes, embedding_size, order='second'):

    v_i = Input(shape=(1,))
    v_j = Input(shape=(1,))

    first_emb = Embedding(numNodes, embedding_size, name='first_emb')
    second_emb = Embedding(numNodes, embedding_size, name='second_emb')
    context_emb = Embedding(numNodes, embedding_size, name='context_emb')

    v_i_emb = first_emb(v_i)
    v_j_emb = first_emb(v_j)

    v_i_emb_second = second_emb(v_i)
    v_j_context_emb = context_emb(v_j)

    first = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keepdims=False), name='first_order')([v_i_emb, v_j_emb])
    second = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keepdims=False), name='second_order')([v_i_emb_second, v_j_context_emb])

    if order == 'first':
        output_list = [first]
    elif order == 'second':
        output_list = [second]
    else:
        output_list = [first, second]

    model = Model(inputs=[v_i, v_j], outputs=output_list)

    return model, {'first': first_emb, 'second': second_emb}
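# create_model outputs raw dot-product scores rather than probabilities:
# 'first_order' pairs both endpoints through the shared first_emb table
# (first-order proximity), while 'second_order' pairs a node's second_emb
# vector with the other endpoint's context_emb vector, mirroring the
# vertex/context split used for second-order proximity in LINE. line_loss
# above then turns these scores into the sampled logistic objective.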
class LINE:
    def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',):
        """
        :param graph:
        :param embedding_size:
        :param negative_ratio:
        :param order: 'first','second','all'
        """
        if order not in ['first', 'second', 'all']:
            raise ValueError("order must be 'first', 'second' or 'all'")

        self.graph = graph
        self.idx2node, self.node2idx = utils.preprocess_nxgraph(graph)
        self.use_alias = True

        self.rep_size = embedding_size
        self.order = order
        self._embeddings = {}
        self.negative_ratio = negative_ratio

        self.node_size = graph.number_of_nodes()
        self.edge_size = graph.number_of_edges()
        self.samples_per_epoch = self.edge_size * (1 + negative_ratio)

        self._gen_sampling_table()
        self.reset_model()

    def reset_training_config(self, batch_size, times):
        self.batch_size = batch_size
        self.steps_per_epoch = (
            (self.samples_per_epoch - 1) // self.batch_size + 1) * times

    def reset_model(self, opt='adam'):
        self.model, self.embedding_dict = create_model(
            self.node_size, self.rep_size, self.order)
        self.model.compile(opt, line_loss)
        self.batch_it = self.batch_iter(self.node2idx)
    def _gen_sampling_table(self):

        # create sampling table for vertex
        power = 0.75
        numNodes = self.node_size
        node_degree = np.zeros(numNodes)  # out degree
        node2idx = self.node2idx

        for edge in self.graph.edges():
            node_degree[node2idx[edge[0]]
                        ] += self.graph[edge[0]][edge[1]].get('weight', 1.0)

        total_sum = sum([math.pow(node_degree[i], power)
                         for i in range(numNodes)])
        norm_prob = [float(math.pow(node_degree[j], power)) /
                     total_sum for j in range(numNodes)]

        self.node_accept, self.node_alias = alias.create_alias_table(norm_prob)

        # create sampling table for edge
        numEdges = self.graph.number_of_edges()
        total_sum = sum([self.graph[edge[0]][edge[1]].get('weight', 1.0)
                         for edge in self.graph.edges()])
        norm_prob = [self.graph[edge[0]][edge[1]].get('weight', 1.0) *
                     numEdges / total_sum for edge in self.graph.edges()]

        self.edge_accept, self.edge_alias = alias.create_alias_table(norm_prob)
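    # Both tables above are alias tables, so each draw is O(1): the vertex
    # table (node_accept/node_alias) follows the noise distribution
    # proportional to degree**0.75 and is used to sample negative targets,
    # while the edge table (edge_accept/edge_alias) is proportional to edge
    # weight and is used to sample positive edges in batch_iter below.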
    def batch_iter(self, node2idx):

        edges = [(node2idx[x[0]], node2idx[x[1]]) for x in self.graph.edges()]

        data_size = self.graph.number_of_edges()
        shuffle_indices = np.random.permutation(np.arange(data_size))

        # positive or negative mod
        mod = 0
        mod_size = 1 + self.negative_ratio
        h = []
        t = []
        sign = 0
        count = 0
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)

        while True:
            if mod == 0:
                h = []
                t = []
                for i in range(start_index, end_index):
                    if random.random() >= self.edge_accept[shuffle_indices[i]]:
                        shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
                    cur_h = edges[shuffle_indices[i]][0]
                    cur_t = edges[shuffle_indices[i]][1]
                    h.append(cur_h)
                    t.append(cur_t)
                sign = np.ones(len(h))
            else:
                sign = np.ones(len(h)) * -1
                t = []
                for i in range(len(h)):
                    t.append(alias.alias_sample(
                        self.node_accept, self.node_alias))

            if self.order == 'all':
                yield ([np.array(h), np.array(t)], [sign, sign])
            else:
                yield ([np.array(h), np.array(t)], [sign])

            mod += 1
            mod %= mod_size
            if mod == 0:
                start_index = end_index
                end_index = min(start_index + self.batch_size, data_size)

            if start_index >= data_size:
                count += 1
                mod = 0
                h = []
                shuffle_indices = np.random.permutation(np.arange(data_size))
                start_index = 0
                end_index = min(start_index + self.batch_size, data_size)
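    # batch_iter cycles through mod_size = 1 + negative_ratio phases per slice
    # of edges: phase 0 yields the positive edges of the slice with sign +1,
    # and each following phase keeps the same heads but redraws the tails from
    # the node alias table with sign -1. With the default negative_ratio=5,
    # every positive batch is therefore followed by five negative batches,
    # which is where samples_per_epoch = edge_size * (1 + negative_ratio)
    # comes from.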
    def get_embeddings(self,):
        self._embeddings = {}
        # Fixed-size output array: assumes integer node labels below 100 and an
        # effective embedding width of 64 (embedding_size=64 with order='second').
        result = np.zeros((100, 64))
        if self.order == 'first':
            embeddings = self.embedding_dict['first'].get_weights()[0]
        elif self.order == 'second':
            embeddings = self.embedding_dict['second'].get_weights()[0]
        else:
            embeddings = np.hstack((self.embedding_dict['first'].get_weights()[0],
                                    self.embedding_dict['second'].get_weights()[0]))
        idx2node = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[idx2node[i]] = embedding
            result[int(idx2node[i])] = embedding

        return self._embeddings, result
    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
        self.reset_training_config(batch_size, times)
        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch,
                                        steps_per_epoch=self.steps_per_epoch, verbose=verbose)
        return hist
import numpy as np
import classify
from sklearn.linear_model import LogisticRegression
import line
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = classify.read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = classify.Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = classify.read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()
def read(arr):
    G = nx.Graph()
    for a, b in arr:
        if not G.has_node(a):
            G.add_node(a)
        if not G.has_node(b):
            G.add_node(b)
        G.add_edge(a, b, weight=1)
    return G
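# A minimal sketch of the input read() expects (hypothetical example): an
# iterable of (source, target) pairs, e.g. read([(0, 1), (1, 2)]) builds an
# undirected graph with unit edge weights, which matches the 'weight' default
# used by LINE's sampling tables.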
def read_all():
    data = np.load('graph/data_val.npy', allow_pickle=True)
    id = 0
    for x in data:
        G = read(x)
        model = line.LINE(G, embedding_size=64, order='second')
        model.train(batch_size=120, epochs=100, verbose=2)
        embeddings, result = model.get_embeddings()
        result = np.asarray(result)
        name = str('graph/data_val.npy')
        name = name[:name.index('.')]
        # note: the backslash separator assumes a Windows-style output path
        np.save(name + "\\transformed_" + str(id), result)
        print(id, "DONE")
        id += 1
    return model
if __name__ == "__main__":
    '''G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                            create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    model = line.LINE(G, embedding_size=64, order='second')
    model.train(batch_size=1024, epochs=50, verbose=2)
    print("here", model)
    embeddings, result = model.get_embeddings()
    # print(embeddings)
    result = np.asarray(embeddings)
    # print(result)
    # evaluate_embeddings(embeddings)
    # plot_embeddings(embeddings)'''
    model = read_all()
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)
"""
from gensim.models import Word2Vec
import pandas as pd
from ..walker import RandomWalker
class Node2Vec:

    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):

        self.graph = graph
        self._embeddings = {}
        self.w2v_model = None
        self.walker = RandomWalker(
            graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling)

        print("Preprocess transition probs...")
        self.walker.preprocess_transition_probs()

        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1
        kwargs["hs"] = 0  # node2vec does not use hierarchical softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter

        print("Learning embedding vectors...")
        model = Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model
        return model
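    # Note: the keyword names used in train() ("size", "iter") follow the
    # gensim 3.x Word2Vec API; gensim 4.x renamed them to "vector_size" and
    # "epochs", so the call above would need adjusting on newer gensim
    # releases.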
    def get_embeddings(self,):
        if self.w2v_model is None:
            print("model not trained")
            return {}

        self._embeddings = {}
        for word in self.graph.nodes():
            self._embeddings[word] = self.w2v_model.wv[word]

        return self._embeddings
import numpy as np
from ge.classify import read_node_label,Classifier
from ge import Node2Vec
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])

    model = Node2Vec(G, 10, 80, workers=1, p=0.25, q=2, use_rejection_sampling=0)
    model.train()
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
import numpy as np
from ge.classify import read_node_label, Classifier
from ge import Node2Vec
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = Node2Vec(G, walk_length=10, num_walks=80,
                     p=0.25, q=4, workers=1, use_rejection_sampling=0)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Wang D, Cui P, Zhu W. Structural deep network embedding[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 1225-1234.(https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf)
"""
import time
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import History
from tensorflow.python.keras.layers import Dense, Input
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.regularizers import l1_l2
from ..utils import preprocess_nxgraph
def l_2nd(beta):
    def loss_2nd(y_true, y_pred):
        # weight matrix B for the second-order reconstruction loss: non-zero
        # entries of the adjacency row are penalised by beta, all others by 1
        # (built with tensor ops so it works on symbolic Keras tensors)
        b_ = tf.ones_like(y_true)
        b_ = tf.where(tf.not_equal(y_true, 0), b_ * beta, b_)
        x = K.square((y_true - y_pred) * b_)
        t = K.sum(x, axis=-1, )
        return K.mean(t)
    return loss_2nd
def l_1st(alpha):
    def loss_1st(y_true, y_pred):
        L = y_true
        Y = y_pred
        batch_size = tf.to_float(K.shape(L)[0])
        return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size
    return loss_1st
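# loss_1st receives the batch Laplacian block as y_true and the embeddings Y
# as y_pred, and computes alpha * 2 * tr(Y^T L Y) / batch_size. Since
# 2 * tr(Y^T L Y) = sum_{i,j} w_ij * ||y_i - y_j||^2 for L = D - W, this is
# SDNE's first-order proximity penalty: connected nodes are pulled towards
# similar embeddings.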
def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4):
    A = Input(shape=(node_size,))
    L = Input(shape=(None,))
    fc = A
    for i in range(len(hidden_size)):
        if i == len(hidden_size) - 1:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2), name='1st')(fc)
        else:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2))(fc)
    Y = fc
    for i in reversed(range(len(hidden_size) - 1)):
        fc = Dense(hidden_size[i], activation='relu',
                   kernel_regularizer=l1_l2(l1, l2))(fc)

    A_ = Dense(node_size, 'relu', name='2nd')(fc)
    model = Model(inputs=[A, L], outputs=[A_, Y])
    emb = Model(inputs=A, outputs=Y)
    return model, emb
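# The model is an autoencoder over adjacency rows: the encoder stack ends at
# the layer named '1st', whose activations Y are the node embeddings (also
# exposed on their own through `emb`), and the decoder reconstructs the row as
# A_ in the layer named '2nd'. The extra input L is consumed only by loss_1st,
# which needs the Laplacian block for the current batch.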
class SDNE(object):
    def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4, ):

        self.graph = graph
        # self.g.remove_edges_from(self.g.selfloop_edges())
        self.idx2node, self.node2idx = preprocess_nxgraph(self.graph)

        self.node_size = self.graph.number_of_nodes()
        self.hidden_size = hidden_size
        self.alpha = alpha
        self.beta = beta
        self.nu1 = nu1
        self.nu2 = nu2

        self.A, self.L = self._create_A_L(
            self.graph, self.node2idx)  # adjacency matrix A and Laplacian L
        self.reset_model()
        self.inputs = [self.A, self.L]
        self._embeddings = {}

    def reset_model(self, opt='adam'):
        self.model, self.emb_model = create_model(self.node_size, hidden_size=self.hidden_size, l1=self.nu1,
                                                  l2=self.nu2)
        self.model.compile(opt, [l_2nd(self.beta), l_1st(self.alpha)])
        self.get_embeddings()
    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1):
        if batch_size >= self.node_size:
            if batch_size > self.node_size:
                print('batch_size({0}) > node_size({1}), set batch_size = {1}'.format(
                    batch_size, self.node_size))
                batch_size = self.node_size
            return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()],
                                  batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch, verbose=verbose,
                                  shuffle=False, )
        else:
            steps_per_epoch = (self.node_size - 1) // batch_size + 1
            hist = History()
            hist.on_train_begin()
            logs = {}
            for epoch in range(initial_epoch, epochs):
                start_time = time.time()
                losses = np.zeros(3)
                for i in range(steps_per_epoch):
                    index = np.arange(
                        i * batch_size, min((i + 1) * batch_size, self.node_size))
                    A_train = self.A[index, :].todense()
                    L_mat_train = self.L[index][:, index].todense()
                    inp = [A_train, L_mat_train]
                    batch_losses = self.model.train_on_batch(inp, inp)
                    losses += batch_losses
                losses = losses / steps_per_epoch

                logs['loss'] = losses[0]
                logs['2nd_loss'] = losses[1]
                logs['1st_loss'] = losses[2]
                epoch_time = int(time.time() - start_time)
                hist.on_epoch_end(epoch, logs)
                if verbose > 0:
                    print('Epoch {0}/{1}'.format(epoch + 1, epochs))
                    print('{0}s - loss: {1: .4f} - 2nd_loss: {2: .4f} - 1st_loss: {3: .4f}'.format(
                        epoch_time, losses[0], losses[1], losses[2]))
            return hist
    def evaluate(self, ):
        return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size)

    def get_embeddings(self):
        self._embeddings = {}
        embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size)
        look_back = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[look_back[i]] = embedding

        return self._embeddings

    def _create_A_L(self, graph, node2idx):
        node_size = graph.number_of_nodes()
        A_data = []
        A_row_index = []
        A_col_index = []

        for edge in graph.edges():
            v1, v2 = edge
            edge_weight = graph[v1][v2].get('weight', 1)

            A_data.append(edge_weight)
            A_row_index.append(node2idx[v1])
            A_col_index.append(node2idx[v2])

        A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size))
        A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)),
                           shape=(node_size, node_size))

        D = sp.diags(A_.sum(axis=1).flatten().tolist()[0])
        L = D - A_
        return A, L
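# _create_A_L returns the weighted adjacency A used for reconstruction, plus
# L = D - A_sym, the unnormalised Laplacian of the symmetrised adjacency,
# which loss_1st consumes. Both are kept sparse here and only densified per
# batch inside train().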
import numpy as np
from ge.classify import read_node_label, Classifier
from ge import SDNE
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1],
                    label=c)  # c=node_colors
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = SDNE(G, hidden_size=[256, 128],)
    model.train(batch_size=3000, epochs=40, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
from .deepwalk import DeepWalk
from .node2vec import Node2Vec
from .line import LINE
from .sdne import SDNE
from .struc2vec import Struc2Vec
__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"]
GraphEmbedding-master/pics/code.png (28.7 KiB)
GraphEmbedding-master/pics/deepctrbot.png (20.8 KiB)
GraphEmbedding-master/pics/edge_list.png (25.6 KiB)
GraphEmbedding-master/pics/weichennote.png (68.6 KiB)