Skip to content
Snippets Groups Projects
Commit 9be413ae authored by Abd Errahmane Kiouche's avatar Abd Errahmane Kiouche :speech_balloon:
Browse files

Update src/Training_Phase/Clustering/create_seed_clusters.py,...

Update src/Training_Phase/Clustering/create_seed_clusters.py, src/Training_Phase/Clustering/constants.py, src/Training_Phase/Clustering/medoids.py files
parent 99742c94
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# Seed handed to the random number generators so clustering runs are repeatable.
SEED = 30
# Number of k-medoids trials run for each candidate number of clusters.
NUM_TRIALS = 30
# Multiplier applied to the standard deviation when deriving distance thresholds.
NUM_DEVS = 3.
#!/usr/bin/env python
"""
Source: https://github.com/sbustreamspot/sbustreamspot-train
"""
import argparse
from constants import *
import numpy as np
import random
from medoids import _k_medoids_spawn_once, k_medoids
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score
# finding best number of clusters
# http://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters/15376462#15376462
# Seed both the stdlib and NumPy generators so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)

parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True, help='Training graph vectors')
args = vars(parser.parse_args())
input_file = args['input']

# Every input line holds tab-separated floats: the graph ID, then its vector.
G = []  # graph IDs, in file order
X = []  # note: row i = graph ID i
with open(input_file, 'r') as f:
    for row in f:
        graph_id, *graph_vector = map(float, row.strip().split('\t'))
        G.append(graph_id)
        X.append(graph_vector)
X = np.array(X)

# Square matrix of pairwise Euclidean distances between the graph vectors.
dists = squareform(pdist(X, metric='euclidean'))
def distance(a, b):
    """Return the precomputed distance between the points indexed ``a`` and ``b``."""
    row = dists[a]
    return row[b]
# Model selection: run NUM_TRIALS k-medoids trials for every candidate number
# of clusters and keep the clustering with the highest mean silhouette score.
best_n_clusters = -1
best_silhouette_avg = -1
best_cluster_labels = None
best_cluster_centers = None

n_points = X.shape[0]
# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
# k-medoids rejects k > n_points, so cap the candidates on small inputs.
for n_clusters in range(3, min(20, n_points)):
    for trial in range(NUM_TRIALS):
        # run many trials for a given number of clusters
        _, medoids = _k_medoids_spawn_once(points=list(range(n_points)),
                                           k=n_clusters,
                                           distance=distance,
                                           max_iterations=10000,
                                           verbose=False)

        # k-medoids drops medoids that end up empty, so the number of
        # clusters actually produced can be smaller than the number requested.
        n_found = len(medoids)
        if n_found < 2:
            # No silhouette score exists for a single cluster; skip the trial
            # rather than compare against a score left over from an earlier one.
            continue

        cluster_labels = np.full(n_points, -1)
        for medoid_idx, medoid in enumerate(medoids):
            cluster_labels[medoid.elements] = medoid_idx

        silhouette_avg = silhouette_score(X, cluster_labels, metric='euclidean')

        if silhouette_avg > best_silhouette_avg or \
                (silhouette_avg == best_silhouette_avg and
                 n_found > best_n_clusters):  # favour more clusters
            best_silhouette_avg = silhouette_avg
            # Record the real cluster count so later indexing into
            # best_cluster_centers stays in range.
            best_n_clusters = n_found
            best_cluster_labels = cluster_labels
            best_cluster_centers = medoids

if best_cluster_centers is None:
    raise ValueError('Could not find a clustering: need more than 3 distinct '
                     'training graphs, got {0} input rows'.format(n_points))
# Derive a distance threshold per cluster and one global threshold, each as
# mean + NUM_DEVS standard deviations of the member-to-medoid distances.
all_cluster_dists = []
cluster_threshold = [-1] * best_n_clusters
for cluster_idx in range(best_n_clusters):
    # NOTE(review): this prints the cluster count once per cluster, ahead of
    # the summary line; it looks like leftover debug output -- confirm that no
    # consumer relies on it before removing.
    print(best_n_clusters)
    cluster_center = best_cluster_centers[cluster_idx].kernel
    cluster_graphs = best_cluster_centers[cluster_idx].elements
    # Distances from the medoid to every other member of its cluster.
    cluster_dists = [dists[cluster_center][graph] for graph in cluster_graphs
                     if graph != cluster_center]
    if cluster_dists:
        all_cluster_dists.extend(cluster_dists)
        mean_dist = np.mean(cluster_dists)
        std_dist = np.std(cluster_dists)
    else:  # singleton clusters, shouldnt happen
        # Checked before calling np.mean/np.std, which return NaN and emit a
        # RuntimeWarning on an empty list.
        mean_dist = 0.0
        std_dist = 0.0
        all_cluster_dists.append(0.0)
    cluster_threshold[cluster_idx] = mean_dist + NUM_DEVS * std_dist

mean_all_cluster_dists = np.mean(all_cluster_dists)
# The spread must be the standard deviation, not a second copy of the mean.
std_all_cluster_dists = np.std(all_cluster_dists)
all_cluster_threshold = mean_all_cluster_dists + NUM_DEVS * std_all_cluster_dists

# Summary line: number of clusters, number of graphs, global threshold.
print((str(best_n_clusters) + '\t' + str(X.shape[0]) + '\t'), end=' ')
print("{:3.4f}".format(all_cluster_threshold))
# One line per cluster: its threshold followed by the IDs of its graphs.
for cluster_idx in range(best_n_clusters):
    cluster_graphs = best_cluster_centers[cluster_idx].elements
    threshold = cluster_threshold[cluster_idx]
    print("{:3.4f}".format(threshold) + '\t', '\t'.join([str(G[graph]) for graph in cluster_graphs]))
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Source: https://github.com/alexprengere/medoids
"""
import random
from operator import itemgetter

from constants import *

# Python 2 compatibility: alias range to xrange when it exists; on Python 3
# the lookup raises NameError and the builtin range is kept.
try:
    range = xrange
except NameError:
    pass

# Default cap on the number of refinement iterations.
_MAX_ITER = 1000

random.seed(SEED)
class Medoid(object):
    """A cluster made of a representative point (kernel) and its member points."""

    __slots__ = ['kernel', 'elements']

    def __init__(self, kernel, elements=None):
        self.kernel = kernel
        # A fresh list per instance when the caller supplies none.
        if elements is None:
            elements = []
        self.elements = elements

    def __repr__(self):
        return 'Medoid({0}, {1})'.format(self.kernel, self.elements)

    def __iter__(self):
        return iter(self.elements)

    def compute_kernel(self, distance):
        """Return the element whose summed distance to all elements is smallest."""
        def total_distance(candidate):
            return sum(distance(candidate, other) for other in self.elements)

        return min(self.elements, key=total_distance)

    def compute_diameter(self, distance):
        """Return the largest distance between any two elements."""
        members = self.elements
        return max(distance(a, b) for a in members for b in members)
def _k_medoids_spawn_once(points, k, distance, max_iterations=_MAX_ITER, verbose=True):
    """K-medoids algorithm with one spawn of medoid kernels.

    :param points: the list of points
    :param k: the number of clusters
    :param distance: the distance function, distance(p, q) = ||q - p||
    :param max_iterations: the maximum number of iterations (at least 1)
    :param verbose: verbosity
    :returns: a tuple ``(diameter, medoids)``: the largest cluster diameter
        and the list of non-empty Medoid objects (may hold fewer than ``k``
        entries, because medoids left without points are dropped)
    :raises ValueError: if ``k`` is not within ``1..len(points)`` or
        ``max_iterations`` is smaller than 1

    >>> points = [1, 2, 3, 4, 5, 6, 7]
    >>> def distance(a, b):
    ...     return abs(b - a)
    >>> diameter, medoids = _k_medoids_spawn_once(points, k=2, distance=distance) #doctest: +SKIP
    * New chosen kernels: [6, 3]
    * Iteration over after 3 steps, max diameter 3
    """
    if k <= 0:
        raise ValueError('Number of medoids must be strictly positive')
    if k > len(points):
        raise ValueError('Number of medoids exceeds number of points')
    if max_iterations < 1:
        # Without at least one assignment pass every medoid stays empty and
        # the diameter computed after the loop is undefined.
        raise ValueError('Number of iterations must be strictly positive')

    # Medoids initialization
    medoids = [Medoid(kernel=p) for p in random.sample(points, k)]
    if verbose:
        print('* New chosen kernels: {0}'.format([m.kernel for m in medoids]))

    for n in range(1, 1 + max_iterations):
        # Resetting medoids
        for m in medoids:
            m.elements = []

        # Putting points in closest medoids (ties go to the earliest medoid)
        for p in points:
            closest_medoid = min(medoids, key=lambda m: distance(m.kernel, p))
            closest_medoid.elements.append(p)

        # Removing empty medoids
        medoids = [m for m in medoids if m.elements]

        # Electing new kernels for each medoids
        change = False
        for m in medoids:
            new_kernel = m.compute_kernel(distance)
            if new_kernel != m.kernel:
                m.kernel = new_kernel
                change = True

        # Converged: no kernel moved during this pass.
        if not change:
            break

    diameter = max(m.compute_diameter(distance) for m in medoids)
    if verbose:
        print('* Iteration over after {0} steps, max diameter {1}'.format(n, diameter))
    return diameter, medoids
def k_medoids(points, k, distance, spawn, max_iterations=_MAX_ITER, verbose=True):
    """
    Same as _k_medoids_spawn_once, but we iterate also the spawning process.
    We keep the minimum of the biggest diameter as a reference for the best spawn.

    :param points: the list of points
    :param k: the number of clusters
    :param distance: the distance function, distance(p, q) = ||q - p||
    :param spawn: the number of spawns (at least 1)
    :param max_iterations: the maximum number of iterations
    :param verbose: boolean, verbosity status
    :returns: a tuple ``(diameter, medoids)`` taken from the spawn whose
        largest cluster diameter is the smallest
    :raises ValueError: if ``spawn`` is smaller than 1
    """
    if spawn < 1:
        # min() over zero spawns would fail with an unhelpful message.
        raise ValueError('Number of spawns must be strictly positive')

    kw = {
        'points': points,
        'k': k,
        'distance': distance,
        'max_iterations': max_iterations,
        'verbose': verbose,
    }
    # Each call to _k_medoids_spawn_once returns a tuple whose first element
    # is the diameter of the biggest medoid, so taking the min on that element
    # returns the best medoids arrangement, in the sense that the diameter max
    # will be minimum (the earliest spawn wins ties).
    diameter, medoids = min((_k_medoids_spawn_once(**kw) for _ in range(spawn)), key=itemgetter(0))
    if verbose:
        print(('~~ Spawn end: min of max diameters {0:.3f} '
               'for medoids: {1}').format(diameter, medoids))
    return diameter, medoids
def k_medoids_auto_k(points, distance, spawn, diam_max, max_iterations=_MAX_ITER, verbose=True):
    """
    Run k_medoids with 1, 2, ... clusters and stop at the first cluster count
    whose largest cluster diameter does not exceed ``diam_max``.

    :param points: the list of points
    :param distance: the distance function, distance(p, q) = ||q - p||
    :param spawn: the number of spawns
    :param diam_max: the maximum diameter allowed; while it is exceeded the
        algorithm starts over with one more cluster
    :param max_iterations: the maximum number of iterations
    :param verbose: verbosity
    :returns: the ``(diameter, medoids)`` tuple of the last k_medoids run
    :raises ValueError: if ``points`` is empty
    """
    if not points:
        raise ValueError('No points given!')

    n_points = len(points)
    for k in range(1, n_points + 1):
        diameter, medoids = k_medoids(points, k,
                                      distance=distance,
                                      spawn=spawn,
                                      max_iterations=max_iterations,
                                      verbose=verbose)
        if diameter <= diam_max:
            break
        if verbose:
            print('*** Diameter too big {0:.3f} > {1:.3f}'.format(diameter, diam_max))
            print('*** Now trying {0} clusters\n'.format(k + 1))

    if verbose:
        print('*** Diameter ok {0:.3f} <= {1:.3f}'.format(diameter, diam_max))
        print('*** Stopping, {0} clusters enough ({1} points initially)'.format(k, len(points)))
    return diameter, medoids
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment