diff --git a/src/Training_Phase/Clustering/constants.py b/src/Training_Phase/Clustering/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..efe7dd36a07184ea8b6b7809a0c2c07574e595fb --- /dev/null +++ b/src/Training_Phase/Clustering/constants.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +SEED = 30 +NUM_TRIALS = 30 +NUM_DEVS = 3. diff --git a/src/Training_Phase/Clustering/create_seed_clusters.py b/src/Training_Phase/Clustering/create_seed_clusters.py new file mode 100644 index 0000000000000000000000000000000000000000..2605d6be87c61aa5516d93bad4322b042324ffb2 --- /dev/null +++ b/src/Training_Phase/Clustering/create_seed_clusters.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +""" +Source: https://github.com/sbustreamspot/sbustreamspot-train +""" + + +import argparse +from constants import * +import numpy as np +import random +from medoids import _k_medoids_spawn_once, k_medoids +from scipy.spatial.distance import pdist, squareform +from sklearn.metrics import silhouette_score + +# finding best number of clusters +# http://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters/15376462#15376462 + +random.seed(SEED) +np.random.seed(SEED) + +parser = argparse.ArgumentParser() +parser.add_argument('--input', help='Training graph vectors', + required=True) +args = vars(parser.parse_args()) + +input_file = args['input'] +with open(input_file, 'r') as f: + + X = [] # note: row i = graph ID i + G = [] + for line in f: + fields = list(map(float, line.strip().split('\t'))) + graph_id = fields[0] + graph_vector = fields[1:] + G.append(graph_id) + X.append(graph_vector) + + X = np.array(X) + dists = squareform(pdist(X, metric='euclidean')) + def distance(a, b): + return dists[a][b] + + best_n_clusters = -1 + best_silhouette_avg = -1 + best_cluster_labels = None + best_cluster_centers = None + for n_clusters in range(3,20): + for trial in range(NUM_TRIALS): + # run many trials for a given number of clusters + _, medoids = _k_medoids_spawn_once(points=list(range(X.shape[0])), + k=n_clusters, + distance=distance, + max_iterations=10000, + verbose=False) + cluster_labels = [-1] * X.shape[0] + size = 0 + for medoid_idx, medoid in enumerate(medoids): + size +=1 + graphs = medoid.elements + for graph in graphs: + cluster_labels[graph] = medoid_idx + cluster_labels = np.array(cluster_labels) + if (size >1): + silhouette_avg = silhouette_score(X, cluster_labels, metric='euclidean') + #print(str(n_clusters)+'\t'+str(silhouette_avg)+'\n') + #print n_clusters, trial, 'silhouette score =', silhouette_avg + if silhouette_avg > best_silhouette_avg or\ + (silhouette_avg == best_silhouette_avg and\ + n_clusters > best_n_clusters): # favour more clusters + best_silhouette_avg = silhouette_avg + best_n_clusters = n_clusters + best_cluster_labels = cluster_labels + best_cluster_centers = medoids + + all_cluster_dists = [] + cluster_threshold = [-1] * best_n_clusters + for cluster_idx in range(best_n_clusters): + print(best_n_clusters) + cluster_center = best_cluster_centers[cluster_idx].kernel + cluster_graphs = best_cluster_centers[cluster_idx].elements + + cluster_dists = [dists[cluster_center][graph] for graph in cluster_graphs + if graph != cluster_center] + all_cluster_dists.extend(cluster_dists) + mean_dist = np.mean(cluster_dists) + std_dist = np.std(cluster_dists) + + if len(cluster_dists) == 0: # singleton clusters, shouldnt happen + mean_dist = 0.0 + std_dist = 0.0 + all_cluster_dists.append(0.0) + + cluster_threshold[cluster_idx] = mean_dist + NUM_DEVS * std_dist # P(>) <= 10% + mean_all_cluster_dists = np.mean(all_cluster_dists) + std_all_cluster_dists = np.mean(all_cluster_dists) + all_cluster_threshold = mean_all_cluster_dists + NUM_DEVS * std_all_cluster_dists + + print((str(best_n_clusters) + '\t' + str(X.shape[0]) + '\t'), end=' ') + print("{:3.4f}".format(all_cluster_threshold)) + + for cluster_idx in range(best_n_clusters): + cluster_graphs = best_cluster_centers[cluster_idx].elements + threshold = cluster_threshold[cluster_idx] + print("{:3.4f}".format(threshold) + '\t','\t'.join([str(G[graph]) for graph in cluster_graphs])) + #print('\t'.join([str(graph) for graph in cluster_graphs])) diff --git a/src/Training_Phase/Clustering/medoids.py b/src/Training_Phase/Clustering/medoids.py new file mode 100644 index 0000000000000000000000000000000000000000..06441f7cdb06c2148711bf7a09376a3dd1e53307 --- /dev/null +++ b/src/Training_Phase/Clustering/medoids.py @@ -0,0 +1,169 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Source: https://github.com/alexprengere/medoids +""" + + +try: + range = xrange +except NameError: + pass + +import random +from operator import itemgetter +_MAX_ITER = int(1e3) + +from constants import * +random.seed(SEED) + +class Medoid(object): + __slots__ = ['kernel', 'elements'] + + def __init__(self, kernel, elements=None): + self.kernel = kernel + self.elements = [] if elements is None else elements + + def __repr__(self): + return 'Medoid({0}, {1})'.format(self.kernel, self.elements) + + def __iter__(self): + return iter(self.elements) + + def compute_kernel(self, distance): + return min(self, key=lambda e: sum(distance(e, other) for other in self)) + + def compute_diameter(self, distance): + return max(distance(a, b) for a in self for b in self) + + +def _k_medoids_spawn_once(points, k, distance, max_iterations=_MAX_ITER, verbose=True): + """K-medoids algorithm with one spawn of medoid kernels. + + :param points: the list of points + :param k: the number of clusters + :param distance: the distance function, distance(p, q) = ||q - p|| + :param max_iterations: the maximum number of iterations + :param verbose: verbosity + :returns: the partition, structured as \ + a list of [kernel of the cluster, [elements in the cluster]] + + >>> points = [1, 2, 3, 4, 5, 6, 7] + >>> def distance(a, b): + ... return abs(b - a) + >>> diameter, medoids = _k_medoids_spawn_once(points, k=2, distance=distance) #doctest: +SKIP + * New chosen kernels: [6, 3] + * Iteration over after 3 steps, max diameter 3 + """ + if k <= 0: + raise ValueError('Number of medoids must be strictly positive') + if k > len(points): + raise ValueError('Number of medoids exceeds number of points') + + # Medoids initialization + medoids = [Medoid(kernel=p) for p in random.sample(points, k)] + if verbose: + print('* New chosen kernels: {0}'.format([m.kernel for m in medoids])) + + for n in range(1, 1 + max_iterations): + # Resetting medoids + for m in medoids: + m.elements = [] + + # Putting points in closest medoids + for p in points: + closest_medoid = min(medoids, key=lambda m: distance(m.kernel, p)) + closest_medoid.elements.append(p) + + # Removing empty medoids + medoids = [m for m in medoids if m.elements] + # Electing new kernels for each medoids + change = False + for m in medoids: + new_kernel = m.compute_kernel(distance) + if new_kernel != m.kernel: + m.kernel = new_kernel + change = True + + if not change: + break + + diameter = max(m.compute_diameter(distance) for m in medoids) + if verbose: + print('* Iteration over after {0} steps, max diameter {1}'.format(n, diameter)) + + return diameter, medoids + + +def k_medoids(points, k, distance, spawn, max_iterations=_MAX_ITER, verbose=True): + """ + Same as _k_medoids_spawn_once, but we iterate also the spawning process. + We keep the minimum of the biggest diameter as a reference for the best spawn. + + :param points: the list of points + :param k: the number of clusters + :param distance: the distance function, distance(p, q) = ||q - p|| + :param spawn: the number of spawns + :param max_iterations: the maximum number of iterations + :param verbose: boolean, verbosity status + :returns: the partition, structured as \ + a list of [kernel of the cluster, [elements in the cluster]] + """ + kw = { + 'points': points, + 'k': k, + 'distance': distance, + 'max_iterations': max_iterations, + 'verbose': verbose, + } + # Here the result of _k_medoids_spawn_once function is a tuple containing + # in the second element the diameter of the biggest medoid, so the min + # function will return the best medoids arrangement, in the sense that the + # diameter max will be minimum + diameter, medoids = min((_k_medoids_spawn_once(**kw) for _ in range(spawn)), key=itemgetter(0)) + if verbose: + print(('~~ Spawn end: min of max diameters {0:.3f} ' + 'for medoids: {1}').format(diameter, medoids)) + + return diameter, medoids + + +def k_medoids_auto_k(points, distance, spawn, diam_max, max_iterations=_MAX_ITER, verbose=True): + """ + Same as k_medoids, but we increase the number of clusters until we have a + good enough similarity between points. + + :param points: the list of points + :param diam_max: the maximum diameter allowed, otherwise \ + the algorithm will start over and increment the number of clusters + :param distance: the distance function, distance(p, q) = ||q - p|| + :param spawn: the number of spawns + :param iteration: the maximum number of iterations + :param verbose: verbosity + :returns: the partition, structured as \ + a list of [kernel of the cluster, [elements in the cluster]] + """ + if not points: + raise ValueError('No points given!') + + kw = { + 'distance': distance, + 'spawn': spawn, + 'max_iterations': max_iterations, + 'verbose': verbose, + } + + for k, _ in enumerate(points, start=1): + diameter, medoids = k_medoids(points, k, **kw) + if diameter <= diam_max: + break + if verbose: + print('*** Diameter too big {0:.3f} > {1:.3f}'.format(diameter, diam_max)) + print('*** Now trying {0} clusters\n'.format(k + 1)) + + if verbose: + print('*** Diameter ok {0:.3f} <= {1:.3f}'.format(diameter, diam_max)) + print('*** Stopping, {0} clusters enough ({1} points initially)'.format(k, len(points))) + + return diameter, medoids