3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
13 from enum
import IntEnum
24 from pyclustering.core.wrapper
import ccore_library
25 from pyclustering.core.metric_wrapper
import metric_wrapper
27 import pyclustering.core.silhouette_wrapper
as wrapper
32 @brief Represents Silhouette method that is used interpretation and validation of consistency.
33 @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
34 Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
35 K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
36 calculated using following formula:
37 \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
38 where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
39 \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
41 Here is an example where Silhouette score is calculated for K-Means's clustering result:
43 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
44 from pyclustering.cluster.kmeans import kmeans
45 from pyclustering.cluster.silhouette import silhouette
47 from pyclustering.samples.definitions import SIMPLE_SAMPLES
48 from pyclustering.utils import read_sample
50 # Read data 'SampleSimple3' from Simple Sample collection.
51 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
53 # Prepare initial centers
54 centers = kmeans_plusplus_initializer(sample, 4).initialize()
56 # Perform cluster analysis
57 kmeans_instance = kmeans(sample, centers)
58 kmeans_instance.process()
59 clusters = kmeans_instance.get_clusters()
61 # Calculate Silhouette score
62 score = silhouette(sample, clusters).process().get_score()
65 Let's perform clustering of the same sample by K-Means algorithm using different `K` values (2, 4, 6 and 8) and
66 estimate clustering results using Silhouette method.
68 from pyclustering.cluster.kmeans import kmeans
69 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
70 from pyclustering.cluster.silhouette import silhouette
72 from pyclustering.samples.definitions import SIMPLE_SAMPLES
73 from pyclustering.utils import read_sample
75 import matplotlib.pyplot as plt
77 def get_score(sample, amount_clusters):
78 # Prepare initial centers for K-Means algorithm.
79 centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
81 # Perform cluster analysis.
82 kmeans_instance = kmeans(sample, centers)
83 kmeans_instance.process()
84 clusters = kmeans_instance.get_clusters()
86 # Calculate Silhouette score.
87 return silhouette(sample, clusters).process().get_score()
89 def draw_score(figure, position, title, score):
90 ax = figure.add_subplot(position)
91 ax.bar(range(0, len(score)), score, width=0.7)
93 ax.set_xlim(0, len(score))
94 ax.set_xticklabels([])
97 # Read data 'SampleSimple3' from Simple Sample collection.
98 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
100 # Perform cluster analysis and estimation by Silhouette.
101 score_2 = get_score(sample, 2) # K = 2 (amount of clusters).
102 score_4 = get_score(sample, 4) # K = 4 - optimal.
103 score_6 = get_score(sample, 6) # K = 6.
104 score_8 = get_score(sample, 8) # K = 8.
107 figure = plt.figure()
109 # Visualize each result separately.
110 draw_score(figure, 221, 'K = 2', score_2)
111 draw_score(figure, 222, 'K = 4 (optimal)', score_4)
112 draw_score(figure, 223, 'K = 6', score_6)
113 draw_score(figure, 224, 'K = 8', score_8)
115 # Show a plot with visualized results.
119 There is visualized results that were done by Silhouette method. `K = 4` is the optimal amount of clusters in line
120 with Silhouette method because the score for each point is close to `1.0` and the average score for `K = 4` is
121 biggest value among others `K`.
123 @image html silhouette_score_for_various_K.png "Fig. 1. Silhouette scores for various K."
125 @see kmeans, kmedoids, kmedians, xmeans, elbow
131 @brief Initializes Silhouette method for analysis.
133 @param[in] data (array_like): Input data that was used for cluster analysis and that is presented as list of
134 points or distance matrix (defined by parameter 'data_type', by default data is considered as a list
136 @param[in] clusters (list): Clusters that have been obtained after cluster analysis.
137 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
139 <b>Keyword Args:</b><br>
140 - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
141 score calculation (by default Square Euclidean distance).
142 - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
143 - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
149 self.
__data_type = kwargs.get(
'data_type',
'points')
151 if self.
__metric.get_type() != type_metric.USER_DEFINED:
156 self.
__score = [0.0] * len(data)
158 self.
__ccore = kwargs.get(
'ccore',
True)
and self.
__metric.get_type() != type_metric.USER_DEFINED
160 self.
__ccore = ccore_library.workable()
163 self.
__data = numpy.array(data)
170 @brief Calculates Silhouette score for each object from input data.
172 @return (silhouette) Instance of the method (self).
183 def __process_by_ccore(self):
185 @brief Performs processing using CCORE (C/C++ part of pyclustering library).
188 ccore_metric = metric_wrapper.create_instance(self.
__metric)
192 def __process_by_python(self):
194 @brief Performs processing using python code.
197 for index_cluster
in range(len(self.
__clusters)):
198 for index_point
in self.
__clusters[index_cluster]:
204 @brief Returns Silhouette score for each object from input data.
212 def __calculate_score(self, index_point, index_cluster):
214 @brief Calculates Silhouette score for the specific object defined by index_point.
216 @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
217 @param[in] index_cluster (uint): Index cluster to which the point belongs to.
219 @return (float) Silhouette score for the object.
225 difference = self.
__data[index_point]
230 return (b_score - a_score) / max(a_score, b_score)
233 def __calculate_within_cluster_score(self, index_cluster, difference):
235 @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
237 @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
238 @param[in] index_cluster (uint): Index cluster to which the point is belong to.
240 @return (float) 'A' score for the object.
247 return score / (len(self.
__clusters[index_cluster]) - 1)
250 def __calculate_cluster_score(self, index_cluster, difference):
252 @brief Calculates 'B*' score for the specific object for specific cluster.
254 @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
255 @param[in] index_cluster (uint): Index cluster to which the point is belong to.
257 @return (float) 'B*' score for the object for specific cluster.
262 return score / len(self.
__clusters[index_cluster])
265 def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
267 @brief Calculates 'B' score for the specific object for the nearest cluster.
269 @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
270 @param[in] index_cluster (uint): Index cluster to which the point is belong to.
272 @return (float) 'B' score for the object.
276 optimal_score = float(
'inf')
277 for index_neighbor_cluster
in range(len(self.
__clusters)):
278 if index_cluster != index_neighbor_cluster:
280 if candidate_score < optimal_score:
281 optimal_score = candidate_score
283 if optimal_score == float(
'inf'):
289 def __calculate_cluster_difference(self, index_cluster, difference):
291 @brief Calculates distance from each object in specified cluster to specified object.
293 @param[in] index_point (uint): Index point for which difference is calculated.
295 @return (list) Distance from specified object to each object from input data in specified cluster.
298 cluster_difference = 0.0
299 for index_point
in self.
__clusters[index_cluster]:
300 cluster_difference += difference[index_point]
302 return cluster_difference
305 def __calculate_dataset_difference(self, index_point):
307 @brief Calculate distance from each object to specified object.
309 @param[in] index_point (uint): Index point for which difference with other points is calculated.
311 @return (list) Distance to each object from input data from the specified.
315 if self.
__metric.get_type() != type_metric.USER_DEFINED:
320 return dataset_differences
323 def __verify_arguments(self):
325 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
329 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__data))
332 raise ValueError(
"Input clusters are empty (size: '%d')." % len(self.
__clusters))
338 @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
340 @see silhouette_ksearch
355 @brief Returns algorithm type that corresponds to specified enumeration value.
357 @return (type) Algorithm type for cluster analysis.
360 if self == silhouette_ksearch_type.KMEANS:
362 elif self == silhouette_ksearch_type.KMEDIANS:
364 elif self == silhouette_ksearch_type.KMEDOIDS:
373 @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
374 K-Medians, K-Medoids) that is based on Silhouette method.
376 @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
377 separated. Here is an example where clusters are well separated (sample 'Hepta'):
379 from pyclustering.cluster import cluster_visualizer
380 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
381 from pyclustering.cluster.kmeans import kmeans
382 from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
383 from pyclustering.samples.definitions import FCPS_SAMPLES
384 from pyclustering.utils import read_sample
386 sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
387 search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
389 amount = search_instance.get_amount()
390 scores = search_instance.get_scores()
392 print("Scores: '%s'" % str(scores))
394 initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
395 kmeans_instance = kmeans(sample, initial_centers).process()
397 clusters = kmeans_instance.get_clusters()
399 visualizer = cluster_visualizer()
400 visualizer.append_clusters(clusters, sample)
404 Obtained Silhouette scores for each K:
406 Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
409 K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
410 @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
412 @see silhouette_ksearch_type
418 @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
420 @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
421 @param[in] kmin (uint): Minimum amount of clusters that might be allocated. Should be equal or greater than `2`.
422 @param[in] kmax (uint): Maximum amount of clusters that might be allocated. Should be equal or less than amount
423 of points in input data.
424 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `algorithm`, `random_state`).
426 <b>Keyword Args:</b><br>
427 - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
428 clusters (by default K-Means).
429 - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
436 self.
__algorithm = kwargs.get(
'algorithm', silhouette_ksearch_type.KMEANS)
446 self.
__ccore = kwargs.get(
'ccore',
True)
448 self.
__ccore = ccore_library.workable()
453 @brief Performs analysis to find optimal amount of clusters.
455 @see get_amount, get_score, get_scores
457 @return (silhouette_search) Itself instance (silhouette_search)
468 def __process_by_ccore(self):
470 @brief Performs processing using CCORE (C/C++ part of pyclustering library).
478 scores_list = results[2]
480 for i
in range(len(scores_list)):
484 def __process_by_python(self):
486 @brief Performs processing using python code.
493 if len(clusters) != k:
499 self.
__scores[k] = sum(score) / len(score)
508 @brief Returns optimal amount of clusters that has been found during analysis.
510 @return (uint) Optimal amount of clusters.
520 @brief Returns silhouette score that belongs to optimal amount of clusters (k).
522 @return (float) Score that belong to optimal amount of clusters.
524 @see process, get_scores
532 @brief Returns silhouette score for each K value (amount of clusters).
534 @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
536 @see process, get_score
542 def __calculate_clusters(self, k):
544 @brief Performs cluster analysis using specified K value.
546 @param[in] k (uint): Amount of clusters that should be allocated.
548 @return (array_like) Allocated clusters.
553 return algorithm_type(self.
__data, initial_values).
process().get_clusters()
556 def __verify_arguments(self):
558 @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
562 raise ValueError(
"K max value '" + str(self.
__kmax) +
"' is bigger than amount of objects '" +
563 str(len(self.
__data)) +
"' in input data.")
566 raise ValueError(
"K min value '" + str(self.
__kmin) +
"' should be greater than 1 (impossible to provide "
567 "silhouette score for only one cluster).")