 silhouette.py
1 """!
2
3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
5
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
9
11  PyClustering is free software: you can redistribute it and/or modify
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24
25 """
26
27
28 from enum import Enum
29
30 import numpy
31
32 from pyclustering.cluster.kmeans import kmeans
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster.kmedoids import kmedoids
35 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
36
37 from pyclustering.utils.metric import distance_metric, type_metric
38
39
40 class silhouette:
41  """!
42  @brief Represents Silhouette method that is used interpretation and validation of consistency.
43  @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
44  Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
45  K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
46  calculated using following formula:
47  \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
48  where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
49  \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
50
51  Here is an example where Silhouette score is calculated for K-Means's clustering result:
52  @code
53  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
54  from pyclustering.cluster.kmeans import kmeans
55  from pyclustering.cluster.silhouette import silhouette
56
57  from pyclustering.samples.definitions import SIMPLE_SAMPLES
59
60  # Read data 'SampleSimple3' from Simple Sample collection.
62
63  # Prepare initial centers
64  centers = kmeans_plusplus_initializer(sample, 4).initialize();
65
66  # Perform cluster analysis
67  kmeans_instance = kmeans(sample, centers);
68  kmeans_instance.process();
69  clusters = kmeans_instance.get_clusters();
70
71  # Calculate Silhouette score
72  score = silhouette(sample, clusters).process().get_score()
73  @endcode
74
75  @see kmeans, kmedoids, kmedians, xmeans, elbow
76
77  """
78
79  def __init__(self, data, clusters, **kwargs):
80  """!
81  @brief Initializes Silhouette method for analysis.
82
83  @param[in] data (array_like): Input data that was used for cluster analysis.
84  @param[in] clusters (list): Cluster that have been obtained after cluster analysis.
85  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
86
87  <b>Keyword Args:</b><br>
88  - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
89  score calculation (by default Square Euclidean distance).
90
91  """
92  self.__data = numpy.array(data)
93  self.__clusters = clusters
94  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
95
96  if self.__metric.get_type() != type_metric.USER_DEFINED:
97  self.__metric.enable_numpy_usage()
98  else:
99  self.__metric.disable_numpy_usage()
100
101  self.__score = [0.0] * len(data)
102
103
104  def process(self):
105  """!
106  @brief Calculates Silhouette score for each object from input data.
107
108  @return (silhouette) Instance of the method (self).
109
110  """
111  for index_cluster in range(len(self.__clusters)):
112  for index_point in self.__clusters[index_cluster]:
113  self.__score[index_point] = self.__calculate_score(index_point, index_cluster)
114
115  return self
116
117
118  def get_score(self):
119  """!
120  @brief Returns Silhouette score for each object from input data.
121
122  @see process
123
124  """
125  return self.__score
126
127
128  def __calculate_score(self, index_point, index_cluster):
129  """!
130  @brief Calculates Silhouette score for the specific object defined by index_point.
131
132  @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
133  @param[in] index_cluster (uint): Index cluster to which the point belongs to.
134
135  @return (float) Silhouette score for the object.
136
137  """
138  difference = self.__calculate_dataset_difference(index_point)
139
140  a_score = self.__calculate_within_cluster_score(index_cluster, difference)
141  b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
142
143  return (b_score - a_score) / max(a_score, b_score)
144
145
146  def __calculate_within_cluster_score(self, index_cluster, difference):
147  """!
148  @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
149
150  @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
151  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
152
153  @return (float) 'A' score for the object.
154
155  """
156
157  score = self.__calculate_cluster_difference(index_cluster, difference)
158  if len(self.__clusters[index_cluster]) == 1:
159  return float('nan')
160  return score / (len(self.__clusters[index_cluster]) - 1)
161
162
163  def __calculate_cluster_score(self, index_cluster, difference):
164  """!
165  @brief Calculates 'B*' score for the specific object for specific cluster.
166
167  @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
168  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
169
170  @return (float) 'B*' score for the object for specific cluster.
171
172  """
173
174  score = self.__calculate_cluster_difference(index_cluster, difference)
175  return score / len(self.__clusters[index_cluster])
176
177
178  def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
179  """!
180  @brief Calculates 'B' score for the specific object for the nearest cluster.
181
182  @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
183  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
184
185  @return (float) 'B' score for the object.
186
187  """
188
189  optimal_score = float('inf')
190  for index_neighbor_cluster in range(len(self.__clusters)):
191  if index_cluster != index_neighbor_cluster:
192  candidate_score = self.__calculate_cluster_score(index_neighbor_cluster, difference)
193  if candidate_score < optimal_score:
194  optimal_score = candidate_score
195
196  return optimal_score
197
198
199  def __calculate_cluster_difference(self, index_cluster, difference):
200  """!
201  @brief Calculates distance from each object in specified cluster to specified object.
202
203  @param[in] index_point (uint): Index point for which difference is calculated.
204
205  @return (list) Distance from specified object to each object from input data in specified cluster.
206
207  """
208  cluster_difference = 0.0
209  for index_point in self.__clusters[index_cluster]:
210  cluster_difference += difference[index_point]
211
212  return cluster_difference
213
214
215  def __calculate_dataset_difference(self, index_point):
216  """!
217  @brief Calculate distance from each object to specified object.
218
219  @param[in] index_point (uint): Index point for which difference with other points is calculated.
220
221  @return (list) Distance to each object from input data from the specified.
222
223  """
224
225  if self.__metric.get_type() != type_metric.USER_DEFINED:
226  dataset_differences = self.__metric(self.__data, self.__data[index_point])
227  else:
228  dataset_differences = [self.__metric(point, self.__data[index_point]) for point in self.__data]
229
230  return dataset_differences
231
232
233
235  """!
236  @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
237
238  @see silhouette_ksearch
239
240  """
241
242
243  KMEANS = 0
244
245
246  KMEDIANS = 1
247
248
249  KMEDOIDS = 2
250
251  def get_type(self):
252  """!
253  @brief Returns algorithm type that corresponds to specified enumeration value.
254
255  @return (type) Algorithm type for cluster analysis.
256
257  """
258  if self == silhouette_ksearch_type.KMEANS:
259  return kmeans
260  elif self == silhouette_ksearch_type.KMEDIANS:
261  return kmedians
262  elif self == silhouette_ksearch_type.KMEDOIDS:
263  return kmedoids
264  else:
265  return None
266
267
268
270  """!
271  @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
272  K-Medians, K-Medoids) that is based on Silhouette method.
273
274  @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
275  separated. Here is an example where clusters are well separated (sample 'Hepta'):
276  @code
277  from pyclustering.cluster import cluster_visualizer
278  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
279  from pyclustering.cluster.kmeans import kmeans
280  from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
281  from pyclustering.samples.definitions import FCPS_SAMPLES
283
285  search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
286
287  amount = search_instance.get_amount()
288  scores = search_instance.get_scores()
289
290  print("Scores: '%s'" % str(scores))
291
292  initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
293  kmeans_instance = kmeans(sample, initial_centers).process()
294
295  clusters = kmeans_instance.get_clusters()
296
297  visualizer = cluster_visualizer()
298  visualizer.append_clusters(clusters, sample)
299  visualizer.show()
300  @endcode
301
302  Obtained Silhouette scores for each K:
303  @code
304  Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
305  @endcode
306
307  K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
308  @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
309
310  @see silhouette_ksearch_type
311
312  """
313
314  def __init__(self, data, kmin, kmax, **kwargs):
315  """!
316  @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
317
318  @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
319  @param[in] kmin (uint): Amount of clusters from which search is performed. Should be equal or greater than 2.
320  @param[in] kmax (uint): Amount of clusters to which search is performed. Should be equal or less than amount of
321  points in input data.
322  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'algorithm').
323
324  <b>Keyword Args:</b><br>
325  - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
326  clusters (by default K-Means).
327
328  """
329  self.__data = data
330  self.__kmin = kmin
331  self.__kmax = kmax
332
333  self.__algorithm = kwargs.get('algorithm', silhouette_ksearch_type.KMEANS)
334  self.__return_index = self.__algorithm == silhouette_ksearch_type.KMEDOIDS
335
336  self.__amount = -1
337  self.__score = float('-Inf')
338  self.__scores = {}
339
340  self.__verify_arguments()
341
342
343  def process(self):
344  """!
345  @brief Performs analysis to find optimal amount of clusters.
346
347  @see get_amount, get_score, get_scores
348
349  @return (silhouette_search) Itself instance (silhouette_search)
350
351  """
352  self.__scores = {}
353
354  for k in range(self.__kmin, self.__kmax):
355  clusters = self.__calculate_clusters(k)
356  if len(clusters) != k:
357  self.__scores[k] = float('nan')
358  continue
359
360  score = silhouette(self.__data, clusters).process().get_score()
361
362  self.__scores[k] = sum(score) / len(score)
363
364  if self.__scores[k] > self.__score:
365  self.__score = self.__scores[k]
366  self.__amount = k
367
368  return self
369
370
371  def get_amount(self):
372  """!
373  @brief Returns optimal amount of clusters that has been found during analysis.
374
375  @return (uint) Optimal amount of clusters.
376
377  @see process
378
379  """
380  return self.__amount
381
382
383  def get_score(self):
384  """!
385  @brief Returns silhouette score that belongs to optimal amount of clusters (k).
386
387  @return (float) Score that belong to optimal amount of clusters.
388
389  @see process, get_scores
390
391  """
392  return self.__score
393
394
395  def get_scores(self):
396  """!
397  @brief Returns silhouette score for each K value (amount of clusters).
398
399  @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
400
401  @see process, get_score
402
403  """
404  return self.__scores
405
406
407  def __calculate_clusters(self, k):
408  """!
409  @brief Performs cluster analysis using specified K value.
410
411  @param[in] k (uint): Amount of clusters that should be allocated.
412
413  @return (array_like) Allocated clusters.
414
415  """
416  initial_values = kmeans_plusplus_initializer(self.__data, k).initialize(return_index=self.__return_index)
417  algorithm_type = self.__algorithm.get_type()
418  return algorithm_type(self.__data, initial_values).process().get_clusters()
419
420
421  def __verify_arguments(self):
422  """!
423  @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
424
425  """
426  if self.__kmax > len(self.__data):
427  raise ValueError("K max value '" + str(self.__kmax) + "' is bigger than amount of objects '" +
428  str(len(self.__data)) + "' in input data.")
429
430  if self.__kmin <= 1:
431  raise ValueError("K min value '" + str(self.__kmin) + "' should be greater than 1 (impossible to provide "
432  "silhiuette score for only one cluster).")
Cluster analysis algorithm: K-Medians.
Definition: kmedians.py:1
Cluster analysis algorithm: K-Means.
Definition: kmeans.py:1
def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference)
Calculates &#39;B&#39; score for the specific object for the nearest cluster.
Definition: silhouette.py:178
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def process(self)
Calculates Silhouette score for each object from input data.
Definition: silhouette.py:104
def __calculate_score(self, index_point, index_cluster)
Calculates Silhouette score for the specific object defined by index_point.
Definition: silhouette.py:128
def process(self)
Performs analysis to find optimal amount of clusters.
Definition: silhouette.py:343
def get_score(self)
Returns Silhouette score for each object from input data.
Definition: silhouette.py:118
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:58
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __calculate_cluster_difference(self, index_cluster, difference)
Calculates distance from each object in specified cluster to specified object.
Definition: silhouette.py:199
def __init__(self, data, clusters, kwargs)
Initializes Silhouette method for analysis.
Definition: silhouette.py:79
def __init__(self, data, kmin, kmax, kwargs)
Initialize Silhouette search algorithm to find out optimal amount of clusters.
Definition: silhouette.py:314
def __calculate_within_cluster_score(self, index_cluster, difference)
Calculates &#39;A&#39; score for the specific object in cluster to which it belongs to.
Definition: silhouette.py:146
def get_type(self)
Returns algorithm type that corresponds to specified enumeration value.
Definition: silhouette.py:251
Defines algorithms that can be used to find optimal number of cluster using Silhouette method...
Definition: silhouette.py:234
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def __calculate_clusters(self, k)
Performs cluster analysis using specified K value.
Definition: silhouette.py:407
def get_scores(self)
Returns silhouette score for each K value (amount of clusters).
Definition: silhouette.py:395
def __verify_arguments(self)
Checks algorithm&#39;s arguments and if some of them is incorrect then exception is thrown.
Definition: silhouette.py:421
Represents Silhouette method that is used interpretation and validation of consistency.
Definition: silhouette.py:40
def get_score(self)
Returns silhouette score that belongs to optimal amount of clusters (k).
Definition: silhouette.py:383
Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means...
Definition: silhouette.py:269
Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
Definition: kmedoids.py:1
def __calculate_dataset_difference(self, index_point)
Calculate distance from each object to specified object.
Definition: silhouette.py:215
def __calculate_cluster_score(self, index_cluster, difference)
Calculates &#39;B*&#39; score for the specific object for specific cluster.
Definition: silhouette.py:163
def get_amount(self)
Returns optimal amount of clusters that has been found during analysis.
Definition: silhouette.py:371