silhouette.py
1 """!
2 
3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum
29 
30 import numpy
31 
32 from pyclustering.cluster.kmeans import kmeans
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster.kmedoids import kmedoids
35 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
36 
37 from pyclustering.utils.metric import distance_metric, type_metric
38 
39 from pyclustering.core.wrapper import ccore_library
40 from pyclustering.core.metric_wrapper import metric_wrapper
41 
42 import pyclustering.core.silhouette_wrapper as wrapper
43 
44 
45 class silhouette:
46  """!
47  @brief Represents Silhouette method that is used interpretation and validation of consistency.
48  @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
49  Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
50  K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
51  calculated using following formula:
52  \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
53  where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
54  \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
55 
56  Here is an example where Silhouette score is calculated for K-Means's clustering result:
57  @code
58  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
59  from pyclustering.cluster.kmeans import kmeans
60  from pyclustering.cluster.silhouette import silhouette
61 
62  from pyclustering.samples.definitions import SIMPLE_SAMPLES
63  from pyclustering.utils import read_sample
64 
65  # Read data 'SampleSimple3' from Simple Sample collection.
66  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
67 
68  # Prepare initial centers
69  centers = kmeans_plusplus_initializer(sample, 4).initialize()
70 
71  # Perform cluster analysis
72  kmeans_instance = kmeans(sample, centers)
73  kmeans_instance.process()
74  clusters = kmeans_instance.get_clusters()
75 
76  # Calculate Silhouette score
77  score = silhouette(sample, clusters).process().get_score()
78  @endcode
79 
80  Let's perform clustering of the same sample by K-Means algorithm using different `K` values (2, 4, 6 and 8) and
81  estimate clustering results using Silhouette method.
82  @code
83  from pyclustering.cluster.kmeans import kmeans
84  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
85  from pyclustering.cluster.silhouette import silhouette
86 
87  from pyclustering.samples.definitions import SIMPLE_SAMPLES
88  from pyclustering.utils import read_sample
89 
90  import matplotlib.pyplot as plt
91 
92  def get_score(sample, amount_clusters):
93  # Prepare initial centers for K-Means algorithm.
94  centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
95 
96  # Perform cluster analysis.
97  kmeans_instance = kmeans(sample, centers)
98  kmeans_instance.process()
99  clusters = kmeans_instance.get_clusters()
100 
101  # Calculate Silhouette score.
102  return silhouette(sample, clusters).process().get_score()
103 
104  def draw_score(figure, position, title, score):
105  ax = figure.add_subplot(position)
106  ax.bar(range(0, len(score)), score, width=0.7)
107  ax.set_title(title)
108  ax.set_xlim(0, len(score))
109  ax.set_xticklabels([])
110  ax.grid()
111 
112  # Read data 'SampleSimple3' from Simple Sample collection.
113  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
114 
115  # Perform cluster analysis and estimation by Silhouette.
116  score_2 = get_score(sample, 2) # K = 2 (amount of clusters).
117  score_4 = get_score(sample, 4) # K = 4 - optimal.
118  score_6 = get_score(sample, 6) # K = 6.
119  score_8 = get_score(sample, 8) # K = 8.
120 
121  # Visualize results.
122  figure = plt.figure()
123 
124  # Visualize each result separately.
125  draw_score(figure, 221, 'K = 2', score_2)
126  draw_score(figure, 222, 'K = 4 (optimal)', score_4)
127  draw_score(figure, 223, 'K = 6', score_6)
128  draw_score(figure, 224, 'K = 8', score_8)
129 
130  # Show a plot with visualized results.
131  plt.show()
132  @endcode
133 
134  There is visualized results that were done by Silhouette method. `K = 4` is the optimal amount of clusters in line
135  with Silhouette method because the score for each point is close to `1.0` and the average score for `K = 4` is
136  biggest value among others `K`.
137 
138  @image html silhouette_score_for_various_K.png "Fig. 1. Silhouette scores for various K."
139 
140  @see kmeans, kmedoids, kmedians, xmeans, elbow
141 
142  """
143 
144  def __init__(self, data, clusters, **kwargs):
145  """!
146  @brief Initializes Silhouette method for analysis.
147 
148  @param[in] data (array_like): Input data that was used for cluster analysis and that is presented as list of
149  points or distance matrix (defined by parameter 'data_type', by default data is considered as a list
150  of points).
151  @param[in] clusters (list): Clusters that have been obtained after cluster analysis.
152  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
153 
154  <b>Keyword Args:</b><br>
155  - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
156  score calculation (by default Square Euclidean distance).
157  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
158  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
159 
160  """
161  self.__data = data
162  self.__clusters = clusters
163  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
164  self.__data_type = kwargs.get('data_type', 'points')
165 
166  if self.__metric.get_type() != type_metric.USER_DEFINED:
167  self.__metric.enable_numpy_usage()
168  else:
169  self.__metric.disable_numpy_usage()
170 
171  self.__score = [0.0] * len(data)
172 
173  self.__ccore = kwargs.get('ccore', True) and self.__metric.get_type() != type_metric.USER_DEFINED
174  if self.__ccore:
175  self.__ccore = ccore_library.workable()
176 
177  if self.__ccore is False:
178  self.__data = numpy.array(data)
179 
180  self.__verify_arguments()
181 
182 
183  def process(self):
184  """!
185  @brief Calculates Silhouette score for each object from input data.
186 
187  @return (silhouette) Instance of the method (self).
188 
189  """
190  if self.__ccore is True:
191  self.__process_by_ccore()
192  else:
193  self.__process_by_python()
194 
195  return self
196 
197 
198  def __process_by_ccore(self):
199  """!
200  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
201 
202  """
203  ccore_metric = metric_wrapper.create_instance(self.__metric)
204  self.__score = wrapper.silhoeutte(self.__data, self.__clusters, ccore_metric.get_pointer(), self.__data_type)
205 
206 
207  def __process_by_python(self):
208  """!
209  @brief Performs processing using python code.
210 
211  """
212  for index_cluster in range(len(self.__clusters)):
213  for index_point in self.__clusters[index_cluster]:
214  self.__score[index_point] = self.__calculate_score(index_point, index_cluster)
215 
216 
217  def get_score(self):
218  """!
219  @brief Returns Silhouette score for each object from input data.
220 
221  @see process
222 
223  """
224  return self.__score
225 
226 
227  def __calculate_score(self, index_point, index_cluster):
228  """!
229  @brief Calculates Silhouette score for the specific object defined by index_point.
230 
231  @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
232  @param[in] index_cluster (uint): Index cluster to which the point belongs to.
233 
234  @return (float) Silhouette score for the object.
235 
236  """
237  if self.__data_type == 'points':
238  difference = self.__calculate_dataset_difference(index_point)
239  else:
240  difference = self.__data[index_point]
241 
242  a_score = self.__calculate_within_cluster_score(index_cluster, difference)
243  b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
244 
245  return (b_score - a_score) / max(a_score, b_score)
246 
247 
248  def __calculate_within_cluster_score(self, index_cluster, difference):
249  """!
250  @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
251 
252  @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
253  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
254 
255  @return (float) 'A' score for the object.
256 
257  """
258 
259  score = self.__calculate_cluster_difference(index_cluster, difference)
260  if len(self.__clusters[index_cluster]) == 1:
261  return float('nan')
262  return score / (len(self.__clusters[index_cluster]) - 1)
263 
264 
265  def __calculate_cluster_score(self, index_cluster, difference):
266  """!
267  @brief Calculates 'B*' score for the specific object for specific cluster.
268 
269  @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
270  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
271 
272  @return (float) 'B*' score for the object for specific cluster.
273 
274  """
275 
276  score = self.__calculate_cluster_difference(index_cluster, difference)
277  return score / len(self.__clusters[index_cluster])
278 
279 
280  def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
281  """!
282  @brief Calculates 'B' score for the specific object for the nearest cluster.
283 
284  @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
285  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
286 
287  @return (float) 'B' score for the object.
288 
289  """
290 
291  optimal_score = float('inf')
292  for index_neighbor_cluster in range(len(self.__clusters)):
293  if index_cluster != index_neighbor_cluster:
294  candidate_score = self.__calculate_cluster_score(index_neighbor_cluster, difference)
295  if candidate_score < optimal_score:
296  optimal_score = candidate_score
297 
298  if optimal_score == float('inf'):
299  optimal_score = -1.0
300 
301  return optimal_score
302 
303 
304  def __calculate_cluster_difference(self, index_cluster, difference):
305  """!
306  @brief Calculates distance from each object in specified cluster to specified object.
307 
308  @param[in] index_point (uint): Index point for which difference is calculated.
309 
310  @return (list) Distance from specified object to each object from input data in specified cluster.
311 
312  """
313  cluster_difference = 0.0
314  for index_point in self.__clusters[index_cluster]:
315  cluster_difference += difference[index_point]
316 
317  return cluster_difference
318 
319 
320  def __calculate_dataset_difference(self, index_point):
321  """!
322  @brief Calculate distance from each object to specified object.
323 
324  @param[in] index_point (uint): Index point for which difference with other points is calculated.
325 
326  @return (list) Distance to each object from input data from the specified.
327 
328  """
329 
330  if self.__metric.get_type() != type_metric.USER_DEFINED:
331  dataset_differences = self.__metric(self.__data, self.__data[index_point])
332  else:
333  dataset_differences = [self.__metric(point, self.__data[index_point]) for point in self.__data]
334 
335  return dataset_differences
336 
337 
338  def __verify_arguments(self):
339  """!
340  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
341 
342  """
343  if len(self.__data) == 0:
344  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
345 
346  if len(self.__clusters) == 0:
347  raise ValueError("Input clusters are empty (size: '%d')." % len(self.__clusters))
348 
349 
350 
351 class silhouette_ksearch_type(IntEnum):
352  """!
353  @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
354 
355  @see silhouette_ksearch
356 
357  """
358 
359 
360  KMEANS = 0
361 
362 
363  KMEDIANS = 1
364 
365 
366  KMEDOIDS = 2
367 
368  def get_type(self):
369  """!
370  @brief Returns algorithm type that corresponds to specified enumeration value.
371 
372  @return (type) Algorithm type for cluster analysis.
373 
374  """
375  if self == silhouette_ksearch_type.KMEANS:
376  return kmeans
377  elif self == silhouette_ksearch_type.KMEDIANS:
378  return kmedians
379  elif self == silhouette_ksearch_type.KMEDOIDS:
380  return kmedoids
381  else:
382  return None
383 
384 
385 
387  """!
388  @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
389  K-Medians, K-Medoids) that is based on Silhouette method.
390 
391  @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
392  separated. Here is an example where clusters are well separated (sample 'Hepta'):
393  @code
394  from pyclustering.cluster import cluster_visualizer
395  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
396  from pyclustering.cluster.kmeans import kmeans
397  from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
398  from pyclustering.samples.definitions import FCPS_SAMPLES
399  from pyclustering.utils import read_sample
400 
401  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
402  search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
403 
404  amount = search_instance.get_amount()
405  scores = search_instance.get_scores()
406 
407  print("Scores: '%s'" % str(scores))
408 
409  initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
410  kmeans_instance = kmeans(sample, initial_centers).process()
411 
412  clusters = kmeans_instance.get_clusters()
413 
414  visualizer = cluster_visualizer()
415  visualizer.append_clusters(clusters, sample)
416  visualizer.show()
417  @endcode
418 
419  Obtained Silhouette scores for each K:
420  @code
421  Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
422  @endcode
423 
424  K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
425  @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
426 
427  @see silhouette_ksearch_type
428 
429  """
430 
431  def __init__(self, data, kmin, kmax, **kwargs):
432  """!
433  @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
434 
435  @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
436  @param[in] kmin (uint): Minimum amount of clusters that might be allocated. Should be equal or greater than `2`.
437  @param[in] kmax (uint): Maximum amount of clusters that might be allocated. Should be equal or less than amount
438  of points in input data.
439  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `algorithm`, `random_state`).
440 
441  <b>Keyword Args:</b><br>
442  - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
443  clusters (by default K-Means).
444  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
445 
446  """
447  self.__data = data
448  self.__kmin = kmin
449  self.__kmax = kmax
450 
451  self.__algorithm = kwargs.get('algorithm', silhouette_ksearch_type.KMEANS)
452  self.__random_state = kwargs.get('random_state', None)
453  self.__return_index = self.__algorithm == silhouette_ksearch_type.KMEDOIDS
454 
455  self.__amount = -1
456  self.__score = -1.0
457  self.__scores = {}
458 
459  self.__verify_arguments()
460 
461  self.__ccore = kwargs.get('ccore', True)
462  if self.__ccore:
463  self.__ccore = ccore_library.workable()
464 
465 
466  def process(self):
467  """!
468  @brief Performs analysis to find optimal amount of clusters.
469 
470  @see get_amount, get_score, get_scores
471 
472  @return (silhouette_search) Itself instance (silhouette_search)
473 
474  """
475  if self.__ccore is True:
476  self.__process_by_ccore()
477  else:
478  self.__process_by_python()
479 
480  return self
481 
482 
483  def __process_by_ccore(self):
484  """!
485  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
486 
487  """
488  results = wrapper.silhoeutte_ksearch(self.__data, self.__kmin, self.__kmax, self.__algorithm, self.__random_state)
489 
490  self.__amount = results[0]
491  self.__score = results[1]
492 
493  scores_list = results[2]
494  self.__scores = {}
495  for i in range(len(scores_list)):
496  self.__scores[self.__kmin + i] = scores_list[i]
497 
498 
499  def __process_by_python(self):
500  """!
501  @brief Performs processing using python code.
502 
503  """
504  self.__scores = {}
505 
506  for k in range(self.__kmin, self.__kmax):
507  clusters = self.__calculate_clusters(k)
508  if len(clusters) != k:
509  self.__scores[k] = float('nan')
510  continue
511 
512  score = silhouette(self.__data, clusters).process().get_score()
513 
514  self.__scores[k] = sum(score) / len(score)
515 
516  if self.__scores[k] > self.__score:
517  self.__score = self.__scores[k]
518  self.__amount = k
519 
520 
521  def get_amount(self):
522  """!
523  @brief Returns optimal amount of clusters that has been found during analysis.
524 
525  @return (uint) Optimal amount of clusters.
526 
527  @see process
528 
529  """
530  return self.__amount
531 
532 
533  def get_score(self):
534  """!
535  @brief Returns silhouette score that belongs to optimal amount of clusters (k).
536 
537  @return (float) Score that belong to optimal amount of clusters.
538 
539  @see process, get_scores
540 
541  """
542  return self.__score
543 
544 
545  def get_scores(self):
546  """!
547  @brief Returns silhouette score for each K value (amount of clusters).
548 
549  @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
550 
551  @see process, get_score
552 
553  """
554  return self.__scores
555 
556 
557  def __calculate_clusters(self, k):
558  """!
559  @brief Performs cluster analysis using specified K value.
560 
561  @param[in] k (uint): Amount of clusters that should be allocated.
562 
563  @return (array_like) Allocated clusters.
564 
565  """
566  initial_values = kmeans_plusplus_initializer(self.__data, k, random_state=self.__random_state).initialize(return_index=self.__return_index)
567  algorithm_type = self.__algorithm.get_type()
568  return algorithm_type(self.__data, initial_values).process().get_clusters()
569 
570 
571  def __verify_arguments(self):
572  """!
573  @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
574 
575  """
576  if self.__kmax > len(self.__data):
577  raise ValueError("K max value '" + str(self.__kmax) + "' is bigger than amount of objects '" +
578  str(len(self.__data)) + "' in input data.")
579 
580  if self.__kmin <= 1:
581  raise ValueError("K min value '" + str(self.__kmin) + "' should be greater than 1 (impossible to provide "
582  "silhouette score for only one cluster).")
Cluster analysis algorithm: K-Medians.
Definition: kmedians.py:1
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference)
Calculates &#39;B&#39; score for the specific object for the nearest cluster.
Definition: silhouette.py:280
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:198
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def process(self)
Calculates Silhouette score for each object from input data.
Definition: silhouette.py:183
def __calculate_score(self, index_point, index_cluster)
Calculates Silhouette score for the specific object defined by index_point.
Definition: silhouette.py:227
def process(self)
Performs analysis to find optimal amount of clusters.
Definition: silhouette.py:466
def get_score(self)
Returns Silhouette score for each object from input data.
Definition: silhouette.py:217
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __calculate_cluster_difference(self, index_cluster, difference)
Calculates distance from each object in specified cluster to specified object.
Definition: silhouette.py:304
def __init__(self, data, clusters, kwargs)
Initializes Silhouette method for analysis.
Definition: silhouette.py:144
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:499
def __init__(self, data, kmin, kmax, kwargs)
Initialize Silhouette search algorithm to find out optimal amount of clusters.
Definition: silhouette.py:431
def __calculate_within_cluster_score(self, index_cluster, difference)
Calculates &#39;A&#39; score for the specific object in cluster to which it belongs to.
Definition: silhouette.py:248
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:483
def get_type(self)
Returns algorithm type that corresponds to specified enumeration value.
Definition: silhouette.py:368
Defines algorithms that can be used to find optimal number of cluster using Silhouette method...
Definition: silhouette.py:351
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def __calculate_clusters(self, k)
Performs cluster analysis using specified K value.
Definition: silhouette.py:557
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:207
def get_scores(self)
Returns silhouette score for each K value (amount of clusters).
Definition: silhouette.py:545
def __verify_arguments(self)
Checks algorithm&#39;s arguments and if some of them is incorrect then exception is thrown.
Definition: silhouette.py:571
Represents Silhouette method that is used interpretation and validation of consistency.
Definition: silhouette.py:45
def get_score(self)
Returns silhouette score that belongs to optimal amount of clusters (k).
Definition: silhouette.py:533
Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means...
Definition: silhouette.py:386
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: silhouette.py:338
Cluster analysis algorithm: K-Medoids.
Definition: kmedoids.py:1
def __calculate_dataset_difference(self, index_point)
Calculate distance from each object to specified object.
Definition: silhouette.py:320
def __calculate_cluster_score(self, index_cluster, difference)
Calculates &#39;B*&#39; score for the specific object for specific cluster.
Definition: silhouette.py:265
def get_amount(self)
Returns optimal amount of clusters that has been found during analysis.
Definition: silhouette.py:521