silhouette.py
1 """!
2 
3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum
29 
30 import numpy
31 
32 from pyclustering.cluster.kmeans import kmeans
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster.kmedoids import kmedoids
35 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
36 
37 from pyclustering.utils.metric import distance_metric, type_metric
38 
39 from pyclustering.core.wrapper import ccore_library
40 from pyclustering.core.metric_wrapper import metric_wrapper
41 
42 import pyclustering.core.silhouette_wrapper as wrapper
43 
44 
45 class silhouette:
46  """!
47  @brief Represents Silhouette method that is used interpretation and validation of consistency.
48  @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
49  Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
50  K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
51  calculated using following formula:
52  \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
53  where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
54  \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
55 
56  Here is an example where Silhouette score is calculated for K-Means's clustering result:
57  @code
58  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
59  from pyclustering.cluster.kmeans import kmeans
60  from pyclustering.cluster.silhouette import silhouette
61 
62  from pyclustering.samples.definitions import SIMPLE_SAMPLES
63  from pyclustering.utils import read_sample
64 
65  # Read data 'SampleSimple3' from Simple Sample collection.
66  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
67 
68  # Prepare initial centers
69  centers = kmeans_plusplus_initializer(sample, 4).initialize();
70 
71  # Perform cluster analysis
72  kmeans_instance = kmeans(sample, centers);
73  kmeans_instance.process();
74  clusters = kmeans_instance.get_clusters();
75 
76  # Calculate Silhouette score
77  score = silhouette(sample, clusters).process().get_score()
78  @endcode
79 
80  @see kmeans, kmedoids, kmedians, xmeans, elbow
81 
82  """
83 
84  def __init__(self, data, clusters, **kwargs):
85  """!
86  @brief Initializes Silhouette method for analysis.
87 
88  @param[in] data (array_like): Input data that was used for cluster analysis and that is presented as list of
89  points or distance matrix (defined by parameter 'data_type', by default data is considered as a list
90  of points).
91  @param[in] clusters (list): Cluster that have been obtained after cluster analysis.
92  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
93 
94  <b>Keyword Args:</b><br>
95  - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
96  score calculation (by default Square Euclidean distance).
97  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
98  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
99 
100  """
101  self.__data = data
102  self.__clusters = clusters
103  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
104  self.__data_type = kwargs.get('data_type', 'points')
105 
106  if self.__metric.get_type() != type_metric.USER_DEFINED:
107  self.__metric.enable_numpy_usage()
108  else:
109  self.__metric.disable_numpy_usage()
110 
111  self.__score = [0.0] * len(data)
112 
113  self.__ccore = kwargs.get('ccore', True) and self.__metric.get_type() != type_metric.USER_DEFINED
114  if self.__ccore:
115  self.__ccore = ccore_library.workable()
116 
117  if self.__ccore is False:
118  self.__data = numpy.array(data)
119 
120  self.__verify_arguments()
121 
122 
123  def process(self):
124  """!
125  @brief Calculates Silhouette score for each object from input data.
126 
127  @return (silhouette) Instance of the method (self).
128 
129  """
130  if self.__ccore is True:
131  self.__process_by_ccore()
132  else:
133  self.__process_by_python()
134 
135  return self
136 
137 
138  def __process_by_ccore(self):
139  """!
140  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
141 
142  """
143  ccore_metric = metric_wrapper.create_instance(self.__metric)
144  self.__score = wrapper.silhoeutte(self.__data, self.__clusters, ccore_metric.get_pointer(), self.__data_type)
145 
146 
147  def __process_by_python(self):
148  """!
149  @brief Performs processing using python code.
150 
151  """
152  for index_cluster in range(len(self.__clusters)):
153  for index_point in self.__clusters[index_cluster]:
154  self.__score[index_point] = self.__calculate_score(index_point, index_cluster)
155 
156 
157  def get_score(self):
158  """!
159  @brief Returns Silhouette score for each object from input data.
160 
161  @see process
162 
163  """
164  return self.__score
165 
166 
167  def __calculate_score(self, index_point, index_cluster):
168  """!
169  @brief Calculates Silhouette score for the specific object defined by index_point.
170 
171  @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
172  @param[in] index_cluster (uint): Index cluster to which the point belongs to.
173 
174  @return (float) Silhouette score for the object.
175 
176  """
177  if self.__data_type == 'points':
178  difference = self.__calculate_dataset_difference(index_point)
179  else:
180  difference = self.__data[index_point]
181 
182  a_score = self.__calculate_within_cluster_score(index_cluster, difference)
183  b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
184 
185  return (b_score - a_score) / max(a_score, b_score)
186 
187 
188  def __calculate_within_cluster_score(self, index_cluster, difference):
189  """!
190  @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
191 
192  @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
193  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
194 
195  @return (float) 'A' score for the object.
196 
197  """
198 
199  score = self.__calculate_cluster_difference(index_cluster, difference)
200  if len(self.__clusters[index_cluster]) == 1:
201  return float('nan')
202  return score / (len(self.__clusters[index_cluster]) - 1)
203 
204 
205  def __calculate_cluster_score(self, index_cluster, difference):
206  """!
207  @brief Calculates 'B*' score for the specific object for specific cluster.
208 
209  @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
210  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
211 
212  @return (float) 'B*' score for the object for specific cluster.
213 
214  """
215 
216  score = self.__calculate_cluster_difference(index_cluster, difference)
217  return score / len(self.__clusters[index_cluster])
218 
219 
220  def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
221  """!
222  @brief Calculates 'B' score for the specific object for the nearest cluster.
223 
224  @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
225  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
226 
227  @return (float) 'B' score for the object.
228 
229  """
230 
231  optimal_score = float('inf')
232  for index_neighbor_cluster in range(len(self.__clusters)):
233  if index_cluster != index_neighbor_cluster:
234  candidate_score = self.__calculate_cluster_score(index_neighbor_cluster, difference)
235  if candidate_score < optimal_score:
236  optimal_score = candidate_score
237 
238  if optimal_score == float('inf'):
239  optimal_score = -1.0
240 
241  return optimal_score
242 
243 
244  def __calculate_cluster_difference(self, index_cluster, difference):
245  """!
246  @brief Calculates distance from each object in specified cluster to specified object.
247 
248  @param[in] index_point (uint): Index point for which difference is calculated.
249 
250  @return (list) Distance from specified object to each object from input data in specified cluster.
251 
252  """
253  cluster_difference = 0.0
254  for index_point in self.__clusters[index_cluster]:
255  cluster_difference += difference[index_point]
256 
257  return cluster_difference
258 
259 
260  def __calculate_dataset_difference(self, index_point):
261  """!
262  @brief Calculate distance from each object to specified object.
263 
264  @param[in] index_point (uint): Index point for which difference with other points is calculated.
265 
266  @return (list) Distance to each object from input data from the specified.
267 
268  """
269 
270  if self.__metric.get_type() != type_metric.USER_DEFINED:
271  dataset_differences = self.__metric(self.__data, self.__data[index_point])
272  else:
273  dataset_differences = [self.__metric(point, self.__data[index_point]) for point in self.__data]
274 
275  return dataset_differences
276 
277 
278  def __verify_arguments(self):
279  """!
280  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
281 
282  """
283  if len(self.__data) == 0:
284  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
285 
286  if len(self.__clusters) == 0:
287  raise ValueError("Input clusters are empty (size: '%d')." % len(self.__clusters))
288 
289 
290 
291 class silhouette_ksearch_type(IntEnum):
292  """!
293  @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
294 
295  @see silhouette_ksearch
296 
297  """
298 
299 
300  KMEANS = 0
301 
302 
303  KMEDIANS = 1
304 
305 
306  KMEDOIDS = 2
307 
308  def get_type(self):
309  """!
310  @brief Returns algorithm type that corresponds to specified enumeration value.
311 
312  @return (type) Algorithm type for cluster analysis.
313 
314  """
315  if self == silhouette_ksearch_type.KMEANS:
316  return kmeans
317  elif self == silhouette_ksearch_type.KMEDIANS:
318  return kmedians
319  elif self == silhouette_ksearch_type.KMEDOIDS:
320  return kmedoids
321  else:
322  return None
323 
324 
325 
327  """!
328  @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
329  K-Medians, K-Medoids) that is based on Silhouette method.
330 
331  @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
332  separated. Here is an example where clusters are well separated (sample 'Hepta'):
333  @code
334  from pyclustering.cluster import cluster_visualizer
335  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
336  from pyclustering.cluster.kmeans import kmeans
337  from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
338  from pyclustering.samples.definitions import FCPS_SAMPLES
339  from pyclustering.utils import read_sample
340 
341  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
342  search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
343 
344  amount = search_instance.get_amount()
345  scores = search_instance.get_scores()
346 
347  print("Scores: '%s'" % str(scores))
348 
349  initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
350  kmeans_instance = kmeans(sample, initial_centers).process()
351 
352  clusters = kmeans_instance.get_clusters()
353 
354  visualizer = cluster_visualizer()
355  visualizer.append_clusters(clusters, sample)
356  visualizer.show()
357  @endcode
358 
359  Obtained Silhouette scores for each K:
360  @code
361  Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
362  @endcode
363 
364  K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
365  @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
366 
367  @see silhouette_ksearch_type
368 
369  """
370 
371  def __init__(self, data, kmin, kmax, **kwargs):
372  """!
373  @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
374 
375  @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
376  @param[in] kmin (uint): Amount of clusters from which search is performed. Should be equal or greater than 2.
377  @param[in] kmax (uint): Amount of clusters to which search is performed. Should be equal or less than amount of
378  points in input data.
379  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'algorithm').
380 
381  <b>Keyword Args:</b><br>
382  - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
383  clusters (by default K-Means).
384  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
385 
386  """
387  self.__data = data
388  self.__kmin = kmin
389  self.__kmax = kmax
390 
391  self.__algorithm = kwargs.get('algorithm', silhouette_ksearch_type.KMEANS)
392  self.__return_index = self.__algorithm == silhouette_ksearch_type.KMEDOIDS
393 
394  self.__amount = -1
395  self.__score = -1.0
396  self.__scores = {}
397 
398  self.__verify_arguments()
399 
400  self.__ccore = kwargs.get('ccore', True)
401  if self.__ccore:
402  self.__ccore = ccore_library.workable()
403 
404 
405  def process(self):
406  """!
407  @brief Performs analysis to find optimal amount of clusters.
408 
409  @see get_amount, get_score, get_scores
410 
411  @return (silhouette_search) Itself instance (silhouette_search)
412 
413  """
414  if self.__ccore is True:
415  self.__process_by_ccore()
416  else:
417  self.__process_by_python()
418 
419  return self
420 
421 
422  def __process_by_ccore(self):
423  """!
424  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
425 
426  """
427  results = wrapper.silhoeutte_ksearch(self.__data, self.__kmin, self.__kmax, self.__algorithm)
428 
429  self.__amount = results[0]
430  self.__score = results[1]
431  self.__scores = results[2]
432 
433 
434  def __process_by_python(self):
435  """!
436  @brief Performs processing using python code.
437 
438  """
439  self.__scores = {}
440 
441  for k in range(self.__kmin, self.__kmax):
442  clusters = self.__calculate_clusters(k)
443  if len(clusters) != k:
444  self.__scores[k] = float('nan')
445  continue
446 
447  score = silhouette(self.__data, clusters).process().get_score()
448 
449  self.__scores[k] = sum(score) / len(score)
450 
451  if self.__scores[k] > self.__score:
452  self.__score = self.__scores[k]
453  self.__amount = k
454 
455 
456  def get_amount(self):
457  """!
458  @brief Returns optimal amount of clusters that has been found during analysis.
459 
460  @return (uint) Optimal amount of clusters.
461 
462  @see process
463 
464  """
465  return self.__amount
466 
467 
468  def get_score(self):
469  """!
470  @brief Returns silhouette score that belongs to optimal amount of clusters (k).
471 
472  @return (float) Score that belong to optimal amount of clusters.
473 
474  @see process, get_scores
475 
476  """
477  return self.__score
478 
479 
480  def get_scores(self):
481  """!
482  @brief Returns silhouette score for each K value (amount of clusters).
483 
484  @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
485 
486  @see process, get_score
487 
488  """
489  return self.__scores
490 
491 
492  def __calculate_clusters(self, k):
493  """!
494  @brief Performs cluster analysis using specified K value.
495 
496  @param[in] k (uint): Amount of clusters that should be allocated.
497 
498  @return (array_like) Allocated clusters.
499 
500  """
501  initial_values = kmeans_plusplus_initializer(self.__data, k).initialize(return_index=self.__return_index)
502  algorithm_type = self.__algorithm.get_type()
503  return algorithm_type(self.__data, initial_values).process().get_clusters()
504 
505 
506  def __verify_arguments(self):
507  """!
508  @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
509 
510  """
511  if self.__kmax > len(self.__data):
512  raise ValueError("K max value '" + str(self.__kmax) + "' is bigger than amount of objects '" +
513  str(len(self.__data)) + "' in input data.")
514 
515  if self.__kmin <= 1:
516  raise ValueError("K min value '" + str(self.__kmin) + "' should be greater than 1 (impossible to provide "
517  "silhouette score for only one cluster).")
Cluster analysis algorithm: K-Medians.
Definition: kmedians.py:1
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference)
Calculates &#39;B&#39; score for the specific object for the nearest cluster.
Definition: silhouette.py:220
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:138
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def process(self)
Calculates Silhouette score for each object from input data.
Definition: silhouette.py:123
def __calculate_score(self, index_point, index_cluster)
Calculates Silhouette score for the specific object defined by index_point.
Definition: silhouette.py:167
def process(self)
Performs analysis to find optimal amount of clusters.
Definition: silhouette.py:405
def get_score(self)
Returns Silhouette score for each object from input data.
Definition: silhouette.py:157
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __calculate_cluster_difference(self, index_cluster, difference)
Calculates distance from each object in specified cluster to specified object.
Definition: silhouette.py:244
def __init__(self, data, clusters, kwargs)
Initializes Silhouette method for analysis.
Definition: silhouette.py:84
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:434
def __init__(self, data, kmin, kmax, kwargs)
Initialize Silhouette search algorithm to find out optimal amount of clusters.
Definition: silhouette.py:371
def __calculate_within_cluster_score(self, index_cluster, difference)
Calculates &#39;A&#39; score for the specific object in cluster to which it belongs to.
Definition: silhouette.py:188
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:422
def get_type(self)
Returns algorithm type that corresponds to specified enumeration value.
Definition: silhouette.py:308
Defines algorithms that can be used to find optimal number of cluster using Silhouette method...
Definition: silhouette.py:291
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def __calculate_clusters(self, k)
Performs cluster analysis using specified K value.
Definition: silhouette.py:492
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:147
def get_scores(self)
Returns silhouette score for each K value (amount of clusters).
Definition: silhouette.py:480
def __verify_arguments(self)
Checks algorithm&#39;s arguments and if some of them is incorrect then exception is thrown.
Definition: silhouette.py:506
Represents Silhouette method that is used interpretation and validation of consistency.
Definition: silhouette.py:45
def get_score(self)
Returns silhouette score that belongs to optimal amount of clusters (k).
Definition: silhouette.py:468
Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means...
Definition: silhouette.py:326
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: silhouette.py:278
Cluster analysis algorithm: K-Medoids.
Definition: kmedoids.py:1
def __calculate_dataset_difference(self, index_point)
Calculate distance from each object to specified object.
Definition: silhouette.py:260
def __calculate_cluster_score(self, index_cluster, difference)
Calculates &#39;B*&#39; score for the specific object for specific cluster.
Definition: silhouette.py:205
def get_amount(self)
Returns optimal amount of clusters that has been found during analysis.
Definition: silhouette.py:456