silhouette.py
1 """!
2 
3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum
29 
30 import numpy
31 
32 from pyclustering.cluster.kmeans import kmeans
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster.kmedoids import kmedoids
35 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
36 
37 from pyclustering.utils.metric import distance_metric, type_metric
38 
39 from pyclustering.core.wrapper import ccore_library
40 from pyclustering.core.metric_wrapper import metric_wrapper
41 
42 import pyclustering.core.silhouette_wrapper as wrapper
43 
44 
45 class silhouette:
46  """!
47  @brief Represents Silhouette method that is used interpretation and validation of consistency.
48  @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
49  Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
50  K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
51  calculated using following formula:
52  \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
53  where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
54  \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
55 
56  Here is an example where Silhouette score is calculated for K-Means's clustering result:
57  @code
58  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
59  from pyclustering.cluster.kmeans import kmeans
60  from pyclustering.cluster.silhouette import silhouette
61 
62  from pyclustering.samples.definitions import SIMPLE_SAMPLES
63  from pyclustering.utils import read_sample
64 
65  # Read data 'SampleSimple3' from Simple Sample collection.
66  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
67 
68  # Prepare initial centers
69  centers = kmeans_plusplus_initializer(sample, 4).initialize();
70 
71  # Perform cluster analysis
72  kmeans_instance = kmeans(sample, centers);
73  kmeans_instance.process();
74  clusters = kmeans_instance.get_clusters();
75 
76  # Calculate Silhouette score
77  score = silhouette(sample, clusters).process().get_score()
78  @endcode
79 
80  @see kmeans, kmedoids, kmedians, xmeans, elbow
81 
82  """
83 
84  def __init__(self, data, clusters, **kwargs):
85  """!
86  @brief Initializes Silhouette method for analysis.
87 
88  @param[in] data (array_like): Input data that was used for cluster analysis.
89  @param[in] clusters (list): Cluster that have been obtained after cluster analysis.
90  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
91 
92  <b>Keyword Args:</b><br>
93  - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
94  score calculation (by default Square Euclidean distance).
95  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
96 
97  """
98  self.__data = data
99  self.__clusters = clusters
100  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
101 
102  if self.__metric.get_type() != type_metric.USER_DEFINED:
103  self.__metric.enable_numpy_usage()
104  else:
105  self.__metric.disable_numpy_usage()
106 
107  self.__score = [0.0] * len(data)
108 
109  self.__ccore = kwargs.get('ccore', True) and self.__metric.get_type() != type_metric.USER_DEFINED
110  if self.__ccore:
111  self.__ccore = ccore_library.workable()
112 
113  if self.__ccore is False:
114  self.__data = numpy.array(data)
115 
116 
117  def process(self):
118  """!
119  @brief Calculates Silhouette score for each object from input data.
120 
121  @return (silhouette) Instance of the method (self).
122 
123  """
124  if self.__ccore is True:
125  self.__process_by_ccore()
126  else:
127  self.__process_by_python()
128 
129  return self
130 
131 
132  def __process_by_ccore(self):
133  """!
134  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
135 
136  """
137  ccore_metric = metric_wrapper.create_instance(self.__metric)
138  self.__score = wrapper.silhoeutte(self.__data, self.__clusters, ccore_metric.get_pointer())
139 
140 
141  def __process_by_python(self):
142  """!
143  @brief Performs processing using python code.
144 
145  """
146  for index_cluster in range(len(self.__clusters)):
147  for index_point in self.__clusters[index_cluster]:
148  self.__score[index_point] = self.__calculate_score(index_point, index_cluster)
149 
150 
151  def get_score(self):
152  """!
153  @brief Returns Silhouette score for each object from input data.
154 
155  @see process
156 
157  """
158  return self.__score
159 
160 
161  def __calculate_score(self, index_point, index_cluster):
162  """!
163  @brief Calculates Silhouette score for the specific object defined by index_point.
164 
165  @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
166  @param[in] index_cluster (uint): Index cluster to which the point belongs to.
167 
168  @return (float) Silhouette score for the object.
169 
170  """
171  difference = self.__calculate_dataset_difference(index_point)
172 
173  a_score = self.__calculate_within_cluster_score(index_cluster, difference)
174  b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
175 
176  return (b_score - a_score) / max(a_score, b_score)
177 
178 
179  def __calculate_within_cluster_score(self, index_cluster, difference):
180  """!
181  @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
182 
183  @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
184  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
185 
186  @return (float) 'A' score for the object.
187 
188  """
189 
190  score = self.__calculate_cluster_difference(index_cluster, difference)
191  if len(self.__clusters[index_cluster]) == 1:
192  return float('nan')
193  return score / (len(self.__clusters[index_cluster]) - 1)
194 
195 
196  def __calculate_cluster_score(self, index_cluster, difference):
197  """!
198  @brief Calculates 'B*' score for the specific object for specific cluster.
199 
200  @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
201  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
202 
203  @return (float) 'B*' score for the object for specific cluster.
204 
205  """
206 
207  score = self.__calculate_cluster_difference(index_cluster, difference)
208  return score / len(self.__clusters[index_cluster])
209 
210 
211  def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
212  """!
213  @brief Calculates 'B' score for the specific object for the nearest cluster.
214 
215  @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
216  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
217 
218  @return (float) 'B' score for the object.
219 
220  """
221 
222  optimal_score = float('inf')
223  for index_neighbor_cluster in range(len(self.__clusters)):
224  if index_cluster != index_neighbor_cluster:
225  candidate_score = self.__calculate_cluster_score(index_neighbor_cluster, difference)
226  if candidate_score < optimal_score:
227  optimal_score = candidate_score
228 
229  if optimal_score == float('inf'):
230  optimal_score = -1.0
231 
232  return optimal_score
233 
234 
235  def __calculate_cluster_difference(self, index_cluster, difference):
236  """!
237  @brief Calculates distance from each object in specified cluster to specified object.
238 
239  @param[in] index_point (uint): Index point for which difference is calculated.
240 
241  @return (list) Distance from specified object to each object from input data in specified cluster.
242 
243  """
244  cluster_difference = 0.0
245  for index_point in self.__clusters[index_cluster]:
246  cluster_difference += difference[index_point]
247 
248  return cluster_difference
249 
250 
251  def __calculate_dataset_difference(self, index_point):
252  """!
253  @brief Calculate distance from each object to specified object.
254 
255  @param[in] index_point (uint): Index point for which difference with other points is calculated.
256 
257  @return (list) Distance to each object from input data from the specified.
258 
259  """
260 
261  if self.__metric.get_type() != type_metric.USER_DEFINED:
262  dataset_differences = self.__metric(self.__data, self.__data[index_point])
263  else:
264  dataset_differences = [self.__metric(point, self.__data[index_point]) for point in self.__data]
265 
266  return dataset_differences
267 
268 
269 
270 class silhouette_ksearch_type(IntEnum):
271  """!
272  @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
273 
274  @see silhouette_ksearch
275 
276  """
277 
278 
279  KMEANS = 0
280 
281 
282  KMEDIANS = 1
283 
284 
285  KMEDOIDS = 2
286 
287  def get_type(self):
288  """!
289  @brief Returns algorithm type that corresponds to specified enumeration value.
290 
291  @return (type) Algorithm type for cluster analysis.
292 
293  """
294  if self == silhouette_ksearch_type.KMEANS:
295  return kmeans
296  elif self == silhouette_ksearch_type.KMEDIANS:
297  return kmedians
298  elif self == silhouette_ksearch_type.KMEDOIDS:
299  return kmedoids
300  else:
301  return None
302 
303 
304 
306  """!
307  @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
308  K-Medians, K-Medoids) that is based on Silhouette method.
309 
310  @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
311  separated. Here is an example where clusters are well separated (sample 'Hepta'):
312  @code
313  from pyclustering.cluster import cluster_visualizer
314  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
315  from pyclustering.cluster.kmeans import kmeans
316  from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
317  from pyclustering.samples.definitions import FCPS_SAMPLES
318  from pyclustering.utils import read_sample
319 
320  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
321  search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
322 
323  amount = search_instance.get_amount()
324  scores = search_instance.get_scores()
325 
326  print("Scores: '%s'" % str(scores))
327 
328  initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
329  kmeans_instance = kmeans(sample, initial_centers).process()
330 
331  clusters = kmeans_instance.get_clusters()
332 
333  visualizer = cluster_visualizer()
334  visualizer.append_clusters(clusters, sample)
335  visualizer.show()
336  @endcode
337 
338  Obtained Silhouette scores for each K:
339  @code
340  Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
341  @endcode
342 
343  K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
344  @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
345 
346  @see silhouette_ksearch_type
347 
348  """
349 
350  def __init__(self, data, kmin, kmax, **kwargs):
351  """!
352  @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
353 
354  @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
355  @param[in] kmin (uint): Amount of clusters from which search is performed. Should be equal or greater than 2.
356  @param[in] kmax (uint): Amount of clusters to which search is performed. Should be equal or less than amount of
357  points in input data.
358  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'algorithm').
359 
360  <b>Keyword Args:</b><br>
361  - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
362  clusters (by default K-Means).
363  - ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
364 
365  """
366  self.__data = data
367  self.__kmin = kmin
368  self.__kmax = kmax
369 
370  self.__algorithm = kwargs.get('algorithm', silhouette_ksearch_type.KMEANS)
371  self.__return_index = self.__algorithm == silhouette_ksearch_type.KMEDOIDS
372 
373  self.__amount = -1
374  self.__score = -1.0
375  self.__scores = {}
376 
377  self.__verify_arguments()
378 
379  self.__ccore = kwargs.get('ccore', True)
380  if self.__ccore:
381  self.__ccore = ccore_library.workable()
382 
383 
384  def process(self):
385  """!
386  @brief Performs analysis to find optimal amount of clusters.
387 
388  @see get_amount, get_score, get_scores
389 
390  @return (silhouette_search) Itself instance (silhouette_search)
391 
392  """
393  if self.__ccore is True:
394  self.__process_by_ccore()
395  else:
396  self.__process_by_python()
397 
398  return self
399 
400 
401  def __process_by_ccore(self):
402  """!
403  @brief Performs processing using CCORE (C/C++ part of pyclustering library).
404 
405  """
406  results = wrapper.silhoeutte_ksearch(self.__data, self.__kmin, self.__kmax, self.__algorithm)
407 
408  self.__amount = results[0]
409  self.__score = results[1]
410  self.__scores = results[2]
411 
412 
413  def __process_by_python(self):
414  """!
415  @brief Performs processing using python code.
416 
417  """
418  self.__scores = {}
419 
420  for k in range(self.__kmin, self.__kmax):
421  clusters = self.__calculate_clusters(k)
422  if len(clusters) != k:
423  self.__scores[k] = float('nan')
424  continue
425 
426  score = silhouette(self.__data, clusters).process().get_score()
427 
428  self.__scores[k] = sum(score) / len(score)
429 
430  if self.__scores[k] > self.__score:
431  self.__score = self.__scores[k]
432  self.__amount = k
433 
434 
435  def get_amount(self):
436  """!
437  @brief Returns optimal amount of clusters that has been found during analysis.
438 
439  @return (uint) Optimal amount of clusters.
440 
441  @see process
442 
443  """
444  return self.__amount
445 
446 
447  def get_score(self):
448  """!
449  @brief Returns silhouette score that belongs to optimal amount of clusters (k).
450 
451  @return (float) Score that belong to optimal amount of clusters.
452 
453  @see process, get_scores
454 
455  """
456  return self.__score
457 
458 
459  def get_scores(self):
460  """!
461  @brief Returns silhouette score for each K value (amount of clusters).
462 
463  @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
464 
465  @see process, get_score
466 
467  """
468  return self.__scores
469 
470 
471  def __calculate_clusters(self, k):
472  """!
473  @brief Performs cluster analysis using specified K value.
474 
475  @param[in] k (uint): Amount of clusters that should be allocated.
476 
477  @return (array_like) Allocated clusters.
478 
479  """
480  initial_values = kmeans_plusplus_initializer(self.__data, k).initialize(return_index=self.__return_index)
481  algorithm_type = self.__algorithm.get_type()
482  return algorithm_type(self.__data, initial_values).process().get_clusters()
483 
484 
485  def __verify_arguments(self):
486  """!
487  @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
488 
489  """
490  if self.__kmax > len(self.__data):
491  raise ValueError("K max value '" + str(self.__kmax) + "' is bigger than amount of objects '" +
492  str(len(self.__data)) + "' in input data.")
493 
494  if self.__kmin <= 1:
495  raise ValueError("K min value '" + str(self.__kmin) + "' should be greater than 1 (impossible to provide "
496  "silhouette score for only one cluster).")
Cluster analysis algorithm: K-Medians.
Definition: kmedians.py:1
Cluster analysis algorithm: K-Means.
Definition: kmeans.py:1
def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference)
Calculates &#39;B&#39; score for the specific object for the nearest cluster.
Definition: silhouette.py:211
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:132
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def process(self)
Calculates Silhouette score for each object from input data.
Definition: silhouette.py:117
def __calculate_score(self, index_point, index_cluster)
Calculates Silhouette score for the specific object defined by index_point.
Definition: silhouette.py:161
def process(self)
Performs analysis to find optimal amount of clusters.
Definition: silhouette.py:384
def get_score(self)
Returns Silhouette score for each object from input data.
Definition: silhouette.py:151
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __calculate_cluster_difference(self, index_cluster, difference)
Calculates distance from each object in specified cluster to specified object.
Definition: silhouette.py:235
def __init__(self, data, clusters, kwargs)
Initializes Silhouette method for analysis.
Definition: silhouette.py:84
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:413
def __init__(self, data, kmin, kmax, kwargs)
Initialize Silhouette search algorithm to find out optimal amount of clusters.
Definition: silhouette.py:350
def __calculate_within_cluster_score(self, index_cluster, difference)
Calculates &#39;A&#39; score for the specific object in cluster to which it belongs to.
Definition: silhouette.py:179
def __process_by_ccore(self)
Performs processing using CCORE (C/C++ part of pyclustering library).
Definition: silhouette.py:401
def get_type(self)
Returns algorithm type that corresponds to specified enumeration value.
Definition: silhouette.py:287
Defines algorithms that can be used to find optimal number of cluster using Silhouette method...
Definition: silhouette.py:270
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def __calculate_clusters(self, k)
Performs cluster analysis using specified K value.
Definition: silhouette.py:471
def __process_by_python(self)
Performs processing using python code.
Definition: silhouette.py:141
def get_scores(self)
Returns silhouette score for each K value (amount of clusters).
Definition: silhouette.py:459
def __verify_arguments(self)
Checks algorithm&#39;s arguments and if some of them is incorrect then exception is thrown.
Definition: silhouette.py:485
Represents Silhouette method that is used interpretation and validation of consistency.
Definition: silhouette.py:45
def get_score(self)
Returns silhouette score that belongs to optimal amount of clusters (k).
Definition: silhouette.py:447
Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means...
Definition: silhouette.py:305
Cluster analysis algorithm: K-Medoids.
Definition: kmedoids.py:1
def __calculate_dataset_difference(self, index_point)
Calculate distance from each object to specified object.
Definition: silhouette.py:251
def __calculate_cluster_score(self, index_cluster, difference)
Calculates &#39;B*&#39; score for the specific object for specific cluster.
Definition: silhouette.py:196
def get_amount(self)
Returns optimal amount of clusters that has been found during analysis.
Definition: silhouette.py:435