silhouette.py
1 """!
2 
3 @brief Silhouette - method of interpretation and validation of consistency.
4 @details Implementation based on paper @cite article::cluster::silhouette::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import Enum
29 
30 import numpy
31 
32 from pyclustering.cluster.kmeans import kmeans
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster.kmedoids import kmedoids
35 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
36 
37 from pyclustering.utils.metric import distance_metric, type_metric
38 
39 
40 class silhouette:
41  """!
42  @brief Represents Silhouette method that is used interpretation and validation of consistency.
43  @details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
44  Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
45  K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
46  calculated using following formula:
47  \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
48  where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
49  \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
50 
51  Here is an example where Silhouette score is calculated for K-Means's clustering result:
52  @code
53  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
54  from pyclustering.cluster.kmeans import kmeans
55  from pyclustering.cluster.silhouette import silhouette
56 
57  from pyclustering.samples.definitions import SIMPLE_SAMPLES
58  from pyclustering.utils import read_sample
59 
60  # Read data 'SampleSimple3' from Simple Sample collection.
61  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
62 
63  # Prepare initial centers
64  centers = kmeans_plusplus_initializer(sample, 4).initialize();
65 
66  # Perform cluster analysis
67  kmeans_instance = kmeans(sample, centers);
68  kmeans_instance.process();
69  clusters = kmeans_instance.get_clusters();
70 
71  # Calculate Silhouette score
72  score = silhouette(sample, clusters).process().get_score()
73  @endcode
74 
75  @see kmeans, kmedoids, kmedians, xmeans, elbow
76 
77  """
78 
79  def __init__(self, data, clusters, **kwargs):
80  """!
81  @brief Initializes Silhouette method for analysis.
82 
83  @param[in] data (array_like): Input data that was used for cluster analysis.
84  @param[in] clusters (list): Cluster that have been obtained after cluster analysis.
85  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
86 
87  <b>Keyword Args:</b><br>
88  - metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
89  score calculation (by default Square Euclidean distance).
90 
91  """
92  self.__data = numpy.array(data)
93  self.__clusters = clusters
94  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
95 
96  if self.__metric.get_type() != type_metric.USER_DEFINED:
97  self.__metric.enable_numpy_usage()
98  else:
99  self.__metric.disable_numpy_usage()
100 
101  self.__score = [0.0] * len(data)
102 
103 
104  def process(self):
105  """!
106  @brief Calculates Silhouette score for each object from input data.
107 
108  @return (silhouette) Instance of the method (self).
109 
110  """
111  for index_cluster in range(len(self.__clusters)):
112  for index_point in self.__clusters[index_cluster]:
113  self.__score[index_point] = self.__calculate_score(index_point, index_cluster)
114 
115  return self
116 
117 
118  def get_score(self):
119  """!
120  @brief Returns Silhouette score for each object from input data.
121 
122  @see process
123 
124  """
125  return self.__score
126 
127 
128  def __calculate_score(self, index_point, index_cluster):
129  """!
130  @brief Calculates Silhouette score for the specific object defined by index_point.
131 
132  @param[in] index_point (uint): Index point from input data for which Silhouette score should be calculated.
133  @param[in] index_cluster (uint): Index cluster to which the point belongs to.
134 
135  @return (float) Silhouette score for the object.
136 
137  """
138  difference = self.__calculate_dataset_difference(index_point)
139 
140  a_score = self.__calculate_within_cluster_score(index_cluster, difference)
141  b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
142 
143  return (b_score - a_score) / max(a_score, b_score)
144 
145 
146  def __calculate_within_cluster_score(self, index_cluster, difference):
147  """!
148  @brief Calculates 'A' score for the specific object in cluster to which it belongs to.
149 
150  @param[in] index_point (uint): Index point from input data for which 'A' score should be calculated.
151  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
152 
153  @return (float) 'A' score for the object.
154 
155  """
156 
157  score = self.__calculate_cluster_difference(index_cluster, difference)
158  if len(self.__clusters[index_cluster]) == 1:
159  return float('nan')
160  return score / (len(self.__clusters[index_cluster]) - 1)
161 
162 
163  def __calculate_cluster_score(self, index_cluster, difference):
164  """!
165  @brief Calculates 'B*' score for the specific object for specific cluster.
166 
167  @param[in] index_point (uint): Index point from input data for which 'B*' score should be calculated.
168  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
169 
170  @return (float) 'B*' score for the object for specific cluster.
171 
172  """
173 
174  score = self.__calculate_cluster_difference(index_cluster, difference)
175  return score / len(self.__clusters[index_cluster])
176 
177 
178  def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference):
179  """!
180  @brief Calculates 'B' score for the specific object for the nearest cluster.
181 
182  @param[in] index_point (uint): Index point from input data for which 'B' score should be calculated.
183  @param[in] index_cluster (uint): Index cluster to which the point is belong to.
184 
185  @return (float) 'B' score for the object.
186 
187  """
188 
189  optimal_score = float('inf')
190  for index_neighbor_cluster in range(len(self.__clusters)):
191  if index_cluster != index_neighbor_cluster:
192  candidate_score = self.__calculate_cluster_score(index_neighbor_cluster, difference)
193  if candidate_score < optimal_score:
194  optimal_score = candidate_score
195 
196  return optimal_score
197 
198 
199  def __calculate_cluster_difference(self, index_cluster, difference):
200  """!
201  @brief Calculates distance from each object in specified cluster to specified object.
202 
203  @param[in] index_point (uint): Index point for which difference is calculated.
204 
205  @return (list) Distance from specified object to each object from input data in specified cluster.
206 
207  """
208  cluster_difference = 0.0
209  for index_point in self.__clusters[index_cluster]:
210  cluster_difference += difference[index_point]
211 
212  return cluster_difference
213 
214 
215  def __calculate_dataset_difference(self, index_point):
216  """!
217  @brief Calculate distance from each object to specified object.
218 
219  @param[in] index_point (uint): Index point for which difference with other points is calculated.
220 
221  @return (list) Distance to each object from input data from the specified.
222 
223  """
224 
225  if self.__metric.get_type() != type_metric.USER_DEFINED:
226  dataset_differences = self.__metric(self.__data, self.__data[index_point])
227  else:
228  dataset_differences = [self.__metric(point, self.__data[index_point]) for point in self.__data]
229 
230  return dataset_differences
231 
232 
233 
235  """!
236  @brief Defines algorithms that can be used to find optimal number of cluster using Silhouette method.
237 
238  @see silhouette_ksearch
239 
240  """
241 
242 
243  KMEANS = 0
244 
245 
246  KMEDIANS = 1
247 
248 
249  KMEDOIDS = 2
250 
251  def get_type(self):
252  """!
253  @brief Returns algorithm type that corresponds to specified enumeration value.
254 
255  @return (type) Algorithm type for cluster analysis.
256 
257  """
258  if self == silhouette_ksearch_type.KMEANS:
259  return kmeans
260  elif self == silhouette_ksearch_type.KMEDIANS:
261  return kmedians
262  elif self == silhouette_ksearch_type.KMEDOIDS:
263  return kmedoids
264  else:
265  return None
266 
267 
268 
270  """!
271  @brief Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means,
272  K-Medians, K-Medoids) that is based on Silhouette method.
273 
274  @details This algorithm uses average value of scores for estimation and applicable for clusters that are well
275  separated. Here is an example where clusters are well separated (sample 'Hepta'):
276  @code
277  from pyclustering.cluster import cluster_visualizer
278  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
279  from pyclustering.cluster.kmeans import kmeans
280  from pyclustering.cluster.silhouette import silhouette_ksearch_type, silhouette_ksearch
281  from pyclustering.samples.definitions import FCPS_SAMPLES
282  from pyclustering.utils import read_sample
283 
284  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
285  search_instance = silhouette_ksearch(sample, 2, 10, algorithm=silhouette_ksearch_type.KMEANS).process()
286 
287  amount = search_instance.get_amount()
288  scores = search_instance.get_scores()
289 
290  print("Scores: '%s'" % str(scores))
291 
292  initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
293  kmeans_instance = kmeans(sample, initial_centers).process()
294 
295  clusters = kmeans_instance.get_clusters()
296 
297  visualizer = cluster_visualizer()
298  visualizer.append_clusters(clusters, sample)
299  visualizer.show()
300  @endcode
301 
302  Obtained Silhouette scores for each K:
303  @code
304  Scores: '{2: 0.418434, 3: 0.450906, 4: 0.534709, 5: 0.689970, 6: 0.588460, 7: 0.882674, 8: 0.804725, 9: 0.780189}'
305  @endcode
306 
307  K = 7 has the bigger average Silhouette score and it means that it is optimal amount of clusters:
308  @image html silhouette_ksearch_hepta.png "Silhouette ksearch's analysis with further K-Means clustering (sample 'Hepta')."
309 
310  @see silhouette_ksearch_type
311 
312  """
313 
314  def __init__(self, data, kmin, kmax, **kwargs):
315  """!
316  @brief Initialize Silhouette search algorithm to find out optimal amount of clusters.
317 
318  @param[in] data (array_like): Input data that is used for searching optimal amount of clusters.
319  @param[in] kmin (uint): Amount of clusters from which search is performed. Should be equal or greater than 2.
320  @param[in] kmax (uint): Amount of clusters to which search is performed. Should be equal or less than amount of
321  points in input data.
322  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'algorithm').
323 
324  <b>Keyword Args:</b><br>
325  - algorithm (silhouette_ksearch_type): Defines algorithm that is used for searching optimal number of
326  clusters (by default K-Means).
327 
328  """
329  self.__data = data
330  self.__kmin = kmin
331  self.__kmax = kmax
332 
333  self.__algorithm = kwargs.get('algorithm', silhouette_ksearch_type.KMEANS)
334  self.__return_index = self.__algorithm == silhouette_ksearch_type.KMEDOIDS
335 
336  self.__amount = -1
337  self.__score = float('-Inf')
338  self.__scores = {}
339 
340  self.__verify_arguments()
341 
342 
343  def process(self):
344  """!
345  @brief Performs analysis to find optimal amount of clusters.
346 
347  @see get_amount, get_score, get_scores
348 
349  @return (silhouette_search) Itself instance (silhouette_search)
350 
351  """
352  self.__scores = {}
353 
354  for k in range(self.__kmin, self.__kmax):
355  clusters = self.__calculate_clusters(k)
356  if len(clusters) != k:
357  self.__scores[k] = float('nan')
358  continue
359 
360  score = silhouette(self.__data, clusters).process().get_score()
361 
362  self.__scores[k] = sum(score) / len(score)
363 
364  if self.__scores[k] > self.__score:
365  self.__score = self.__scores[k]
366  self.__amount = k
367 
368  return self
369 
370 
371  def get_amount(self):
372  """!
373  @brief Returns optimal amount of clusters that has been found during analysis.
374 
375  @return (uint) Optimal amount of clusters.
376 
377  @see process
378 
379  """
380  return self.__amount
381 
382 
383  def get_score(self):
384  """!
385  @brief Returns silhouette score that belongs to optimal amount of clusters (k).
386 
387  @return (float) Score that belong to optimal amount of clusters.
388 
389  @see process, get_scores
390 
391  """
392  return self.__score
393 
394 
395  def get_scores(self):
396  """!
397  @brief Returns silhouette score for each K value (amount of clusters).
398 
399  @return (dict) Silhouette score for each K value, where key is a K value and value is a silhouette score.
400 
401  @see process, get_score
402 
403  """
404  return self.__scores
405 
406 
407  def __calculate_clusters(self, k):
408  """!
409  @brief Performs cluster analysis using specified K value.
410 
411  @param[in] k (uint): Amount of clusters that should be allocated.
412 
413  @return (array_like) Allocated clusters.
414 
415  """
416  initial_values = kmeans_plusplus_initializer(self.__data, k).initialize(return_index=self.__return_index)
417  algorithm_type = self.__algorithm.get_type()
418  return algorithm_type(self.__data, initial_values).process().get_clusters()
419 
420 
421  def __verify_arguments(self):
422  """!
423  @brief Checks algorithm's arguments and if some of them is incorrect then exception is thrown.
424 
425  """
426  if self.__kmax > len(self.__data):
427  raise ValueError("K max value '" + str(self.__kmax) + "' is bigger than amount of objects '" +
428  str(len(self.__data)) + "' in input data.")
429 
430  if self.__kmin <= 1:
431  raise ValueError("K min value '" + str(self.__kmin) + "' should be greater than 1 (impossible to provide "
432  "silhiuette score for only one cluster).")
Cluster analysis algorithm: K-Medians.
Definition: kmedians.py:1
Cluster analysis algorithm: K-Means.
Definition: kmeans.py:1
def __caclulate_optimal_neighbor_cluster_score(self, index_cluster, difference)
Calculates &#39;B&#39; score for the specific object for the nearest cluster.
Definition: silhouette.py:178
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def process(self)
Calculates Silhouette score for each object from input data.
Definition: silhouette.py:104
def __calculate_score(self, index_point, index_cluster)
Calculates Silhouette score for the specific object defined by index_point.
Definition: silhouette.py:128
def process(self)
Performs analysis to find optimal amount of clusters.
Definition: silhouette.py:343
def get_score(self)
Returns Silhouette score for each object from input data.
Definition: silhouette.py:118
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:58
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __calculate_cluster_difference(self, index_cluster, difference)
Calculates distance from each object in specified cluster to specified object.
Definition: silhouette.py:199
def __init__(self, data, clusters, kwargs)
Initializes Silhouette method for analysis.
Definition: silhouette.py:79
def __init__(self, data, kmin, kmax, kwargs)
Initialize Silhouette search algorithm to find out optimal amount of clusters.
Definition: silhouette.py:314
def __calculate_within_cluster_score(self, index_cluster, difference)
Calculates &#39;A&#39; score for the specific object in cluster to which it belongs to.
Definition: silhouette.py:146
def get_type(self)
Returns algorithm type that corresponds to specified enumeration value.
Definition: silhouette.py:251
Defines algorithms that can be used to find optimal number of cluster using Silhouette method...
Definition: silhouette.py:234
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def __calculate_clusters(self, k)
Performs cluster analysis using specified K value.
Definition: silhouette.py:407
def get_scores(self)
Returns silhouette score for each K value (amount of clusters).
Definition: silhouette.py:395
def __verify_arguments(self)
Checks algorithm&#39;s arguments and if some of them is incorrect then exception is thrown.
Definition: silhouette.py:421
Represents Silhouette method that is used interpretation and validation of consistency.
Definition: silhouette.py:40
def get_score(self)
Returns silhouette score that belongs to optimal amount of clusters (k).
Definition: silhouette.py:383
Represent algorithm for searching optimal number of clusters using specified K-algorithm (K-Means...
Definition: silhouette.py:269
Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
Definition: kmedoids.py:1
def __calculate_dataset_difference(self, index_point)
Calculate distance from each object to specified object.
Definition: silhouette.py:215
def __calculate_cluster_score(self, index_cluster, difference)
Calculates &#39;B*&#39; score for the specific object for specific cluster.
Definition: silhouette.py:163
def get_amount(self)
Returns optimal amount of clusters that has been found during analysis.
Definition: silhouette.py:371