xmeans.py
1 """!
2 
3 @brief Cluster analysis algorithm: X-Means
4 @details Implementation based on paper @cite article::xmeans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import random
30 
31 from enum import IntEnum
32 
33 from math import log
34 
35 from pyclustering.cluster.encoder import type_encoding
36 from pyclustering.cluster.kmeans import kmeans
37 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
38 
39 from pyclustering.core.wrapper import ccore_library
40 
41 import pyclustering.core.xmeans_wrapper as wrapper
42 
43 from pyclustering.utils import euclidean_distance_square, euclidean_distance, distance_metric, type_metric
44 
45 
46 class splitting_type(IntEnum):
47  """!
48  @brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.
49 
50  """
51 
52 
64  BAYESIAN_INFORMATION_CRITERION = 0
65 
66 
75  MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1
76 
77 
78 class xmeans:
79  """!
80  @brief Class represents clustering algorithm X-Means.
81  @details X-means clustering method starts with the assumption of having a minimum number of clusters,
82  and then dynamically increases them. X-means uses specified splitting criterion to control
83  the process of splitting clusters. Method K-Means++ can be used for calculation of initial centers.
84 
85  CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
86 
87  Here example how to perform cluster analysis using X-Means algorithm:
88  @code
89  from pyclustering.cluster import cluster_visualizer
90  from pyclustering.cluster.xmeans import xmeans
91  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
92  from pyclustering.utils import read_sample
93  from pyclustering.samples.definitions import SIMPLE_SAMPLES
94 
95  # Read sample 'simple3' from file.
96  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
97 
98  # Prepare initial centers - amount of initial centers defines amount of clusters from which X-Means will
99  # start analysis.
100  amount_initial_centers = 2
101  initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()
102 
103  # Create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters, the maximum
104  # number of clusters that can be allocated is 20.
105  xmeans_instance = xmeans(sample, initial_centers, 20)
106  xmeans_instance.process()
107 
108  # Extract clustering results: clusters and their centers
109  clusters = xmeans_instance.get_clusters()
110  centers = xmeans_instance.get_centers()
111 
112  # Print total sum of metric errors
113  print("Total WCE:", xmeans_instance.get_total_wce())
114 
115  # Visualize clustering results
116  visualizer = cluster_visualizer()
117  visualizer.append_clusters(clusters, sample)
118  visualizer.append_cluster(centers, None, marker='*', markersize=10)
119  visualizer.show()
120  @endcode
121 
122  Visualization of clustering results that were obtained using code above and where X-Means algorithm allocates four clusters.
123  @image html xmeans_clustering_simple3.png "Fig. 1. X-Means clustering results (data 'Simple3')."
124 
125  @see center_initializer
126 
127  """
128 
129  def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True, **kwargs):
130  """!
131  @brief Constructor of clustering algorithm X-Means.
132 
133  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
134  @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...],
135  if it is not specified then X-Means starts from the random center.
136  @param[in] kmax (uint): Maximum number of clusters that can be allocated.
137  @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
138  @param[in] criterion (splitting_type): Type of splitting creation.
139  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
140  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'repeat').
141 
142  <b>Keyword Args:</b><br>
143  - repeat (unit): How many times K-Means should be run to improve parameters (by default is 1).
144  With larger 'repeat' values suggesting higher probability of finding global optimum.
145 
146  """
147 
148  self.__pointer_data = data
149  self.__clusters = []
150 
151  if initial_centers is not None:
152  self.__centers = initial_centers[:]
153  else:
154  self.__centers = kmeans_plusplus_initializer(data, 2).initialize()
155 
156  self.__kmax = kmax
157  self.__tolerance = tolerance
158  self.__criterion = criterion
159  self.__total_wce = 0.0
160  self.__repeat = kwargs.get('repeat', 1)
161 
162  self.__ccore = ccore
163  if self.__ccore:
164  self.__ccore = ccore_library.workable()
165 
166  self.__verify_arguments()
167 
168 
169  def process(self):
170  """!
171  @brief Performs cluster analysis in line with rules of X-Means algorithm.
172 
173  @return (xmeans) Returns itself (X-Means instance).
174 
175  @see get_clusters()
176  @see get_centers()
177 
178  """
179 
180  if self.__ccore is True:
181  self.__process_by_ccore()
182 
183  else:
184  self.__process_by_python()
185 
186  return self
187 
188 
189  def __process_by_ccore(self):
190  """!
191  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
192 
193  """
194 
195  result = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion, self.__repeat)
196  self.__clusters = result[0]
197  self.__centers = result[1]
198  self.__total_wce = result[2][0]
199 
200 
201  def __process_by_python(self):
202  """!
203  @brief Performs cluster analysis using python code.
204 
205  """
206 
207  self.__clusters = []
208  while len(self.__centers) <= self.__kmax:
209  current_cluster_number = len(self.__centers)
210 
211  self.__clusters, self.__centers, _ = self.__improve_parameters(self.__centers)
212  allocated_centers = self.__improve_structure(self.__clusters, self.__centers)
213 
214  if current_cluster_number == len(allocated_centers):
215  break
216  else:
217  self.__centers = allocated_centers
218 
219  self.__clusters, self.__centers, self.__total_wce = self.__improve_parameters(self.__centers)
220 
221 
222  def predict(self, points):
223  """!
224  @brief Calculates the closest cluster to each point.
225 
226  @param[in] points (array_like): Points for which closest clusters are calculated.
227 
228  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
229  collection if 'process()' method was not called.
230 
231  An example how to calculate (or predict) the closest cluster to specified points.
232  @code
233  from pyclustering.cluster.xmeans import xmeans
234  from pyclustering.samples.definitions import SIMPLE_SAMPLES
235  from pyclustering.utils import read_sample
236 
237  # Load list of points for cluster analysis.
238  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
239 
240  # Initial centers for sample 'Simple3'.
241  initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
242 
243  # Create instance of X-Means algorithm with prepared centers.
244  xmeans_instance = xmeans(sample, initial_centers)
245 
246  # Run cluster analysis.
247  xmeans_instance.process()
248 
249  # Calculate the closest cluster to following two points.
250  points = [[0.25, 0.2], [2.5, 4.0]]
251  closest_clusters = xmeans_instance.predict(points)
252  print(closest_clusters)
253  @endcode
254 
255  """
256  nppoints = numpy.array(points)
257  if len(self.__clusters) == 0:
258  return []
259 
260  metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
261 
262  differences = numpy.zeros((len(nppoints), len(self.__centers)))
263  for index_point in range(len(nppoints)):
264  differences[index_point] = metric(nppoints[index_point], self.__centers)
265 
266  return numpy.argmin(differences, axis=1)
267 
268 
269  def get_clusters(self):
270  """!
271  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
272 
273  @return (list) List of allocated clusters.
274 
275  @see process()
276  @see get_centers()
277  @see get_total_wce()
278 
279  """
280 
281  return self.__clusters
282 
283 
284  def get_centers(self):
285  """!
286  @brief Returns list of centers for allocated clusters.
287 
288  @return (list) List of centers for allocated clusters.
289 
290  @see process()
291  @see get_clusters()
292  @see get_total_wce()
293 
294  """
295 
296  return self.__centers
297 
298 
300  """!
301  @brief Returns clustering result representation type that indicate how clusters are encoded.
302 
303  @return (type_encoding) Clustering result representation.
304 
305  @see get_clusters()
306 
307  """
308 
309  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
310 
311 
312  def get_total_wce(self):
313  """!
314  @brief Returns sum of Euclidean Squared metric errors (SSE - Sum of Squared Errors).
315  @details Sum of metric errors is calculated using distance between point and its center:
316  \f[error=\sum_{i=0}^{N}euclidean_square_distance(x_{i}-center(x_{i}))\f]
317 
318  @see process()
319  @see get_clusters()
320 
321  """
322 
323  return self.__total_wce
324 
325 
326  def __search_optimial_parameters(self, local_data):
327  """!
328  @brief Split data of the region into two cluster and tries to find global optimum by running k-means clustering
329  several times (defined by 'repeat' argument).
330 
331  @param[in] local_data (list): Points of a region that should be split into two clusters.
332 
333  @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).
334 
335  """
336  optimal_wce, optimal_centers, optimal_clusters = float('+inf'), None, None
337 
338  for _ in range(self.__repeat):
339  candidates = 5
340  if len(local_data) < candidates:
341  candidates = len(local_data)
342 
343  local_centers = kmeans_plusplus_initializer(local_data, 2, candidates).initialize()
344 
345  kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
346  kmeans_instance.process()
347 
348  local_wce = kmeans_instance.get_total_wce()
349  if local_wce < optimal_wce:
350  optimal_centers = kmeans_instance.get_centers()
351  optimal_clusters = kmeans_instance.get_clusters()
352  optimal_wce = local_wce
353 
354  return optimal_clusters, optimal_centers, optimal_wce
355 
356 
357  def __improve_parameters(self, centers, available_indexes=None):
358  """!
359  @brief Performs k-means clustering in the specified region.
360 
361  @param[in] centers (list): Cluster centers, if None then automatically generated two centers using center initialization method.
362  @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None then all points are used.
363 
364  @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).
365 
366  """
367 
368  if available_indexes and len(available_indexes) == 1:
369  index_center = available_indexes[0]
370  return [available_indexes], self.__pointer_data[index_center], 0.0
371 
372  local_data = self.__pointer_data
373  if available_indexes:
374  local_data = [self.__pointer_data[i] for i in available_indexes]
375 
376  local_centers = centers
377  if centers is None:
378  clusters, local_centers, local_wce = self.__search_optimial_parameters(local_data)
379  else:
380  kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False).process()
381 
382  local_wce = kmeans_instance.get_total_wce()
383  local_centers = kmeans_instance.get_centers()
384  clusters = kmeans_instance.get_clusters()
385 
386  if available_indexes:
387  clusters = self.__local_to_global_clusters(clusters, available_indexes)
388 
389  return clusters, local_centers, local_wce
390 
391 
392  def __local_to_global_clusters(self, local_clusters, available_indexes):
393  """!
394  @brief Converts clusters in local region define by 'available_indexes' to global clusters.
395 
396  @param[in] local_clusters (list): Local clusters in specific region.
397  @param[in] available_indexes (list): Map between local and global point's indexes.
398 
399  @return Global clusters.
400 
401  """
402 
403  clusters = []
404  for local_cluster in local_clusters:
405  current_cluster = []
406  for index_point in local_cluster:
407  current_cluster.append(available_indexes[index_point])
408 
409  clusters.append(current_cluster)
410 
411  return clusters
412 
413 
414  def __improve_structure(self, clusters, centers):
415  """!
416  @brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
417 
418  @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
419  @param[in] centers (list): Centers of clusters.
420 
421  @return (list) Allocated centers for clustering.
422 
423  """
424 
425  allocated_centers = []
426  amount_free_centers = self.__kmax - len(centers)
427 
428  for index_cluster in range(len(clusters)):
429  # solve k-means problem for children where data of parent are used.
430  (parent_child_clusters, parent_child_centers, _) = self.__improve_parameters(None, clusters[index_cluster])
431 
432  # If it's possible to split current data
433  if len(parent_child_clusters) > 1:
434  # Calculate splitting criterion
435  parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
436  child_scores = self.__splitting_criterion([parent_child_clusters[0], parent_child_clusters[1]], parent_child_centers)
437 
438  split_require = False
439 
440  # Reallocate number of centers (clusters) in line with scores
441  if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
442  if parent_scores < child_scores: split_require = True
443 
444  elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
445  # If its score for the split structure with two children is smaller than that for the parent structure,
446  # then representing the data samples with two clusters is more accurate in comparison to a single parent cluster.
447  if parent_scores > child_scores: split_require = True;
448 
449  if (split_require is True) and (amount_free_centers > 0):
450  allocated_centers.append(parent_child_centers[0])
451  allocated_centers.append(parent_child_centers[1])
452 
453  amount_free_centers -= 1
454  else:
455  allocated_centers.append(centers[index_cluster])
456 
457  else:
458  allocated_centers.append(centers[index_cluster])
459 
460  return allocated_centers
461 
462 
463  def __splitting_criterion(self, clusters, centers):
464  """!
465  @brief Calculates splitting criterion for input clusters.
466 
467  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
468  @param[in] centers (list): Centers of the clusters.
469 
470  @return (double) Returns splitting criterion. High value of splitting cretion means that current structure is much better.
471 
472  @see __bayesian_information_criterion(clusters, centers)
473  @see __minimum_noiseless_description_length(clusters, centers)
474 
475  """
476 
477  if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
478  return self.__bayesian_information_criterion(clusters, centers)
479 
480  elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
481  return self.__minimum_noiseless_description_length(clusters, centers)
482 
483  else:
484  assert 0;
485 
486 
487  def __minimum_noiseless_description_length(self, clusters, centers):
488  """!
489  @brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
490 
491  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
492  @param[in] centers (list): Centers of the clusters.
493 
494  @return (double) Returns splitting criterion in line with bayesian information criterion.
495  Low value of splitting cretion means that current structure is much better.
496 
497  @see __bayesian_information_criterion(clusters, centers)
498 
499  """
500 
501  scores = float('inf')
502 
503  W = 0.0
504  K = len(clusters)
505  N = 0.0
506 
507  sigma_sqrt = 0.0
508 
509  alpha = 0.9
510  betta = 0.9
511 
512  for index_cluster in range(0, len(clusters), 1):
513  Ni = len(clusters[index_cluster])
514  if Ni == 0:
515  return float('inf')
516 
517  Wi = 0.0
518  for index_object in clusters[index_cluster]:
519  # euclidean_distance_square should be used in line with paper, but in this case results are
520  # very poor, therefore square root is used to improved.
521  Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])
522 
523  sigma_sqrt += Wi
524  W += Wi / Ni
525  N += Ni
526 
527  if N - K > 0:
528  sigma_sqrt /= (N - K)
529  sigma = sigma_sqrt ** 0.5
530 
531  Kw = (1.0 - K / N) * sigma_sqrt
532  Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5
533 
534  scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N
535 
536  return scores
537 
538 
539  def __bayesian_information_criterion(self, clusters, centers):
540  """!
541  @brief Calculates splitting criterion for input clusters using bayesian information criterion.
542 
543  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
544  @param[in] centers (list): Centers of the clusters.
545 
546  @return (double) Splitting criterion in line with bayesian information criterion.
547  High value of splitting criterion means that current structure is much better.
548 
549  @see __minimum_noiseless_description_length(clusters, centers)
550 
551  """
552 
553  scores = [float('inf')] * len(clusters) # splitting criterion
554  dimension = len(self.__pointer_data[0])
555 
556  # estimation of the noise variance in the data set
557  sigma_sqrt = 0.0
558  K = len(clusters)
559  N = 0.0
560 
561  for index_cluster in range(0, len(clusters), 1):
562  for index_object in clusters[index_cluster]:
563  sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster])
564 
565  N += len(clusters[index_cluster])
566 
567  if N - K > 0:
568  sigma_sqrt /= (N - K)
569  p = (K - 1) + dimension * K + 1
570 
571  # in case of the same points, sigma_sqrt can be zero (issue: #407)
572  sigma_multiplier = 0.0
573  if sigma_sqrt <= 0.0:
574  sigma_multiplier = float('-inf')
575  else:
576  sigma_multiplier = dimension * 0.5 * log(sigma_sqrt)
577 
578  # splitting criterion
579  for index_cluster in range(0, len(clusters), 1):
580  n = len(clusters[index_cluster])
581 
582  L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5
583 
584  # BIC calculation
585  scores[index_cluster] = L - p * 0.5 * log(N)
586 
587  return sum(scores)
588 
589 
590  def __verify_arguments(self):
591  """!
592  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
593 
594  """
595  if len(self.__pointer_data) == 0:
596  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
597 
598  if len(self.__centers) == 0:
599  raise ValueError("Initial centers are empty (size: '%d')." % len(self.__pointer_data))
600 
601  if self.__tolerance < 0:
602  raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
603  self.__tolerance)
604 
605  if self.__repeat <= 0:
606  raise ValueError("Repeat (current value: '%d') should be greater than 0." %
607  self.__repeat)
def __improve_structure(self, clusters, centers)
Check for best structure: divides each cluster into two and checks for best results using splitting c...
Definition: xmeans.py:414
def __splitting_criterion(self, clusters, centers)
Calculates splitting criterion for input clusters.
Definition: xmeans.py:463
def __minimum_noiseless_description_length(self, clusters, centers)
Calculates splitting criterion for input clusters using minimum noiseless description length criterio...
Definition: xmeans.py:487
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: xmeans.py:299
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: xmeans.py:201
def process(self)
Performs cluster analysis in line with rules of X-Means algorithm.
Definition: xmeans.py:169
Module for representing clustering results.
Definition: encoder.py:1
def get_total_wce(self)
Returns sum of Euclidean Squared metric errors (SSE - Sum of Squared Errors).
Definition: xmeans.py:312
def __local_to_global_clusters(self, local_clusters, available_indexes)
Converts clusters in local region define by &#39;available_indexes&#39; to global clusters.
Definition: xmeans.py:392
def __bayesian_information_criterion(self, clusters, centers)
Calculates splitting criterion for input clusters using bayesian information criterion.
Definition: xmeans.py:539
Class represents clustering algorithm X-Means.
Definition: xmeans.py:78
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: xmeans.py:590
Class implements K-Means clustering algorithm.
Definition: kmeans.py:272
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: xmeans.py:269
def get_centers(self)
Returns list of centers for allocated clusters.
Definition: xmeans.py:284
def __search_optimial_parameters(self, local_data)
Split data of the region into two cluster and tries to find global optimum by running k-means cluster...
Definition: xmeans.py:326
def __improve_parameters(self, centers, available_indexes=None)
Performs k-means clustering in the specified region.
Definition: xmeans.py:357
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def predict(self, points)
Calculates the closest cluster to each point.
Definition: xmeans.py:222
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: xmeans.py:189
def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True, kwargs)
Constructor of clustering algorithm X-Means.
Definition: xmeans.py:129
Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm...
Definition: xmeans.py:46