xmeans.py
1 """!
2 
3 @brief Cluster analysis algorithm: X-Means
4 @details Implementation based on paper @cite article::xmeans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 
30 from enum import IntEnum
31 
32 from math import log
33 
34 from pyclustering.cluster.encoder import type_encoding
35 from pyclustering.cluster.kmeans import kmeans
36 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
37 
38 from pyclustering.core.wrapper import ccore_library
39 
40 import pyclustering.core.xmeans_wrapper as wrapper
41 
42 from pyclustering.utils import euclidean_distance_square, euclidean_distance, distance_metric, type_metric
43 
44 
45 class splitting_type(IntEnum):
46  """!
47  @brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.
48 
49  """
50 
51 
63  BAYESIAN_INFORMATION_CRITERION = 0
64 
65 
74  MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1
75 
76 
77 class xmeans:
78  """!
79  @brief Class represents clustering algorithm X-Means.
80  @details X-means clustering method starts with the assumption of having a minimum number of clusters,
81  and then dynamically increases them. X-means uses specified splitting criterion to control
82  the process of splitting clusters. Method K-Means++ can be used for calculation of initial centers.
83 
84  CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
85 
86  Here example how to perform cluster analysis using X-Means algorithm:
87  @code
88  from pyclustering.cluster import cluster_visualizer
89  from pyclustering.cluster.xmeans import xmeans
90  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
91  from pyclustering.utils import read_sample
92  from pyclustering.samples.definitions import SIMPLE_SAMPLES
93 
94  # Read sample 'simple3' from file.
95  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
96 
97  # Prepare initial centers - amount of initial centers defines amount of clusters from which X-Means will
98  # start analysis.
99  amount_initial_centers = 2
100  initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()
101 
102  # Create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters, the maximum
103  # number of clusters that can be allocated is 20.
104  xmeans_instance = xmeans(sample, initial_centers, 20)
105  xmeans_instance.process()
106 
107  # Extract clustering results: clusters and their centers
108  clusters = xmeans_instance.get_clusters()
109  centers = xmeans_instance.get_centers()
110 
111  # Print total sum of metric errors
112  print("Total WCE:", xmeans_instance.get_total_wce())
113 
114  # Visualize clustering results
115  visualizer = cluster_visualizer()
116  visualizer.append_clusters(clusters, sample)
117  visualizer.append_cluster(centers, None, marker='*', markersize=10)
118  visualizer.show()
119  @endcode
120 
121  Visualization of clustering results that were obtained using code above and where X-Means algorithm allocates four clusters.
122  @image html xmeans_clustering_simple3.png "Fig. 1. X-Means clustering results (data 'Simple3')."
123 
124  @see center_initializer
125 
126  """
127 
128  def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True, **kwargs):
129  """!
130  @brief Constructor of clustering algorithm X-Means.
131 
132  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
133  @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: `[center1, center2, ...]`,
134  if it is not specified then X-Means starts from the random center.
135  @param[in] kmax (uint): Maximum number of clusters that can be allocated.
136  @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
137  @param[in] criterion (splitting_type): Type of splitting creation.
138  @param[in] ccore (bool): Defines if C++ pyclustering library should be used instead of Python implementation.
139  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `repeat`, `random_state`).
140 
141  <b>Keyword Args:</b><br>
142  - repeat (unit): How many times K-Means should be run to improve parameters (by default is `1`).
143  With larger `repeat` values suggesting higher probability of finding global optimum.
144  - random_state (int): Seed for random state (by default is `None`, current system time is used).
145 
146  """
147 
148  self.__pointer_data = data
149  self.__clusters = []
150  self.__random_state = kwargs.get('random_state', None)
151 
152  if initial_centers is not None:
153  self.__centers = initial_centers[:]
154  else:
155  self.__centers = kmeans_plusplus_initializer(data, 2, random_state=self.__random_state).initialize()
156 
157  self.__kmax = kmax
158  self.__tolerance = tolerance
159  self.__criterion = criterion
160  self.__total_wce = 0.0
161  self.__repeat = kwargs.get('repeat', 1)
162 
163  self.__ccore = ccore
164  if self.__ccore:
165  self.__ccore = ccore_library.workable()
166 
167  self.__verify_arguments()
168 
169 
170  def process(self):
171  """!
172  @brief Performs cluster analysis in line with rules of X-Means algorithm.
173 
174  @return (xmeans) Returns itself (X-Means instance).
175 
176  @see get_clusters()
177  @see get_centers()
178 
179  """
180 
181  if self.__ccore is True:
182  self.__process_by_ccore()
183 
184  else:
185  self.__process_by_python()
186 
187  return self
188 
189 
190  def __process_by_ccore(self):
191  """!
192  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
193 
194  """
195 
196  result = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion, self.__repeat, self.__random_state)
197  self.__clusters = result[0]
198  self.__centers = result[1]
199  self.__total_wce = result[2][0]
200 
201 
202  def __process_by_python(self):
203  """!
204  @brief Performs cluster analysis using python code.
205 
206  """
207 
208  self.__clusters = []
209  while len(self.__centers) <= self.__kmax:
210  current_cluster_number = len(self.__centers)
211 
212  self.__clusters, self.__centers, _ = self.__improve_parameters(self.__centers)
213  allocated_centers = self.__improve_structure(self.__clusters, self.__centers)
214 
215  if current_cluster_number == len(allocated_centers):
216  break
217  else:
218  self.__centers = allocated_centers
219 
220  self.__clusters, self.__centers, self.__total_wce = self.__improve_parameters(self.__centers)
221 
222 
223  def predict(self, points):
224  """!
225  @brief Calculates the closest cluster to each point.
226 
227  @param[in] points (array_like): Points for which closest clusters are calculated.
228 
229  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
230  collection if 'process()' method was not called.
231 
232  An example how to calculate (or predict) the closest cluster to specified points.
233  @code
234  from pyclustering.cluster.xmeans import xmeans
235  from pyclustering.samples.definitions import SIMPLE_SAMPLES
236  from pyclustering.utils import read_sample
237 
238  # Load list of points for cluster analysis.
239  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
240 
241  # Initial centers for sample 'Simple3'.
242  initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
243 
244  # Create instance of X-Means algorithm with prepared centers.
245  xmeans_instance = xmeans(sample, initial_centers)
246 
247  # Run cluster analysis.
248  xmeans_instance.process()
249 
250  # Calculate the closest cluster to following two points.
251  points = [[0.25, 0.2], [2.5, 4.0]]
252  closest_clusters = xmeans_instance.predict(points)
253  print(closest_clusters)
254  @endcode
255 
256  """
257  nppoints = numpy.array(points)
258  if len(self.__clusters) == 0:
259  return []
260 
261  metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
262 
263  differences = numpy.zeros((len(nppoints), len(self.__centers)))
264  for index_point in range(len(nppoints)):
265  differences[index_point] = metric(nppoints[index_point], self.__centers)
266 
267  return numpy.argmin(differences, axis=1)
268 
269 
270  def get_clusters(self):
271  """!
272  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
273 
274  @return (list) List of allocated clusters.
275 
276  @see process()
277  @see get_centers()
278  @see get_total_wce()
279 
280  """
281 
282  return self.__clusters
283 
284 
285  def get_centers(self):
286  """!
287  @brief Returns list of centers for allocated clusters.
288 
289  @return (list) List of centers for allocated clusters.
290 
291  @see process()
292  @see get_clusters()
293  @see get_total_wce()
294 
295  """
296 
297  return self.__centers
298 
299 
301  """!
302  @brief Returns clustering result representation type that indicate how clusters are encoded.
303 
304  @return (type_encoding) Clustering result representation.
305 
306  @see get_clusters()
307 
308  """
309 
310  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
311 
312 
313  def get_total_wce(self):
314  """!
315  @brief Returns sum of Euclidean Squared metric errors (SSE - Sum of Squared Errors).
316  @details Sum of metric errors is calculated using distance between point and its center:
317  \f[error=\sum_{i=0}^{N}euclidean_square_distance(x_{i}-center(x_{i}))\f]
318 
319  @see process()
320  @see get_clusters()
321 
322  """
323 
324  return self.__total_wce
325 
326 
327  def __search_optimial_parameters(self, local_data):
328  """!
329  @brief Split data of the region into two cluster and tries to find global optimum by running k-means clustering
330  several times (defined by 'repeat' argument).
331 
332  @param[in] local_data (list): Points of a region that should be split into two clusters.
333 
334  @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).
335 
336  """
337  optimal_wce, optimal_centers, optimal_clusters = float('+inf'), None, None
338 
339  for _ in range(self.__repeat):
340  candidates = 5
341  if len(local_data) < candidates:
342  candidates = len(local_data)
343 
344  local_centers = kmeans_plusplus_initializer(local_data, 2, candidates, random_state=self.__random_state).initialize()
345 
346  kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
347  kmeans_instance.process()
348 
349  local_wce = kmeans_instance.get_total_wce()
350  if local_wce < optimal_wce:
351  optimal_centers = kmeans_instance.get_centers()
352  optimal_clusters = kmeans_instance.get_clusters()
353  optimal_wce = local_wce
354 
355  return optimal_clusters, optimal_centers, optimal_wce
356 
357 
358  def __improve_parameters(self, centers, available_indexes=None):
359  """!
360  @brief Performs k-means clustering in the specified region.
361 
362  @param[in] centers (list): Cluster centers, if None then automatically generated two centers using center initialization method.
363  @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None then all points are used.
364 
365  @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).
366 
367  """
368 
369  if available_indexes and len(available_indexes) == 1:
370  index_center = available_indexes[0]
371  return [available_indexes], self.__pointer_data[index_center], 0.0
372 
373  local_data = self.__pointer_data
374  if available_indexes:
375  local_data = [self.__pointer_data[i] for i in available_indexes]
376 
377  local_centers = centers
378  if centers is None:
379  clusters, local_centers, local_wce = self.__search_optimial_parameters(local_data)
380  else:
381  kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False).process()
382 
383  local_wce = kmeans_instance.get_total_wce()
384  local_centers = kmeans_instance.get_centers()
385  clusters = kmeans_instance.get_clusters()
386 
387  if available_indexes:
388  clusters = self.__local_to_global_clusters(clusters, available_indexes)
389 
390  return clusters, local_centers, local_wce
391 
392 
393  def __local_to_global_clusters(self, local_clusters, available_indexes):
394  """!
395  @brief Converts clusters in local region define by 'available_indexes' to global clusters.
396 
397  @param[in] local_clusters (list): Local clusters in specific region.
398  @param[in] available_indexes (list): Map between local and global point's indexes.
399 
400  @return Global clusters.
401 
402  """
403 
404  clusters = []
405  for local_cluster in local_clusters:
406  current_cluster = []
407  for index_point in local_cluster:
408  current_cluster.append(available_indexes[index_point])
409 
410  clusters.append(current_cluster)
411 
412  return clusters
413 
414 
415  def __improve_structure(self, clusters, centers):
416  """!
417  @brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
418 
419  @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
420  @param[in] centers (list): Centers of clusters.
421 
422  @return (list) Allocated centers for clustering.
423 
424  """
425 
426  allocated_centers = []
427  amount_free_centers = self.__kmax - len(centers)
428 
429  for index_cluster in range(len(clusters)):
430  # solve k-means problem for children where data of parent are used.
431  (parent_child_clusters, parent_child_centers, _) = self.__improve_parameters(None, clusters[index_cluster])
432 
433  # If it's possible to split current data
434  if len(parent_child_clusters) > 1:
435  # Calculate splitting criterion
436  parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
437  child_scores = self.__splitting_criterion([parent_child_clusters[0], parent_child_clusters[1]], parent_child_centers)
438 
439  split_require = False
440 
441  # Reallocate number of centers (clusters) in line with scores
442  if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
443  if parent_scores < child_scores: split_require = True
444 
445  elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
446  # If its score for the split structure with two children is smaller than that for the parent structure,
447  # then representing the data samples with two clusters is more accurate in comparison to a single parent cluster.
448  if parent_scores > child_scores: split_require = True;
449 
450  if (split_require is True) and (amount_free_centers > 0):
451  allocated_centers.append(parent_child_centers[0])
452  allocated_centers.append(parent_child_centers[1])
453 
454  amount_free_centers -= 1
455  else:
456  allocated_centers.append(centers[index_cluster])
457 
458  else:
459  allocated_centers.append(centers[index_cluster])
460 
461  return allocated_centers
462 
463 
464  def __splitting_criterion(self, clusters, centers):
465  """!
466  @brief Calculates splitting criterion for input clusters.
467 
468  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
469  @param[in] centers (list): Centers of the clusters.
470 
471  @return (double) Returns splitting criterion. High value of splitting cretion means that current structure is much better.
472 
473  @see __bayesian_information_criterion(clusters, centers)
474  @see __minimum_noiseless_description_length(clusters, centers)
475 
476  """
477 
478  if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
479  return self.__bayesian_information_criterion(clusters, centers)
480 
481  elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
482  return self.__minimum_noiseless_description_length(clusters, centers)
483 
484  else:
485  assert 0;
486 
487 
488  def __minimum_noiseless_description_length(self, clusters, centers):
489  """!
490  @brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
491 
492  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
493  @param[in] centers (list): Centers of the clusters.
494 
495  @return (double) Returns splitting criterion in line with bayesian information criterion.
496  Low value of splitting cretion means that current structure is much better.
497 
498  @see __bayesian_information_criterion(clusters, centers)
499 
500  """
501 
502  scores = float('inf')
503 
504  W = 0.0
505  K = len(clusters)
506  N = 0.0
507 
508  sigma_sqrt = 0.0
509 
510  alpha = 0.9
511  betta = 0.9
512 
513  for index_cluster in range(0, len(clusters), 1):
514  Ni = len(clusters[index_cluster])
515  if Ni == 0:
516  return float('inf')
517 
518  Wi = 0.0
519  for index_object in clusters[index_cluster]:
520  # euclidean_distance_square should be used in line with paper, but in this case results are
521  # very poor, therefore square root is used to improved.
522  Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])
523 
524  sigma_sqrt += Wi
525  W += Wi / Ni
526  N += Ni
527 
528  if N - K > 0:
529  sigma_sqrt /= (N - K)
530  sigma = sigma_sqrt ** 0.5
531 
532  Kw = (1.0 - K / N) * sigma_sqrt
533  Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5
534 
535  scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N
536 
537  return scores
538 
539 
540  def __bayesian_information_criterion(self, clusters, centers):
541  """!
542  @brief Calculates splitting criterion for input clusters using bayesian information criterion.
543 
544  @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
545  @param[in] centers (list): Centers of the clusters.
546 
547  @return (double) Splitting criterion in line with bayesian information criterion.
548  High value of splitting criterion means that current structure is much better.
549 
550  @see __minimum_noiseless_description_length(clusters, centers)
551 
552  """
553 
554  scores = [float('inf')] * len(clusters) # splitting criterion
555  dimension = len(self.__pointer_data[0])
556 
557  # estimation of the noise variance in the data set
558  sigma_sqrt = 0.0
559  K = len(clusters)
560  N = 0.0
561 
562  for index_cluster in range(0, len(clusters), 1):
563  for index_object in clusters[index_cluster]:
564  sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster])
565 
566  N += len(clusters[index_cluster])
567 
568  if N - K > 0:
569  sigma_sqrt /= (N - K)
570  p = (K - 1) + dimension * K + 1
571 
572  # in case of the same points, sigma_sqrt can be zero (issue: #407)
573  sigma_multiplier = 0.0
574  if sigma_sqrt <= 0.0:
575  sigma_multiplier = float('-inf')
576  else:
577  sigma_multiplier = dimension * 0.5 * log(sigma_sqrt)
578 
579  # splitting criterion
580  for index_cluster in range(0, len(clusters), 1):
581  n = len(clusters[index_cluster])
582 
583  L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5
584 
585  # BIC calculation
586  scores[index_cluster] = L - p * 0.5 * log(N)
587 
588  return sum(scores)
589 
590 
591  def __verify_arguments(self):
592  """!
593  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
594 
595  """
596  if len(self.__pointer_data) == 0:
597  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
598 
599  if len(self.__centers) == 0:
600  raise ValueError("Initial centers are empty (size: '%d')." % len(self.__pointer_data))
601 
602  if self.__tolerance < 0:
603  raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
604  self.__tolerance)
605 
606  if self.__repeat <= 0:
607  raise ValueError("Repeat (current value: '%d') should be greater than 0." %
608  self.__repeat)
def __improve_structure(self, clusters, centers)
Check for best structure: divides each cluster into two and checks for best results using splitting c...
Definition: xmeans.py:415
def __splitting_criterion(self, clusters, centers)
Calculates splitting criterion for input clusters.
Definition: xmeans.py:464
def __minimum_noiseless_description_length(self, clusters, centers)
Calculates splitting criterion for input clusters using minimum noiseless description length criterio...
Definition: xmeans.py:488
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: xmeans.py:300
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: xmeans.py:202
def process(self)
Performs cluster analysis in line with rules of X-Means algorithm.
Definition: xmeans.py:170
Module for representing clustering results.
Definition: encoder.py:1
def get_total_wce(self)
Returns sum of Euclidean Squared metric errors (SSE - Sum of Squared Errors).
Definition: xmeans.py:313
def __local_to_global_clusters(self, local_clusters, available_indexes)
Converts clusters in local region define by &#39;available_indexes&#39; to global clusters.
Definition: xmeans.py:393
def __bayesian_information_criterion(self, clusters, centers)
Calculates splitting criterion for input clusters using bayesian information criterion.
Definition: xmeans.py:540
Class represents clustering algorithm X-Means.
Definition: xmeans.py:77
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: xmeans.py:591
Class implements K-Means clustering algorithm.
Definition: kmeans.py:272
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: xmeans.py:270
def get_centers(self)
Returns list of centers for allocated clusters.
Definition: xmeans.py:285
def __search_optimial_parameters(self, local_data)
Split data of the region into two cluster and tries to find global optimum by running k-means cluster...
Definition: xmeans.py:327
def __improve_parameters(self, centers, available_indexes=None)
Performs k-means clustering in the specified region.
Definition: xmeans.py:358
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def predict(self, points)
Calculates the closest cluster to each point.
Definition: xmeans.py:223
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: xmeans.py:190
def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True, kwargs)
Constructor of clustering algorithm X-Means.
Definition: xmeans.py:128
Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm...
Definition: xmeans.py:45