"""!

@brief Cluster analysis algorithm: X-Means
@details Implementation based on paper @cite article::xmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2019
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import random

from enum import IntEnum

from math import log

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

from pyclustering.core.wrapper import ccore_library

import pyclustering.core.xmeans_wrapper as wrapper

from pyclustering.utils import euclidean_distance_square, euclidean_distance, distance_metric, type_metric


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used as a splitting criterion for cluster creation in the X-Means algorithm.

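    A minimal sketch of how a criterion is chosen; the data set and the initial centers below are illustrative
    assumptions rather than values shipped with the library:
    @code
        from pyclustering.cluster.xmeans import xmeans, splitting_type

        # Hypothetical two-dimensional data set with two well-separated groups.
        sample = [[0.1, 0.2], [0.2, 0.1], [4.0, 4.2], [4.1, 4.0]]

        # Use MNDL instead of the default BIC splitting criterion.
        xmeans_instance = xmeans(sample, [[0.0, 0.0], [4.0, 4.0]], 20,
                                 criterion=splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH)
        xmeans_instance.process()
    @endcode
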
    """

    ## Bayesian information criterion (BIC) is used to approximate the correct number of clusters.
    BAYESIAN_INFORMATION_CRITERION = 0

    ## Minimum noiseless description length (MNDL) is used to approximate the correct number of clusters.
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1


class xmeans:
    """!
    @brief Class represents clustering algorithm X-Means.
    @details X-Means clustering method starts with the assumption of having a minimum number of clusters,
              and then dynamically increases them. X-Means uses the specified splitting criterion to control
              the process of splitting clusters. Method K-Means++ can be used for calculation of initial centers.

              CCORE implementation of the algorithm uses a thread pool to parallelize the clustering process.

    An example of how to perform cluster analysis using the X-Means algorithm:
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.xmeans import xmeans
        from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read sample 'simple3' from file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Prepare initial centers - amount of initial centers defines amount of clusters from which X-Means will
        # start analysis.
        amount_initial_centers = 2
        initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()

        # Create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters, the maximum
        # number of clusters that can be allocated is 20.
        xmeans_instance = xmeans(sample, initial_centers, 20)
        xmeans_instance.process()

        # Extract clustering results: clusters and their centers
        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.append_cluster(centers, None, marker='*', markersize=10)
        visualizer.show()
    @endcode

    Visualization of the clustering results obtained by the code above, where the X-Means algorithm allocates four clusters.
    @image html xmeans_clustering_simple3.png "Fig. 1. X-Means clustering results (data 'Simple3')."

    @see center_initializer

    """

    def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True):
        """!
        @brief Constructor of clustering algorithm X-Means.

        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers that are represented by a list: [center1, center2, ...];
                    if it is not specified then X-Means starts from a single random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers is less than tolerance, then the algorithm stops processing.
        @param[in] criterion (splitting_type): Type of splitting criterion.
        @param[in] ccore (bool): Defines whether CCORE (C++ pyclustering library) should be used instead of the Python code.

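        A minimal sketch of constructing the algorithm without prepared initial centers, so that X-Means starts
        from a single random center; the data set below is an illustrative assumption:
        @code
            from pyclustering.cluster.xmeans import xmeans

            sample = [[0.1, 0.2], [0.2, 0.1], [4.0, 4.2], [4.1, 4.0]]

            # No initial centers are provided - the search starts from one random center,
            # and the pure Python implementation is forced by 'ccore=False'.
            xmeans_instance = xmeans(sample, kmax=10, ccore=False)
            xmeans_instance.process()
        @endcode
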
        """

        self.__pointer_data = data
        self.__clusters = []

        if initial_centers is not None:
            self.__centers = initial_centers[:]
        else:
            self.__centers = [[random.random() for _ in range(len(data[0]))]]

        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion

        self.__ccore = ccore
        if self.__ccore:
            self.__ccore = ccore_library.workable()


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of X-Means algorithm.

        @remark Results of clustering can be obtained using corresponding getter methods.

        @see get_clusters()
        @see get_centers()

        """

        if self.__ccore is True:
            self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)

        else:
            self.__clusters = []
            while len(self.__centers) <= self.__kmax:
                current_cluster_number = len(self.__centers)

                self.__clusters, self.__centers = self.__improve_parameters(self.__centers)
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers)

                if current_cluster_number == len(allocated_centers):
                    #if ( (current_cluster_number == len(allocated_centers)) or (len(allocated_centers) > self.__kmax) ):
                    break
                else:
                    self.__centers = allocated_centers

            self.__clusters, self.__centers = self.__improve_parameters(self.__centers)


    def predict(self, points):
        """!
        @brief Calculates the closest cluster to each point.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (list) List of closest clusters for each point. Each cluster is denoted by index. Returns an empty
                 collection if the 'process()' method was not called.

        An example of how to calculate (or predict) the closest cluster to specified points:
        @code
            from pyclustering.cluster.xmeans import xmeans
            from pyclustering.samples.definitions import SIMPLE_SAMPLES
            from pyclustering.utils import read_sample

            # Load list of points for cluster analysis.
            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

            # Initial centers for sample 'Simple3'.
            initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]

            # Create instance of X-Means algorithm with prepared centers.
            xmeans_instance = xmeans(sample, initial_centers)

            # Run cluster analysis.
            xmeans_instance.process()

            # Calculate the closest cluster to following two points.
            points = [[0.25, 0.2], [2.5, 4.0]]
            closest_clusters = xmeans_instance.predict(points)
            print(closest_clusters)
        @endcode

        """
        nppoints = numpy.array(points)
        if len(self.__clusters) == 0:
            return []

        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)

        differences = numpy.zeros((len(nppoints), len(self.__centers)))
        for index_point in range(len(nppoints)):
            differences[index_point] = metric(nppoints[index_point], self.__centers)

        return numpy.argmin(differences, axis=1)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @return (list) List of allocated clusters.

        @see process()
        @see get_centers()

        """

        return self.__clusters


    def get_centers(self):
        """!
        @brief Returns list of centers for allocated clusters.

        @return (list) List of centers for allocated clusters.

        @see process()
        @see get_clusters()

        """

        return self.__centers



    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicates how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

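        A short sketch of how the returned encoding can be used together with the 'cluster_encoder' module to
        change the representation of the result; 'xmeans_instance' and 'sample' are assumed to come from the
        usage example of this class:
        @code
            from pyclustering.cluster.encoder import cluster_encoder, type_encoding

            encoding = xmeans_instance.get_cluster_encoding()
            representor = cluster_encoder(encoding, xmeans_instance.get_clusters(), sample)

            # Convert to labeling where each point is marked by the index of its cluster.
            representor.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
            labels = representor.get_clusters()
        @endcode
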
        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __improve_parameters(self, centers, available_indexes=None):
        """!
        @brief Performs k-means clustering in the specified region.

        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering; if None, then all points are used.

        @return (tuple) Allocated clusters and their centers, where each cluster contains indexes of objects from the input data.

        """

        if available_indexes and len(available_indexes) == 1:
            index_center = available_indexes[0]
            return [available_indexes], self.__pointer_data[index_center]

        local_data = self.__pointer_data
        if available_indexes:
            local_data = [self.__pointer_data[i] for i in available_indexes]

        local_centers = centers
        if centers is None:
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
        kmeans_instance.process()

        local_centers = kmeans_instance.get_centers()

        clusters = kmeans_instance.get_clusters()
        if available_indexes:
            clusters = self.__local_to_global_clusters(clusters, available_indexes)

        return clusters, local_centers


    def __local_to_global_clusters(self, local_clusters, available_indexes):
        """!
        @brief Converts clusters in the local region defined by 'available_indexes' to global clusters.

        @param[in] local_clusters (list): Local clusters in the specific region.
        @param[in] available_indexes (list): Map between local and global point indexes.

        @return Global clusters.

        """

        clusters = []
        for local_cluster in local_clusters:
            current_cluster = []
            for index_point in local_cluster:
                current_cluster.append(available_indexes[index_point])

            clusters.append(current_cluster)

        return clusters


    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and checks for the best result using the splitting criterion.
        @details A cluster is replaced by its two children only when the splitting criterion favors the split and free centers are still available, so that 'kmax' is not exceeded.

        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
        @param[in] centers (list): Centers of clusters.

        @return (list) Allocated centers for clustering.

        """

        allocated_centers = []
        amount_free_centers = self.__kmax - len(centers)

        for index_cluster in range(len(clusters)):
            # solve k-means problem for children where data of parent are used.
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(None, clusters[index_cluster])

            # If it's possible to split current data
            if len(parent_child_clusters) > 1:
                # Calculate splitting criterion
                parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
                child_scores = self.__splitting_criterion([parent_child_clusters[0], parent_child_clusters[1]], parent_child_centers)

                split_require = False

                # Reallocate number of centers (clusters) in line with scores
                if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
                    if parent_scores < child_scores:
                        split_require = True

                elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
                    # If the score of the split structure with two children is smaller than that of the parent structure,
                    # then representing the data samples with two clusters is more accurate than using a single parent cluster.
                    if parent_scores > child_scores:
                        split_require = True

                if (split_require is True) and (amount_free_centers > 0):
                    allocated_centers.append(parent_child_centers[0])
                    allocated_centers.append(parent_child_centers[1])

                    amount_free_centers -= 1
                else:
                    allocated_centers.append(centers[index_cluster])

            else:
                allocated_centers.append(centers[index_cluster])

        return allocated_centers


    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion. For BIC a higher value indicates a better structure, while for MNDL a lower value indicates a better structure.

        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)

        """

        if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
            return self.__bayesian_information_criterion(clusters, centers)

        elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
            return self.__minimum_noiseless_description_length(clusters, centers)

        else:
            assert 0


    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the minimum noiseless description length (MNDL) criterion.

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the minimum noiseless description length criterion.
                 A low value of the splitting criterion means that the current structure is better.

        @see __bayesian_information_criterion(clusters, centers)

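        The score computed below can be summarized as follows (this is a transcription of the implementation,
        which uses the Euclidean distance instead of its square, deviating from the original paper), with
        \f$\alpha = \beta = 0.9\f$, \f$N\f$ the total number of points and \f$K\f$ the number of clusters:
        \f[
        W_j = \sum_{x \in C_j} \lVert x - c_j \rVert, \quad
        \sigma^2 = \frac{1}{N - K}\sum_{j=1}^{K} W_j, \quad
        W = \sum_{j=1}^{K} \frac{W_j}{N_j}
        \f]
        \f[
        K_w = \Big(1 - \frac{K}{N}\Big)\sigma^2, \quad
        K_s = \frac{2 \alpha \sigma}{\sqrt{N}} \sqrt{\frac{\alpha^2 \sigma^2}{N} + W - \frac{K_w}{2}}
        \f]
        \f[
        \mathrm{score} = \frac{\sigma^2 \sqrt{2K} (\sqrt{2K} + \beta)}{N} + W - \sigma^2 + K_s + \frac{2 \sqrt{\alpha}\,\sigma^2}{N}
        \f]
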
        """

        scores = float('inf')

        W = 0.0
        K = len(clusters)
        N = 0.0

        sigma_sqrt = 0.0

        alpha = 0.9
        betta = 0.9

        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster])
            if Ni == 0:
                return float('inf')

            Wi = 0.0
            for index_object in clusters[index_cluster]:
                # euclidean_distance_square should be used in line with the paper, but in this case results are
                # very poor, therefore the square root is used to improve them.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])

            sigma_sqrt += Wi
            W += Wi / Ni
            N += Ni

        if N - K > 0:
            sigma_sqrt /= (N - K)
            sigma = sigma_sqrt ** 0.5

            Kw = (1.0 - K / N) * sigma_sqrt
            Ks = (2.0 * alpha * sigma / (N ** 0.5)) * ((alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0) ** 0.5

            scores = sigma_sqrt * (2 * K) ** 0.5 * ((2 * K) ** 0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha ** 0.5 * sigma_sqrt / N

        return scores


    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the Bayesian information criterion (BIC).

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the Bayesian information criterion.
                 A high value of the splitting criterion means that the current structure is better.

        @see __minimum_noiseless_description_length(clusters, centers)

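        A transcription of the score that is computed below: with \f$d\f$ the dimension of the data, \f$N\f$ the
        total number of points, \f$K\f$ the number of clusters, \f$n_j\f$ the size of cluster \f$C_j\f$ and
        \f$p = (K - 1) + dK + 1\f$ the number of free parameters,
        \f[
        \sigma^2 = \frac{1}{N - K}\sum_{j=1}^{K}\sum_{x \in C_j} \lVert x - c_j \rVert^2
        \f]
        \f[
        L_j = n_j \ln n_j - n_j \ln N - \frac{n_j}{2}\ln(2\pi) - \frac{n_j d}{2}\ln\sigma^2 - \frac{n_j - K}{2}, \quad
        \mathrm{BIC} = \sum_{j=1}^{K}\Big(L_j - \frac{p}{2}\ln N\Big)
        \f]
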
        """

        scores = [float('inf')] * len(clusters)     # splitting criterion
        dimension = len(self.__pointer_data[0])

        # estimation of the noise variance in the data set
        sigma_sqrt = 0.0
        K = len(clusters)
        N = 0.0

        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster])

            N += len(clusters[index_cluster])

        if N - K > 0:
            sigma_sqrt /= (N - K)
            p = (K - 1) + dimension * K + 1

            # in case of the same points, sigma_sqrt can be zero (issue: #407)
            sigma_multiplier = 0.0
            if sigma_sqrt <= 0.0:
                sigma_multiplier = float('-inf')
            else:
                sigma_multiplier = dimension * 0.5 * log(sigma_sqrt)

            # splitting criterion
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster])

                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5

                # BIC calculation
                scores[index_cluster] = L - p * 0.5 * log(N)

        return sum(scores)