"""!

@brief Cluster analysis algorithm: X-Means
@details Implementation based on paper @cite article::xmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2018
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import random

from enum import IntEnum

from math import log

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

from pyclustering.core.wrapper import ccore_library

import pyclustering.core.xmeans_wrapper as wrapper

from pyclustering.utils import euclidean_distance_square, euclidean_distance


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used for the creation of new clusters in the X-Means algorithm.

    """

    ## Bayesian information criterion (BIC): a cluster is split in two when the two children score higher than their parent.
    BAYESIAN_INFORMATION_CRITERION = 0

    ## Minimum noiseless description length (MNDL): a cluster is split in two when the two children yield a smaller description length than their parent.
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1


class xmeans:
    """!
    @brief Class represents clustering algorithm X-Means.
    @details The X-Means clustering method starts with the assumption of having a minimum number of clusters
             and then dynamically increases it. X-Means uses the specified splitting criterion to control
             the process of splitting clusters. The K-Means++ method can be used to calculate the initial centers.

             The CCORE option can be used to delegate processing to the pyclustering core - a C/C++ shared library that significantly increases performance.

             The CCORE implementation of the algorithm uses a thread pool to parallelize the clustering process.

             Here is an example of how to perform cluster analysis using the X-Means algorithm:
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.xmeans import xmeans
        from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read sample 'simple3' from file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Prepare initial centers - the amount of initial centers defines the amount of clusters from which X-Means
        # starts its analysis.
        amount_initial_centers = 2
        initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()

        # Create an instance of the X-Means algorithm. The algorithm starts its analysis from 2 clusters; the maximum
        # number of clusters that can be allocated is 20.
        xmeans_instance = xmeans(sample, initial_centers, 20)
        xmeans_instance.process()

        # Extract clustering results: clusters and their centers.
        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        # Visualize clustering results.
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.append_cluster(centers, None, marker='*')
        visualizer.show()
    @endcode

    Visualization of the clustering results obtained with the code above, where the X-Means algorithm allocates four clusters.
    @image html xmeans_clustering_simple3.png "Fig. 1. X-Means clustering results (data 'Simple3')."

    @see center_initializer

    """

    def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True):
        """!
        @brief Constructor of clustering algorithm X-Means.

        @param[in] data (list): Input data that is presented as a list of points (objects); each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers that are represented by a list: [center1, center2, ...];
                    if it is not specified, then X-Means starts from a single random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers is less than the tolerance, then the algorithm stops processing.
        @param[in] criterion (splitting_type): Type of splitting criterion.
        @param[in] ccore (bool): Defines whether CCORE (the C++ pyclustering library) should be used instead of the Python code.

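        A minimal sketch of constructing the algorithm with the MINIMUM_NOISELESS_DESCRIPTION_LENGTH criterion
        (here 'sample' stands for any list of points loaded beforehand):
        @code
            from pyclustering.cluster.xmeans import xmeans, splitting_type
            from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

            initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
            xmeans_instance = xmeans(sample, initial_centers, kmax=20, tolerance=0.025,
                                     criterion=splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, ccore=True)
            xmeans_instance.process()
        @endcode
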
        """

        self.__pointer_data = data
        self.__clusters = []

        if initial_centers is not None:
            self.__centers = initial_centers[:]
        else:
            self.__centers = [[random.random() for _ in range(len(data[0]))]]

        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion

        self.__ccore = ccore
        if self.__ccore:
            self.__ccore = ccore_library.workable()


    def process(self):
        """!
        @brief Performs cluster analysis in line with the rules of the X-Means algorithm.

        @remark Results of clustering can be obtained using the corresponding get methods.

        @see get_clusters()
        @see get_centers()

        """

        if self.__ccore is True:
            self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)

        else:
            self.__clusters = []
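            # Pure Python processing: alternate two phases until the structure stops changing or 'kmax' is reached.
            # 1. Improve parameters: refine the current clusters and centers with k-means.
            # 2. Improve structure: try to split every cluster in two and keep the split only when the configured
            #    splitting criterion scores the two children better than their parent.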
            while len(self.__centers) <= self.__kmax:
                current_cluster_number = len(self.__centers)

                self.__clusters, self.__centers = self.__improve_parameters(self.__centers)
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers)

                if current_cluster_number == len(allocated_centers):
                    #if ( (current_cluster_number == len(allocated_centers)) or (len(allocated_centers) > self.__kmax) ):
                    break
                else:
                    self.__centers = allocated_centers

            self.__clusters, self.__centers = self.__improve_parameters(self.__centers)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters; each cluster contains indexes of objects from the input data.

        @return (list) List of allocated clusters.

        @see process()
        @see get_centers()

        """

        return self.__clusters


    def get_centers(self):
        """!
        @brief Returns list of centers of allocated clusters.

        @return (list) List of centers of allocated clusters.

        @see process()
        @see get_clusters()

        """

        return self.__centers


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicates how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

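        An illustrative sketch (assuming the cluster_encoder helper from pyclustering.cluster.encoder, and reusing
        'xmeans_instance' and 'sample' from the example in the class description) of how the returned encoding can be
        used to convert the result into per-point labels:
        @code
            from pyclustering.cluster.encoder import cluster_encoder, type_encoding

            clusters = xmeans_instance.get_clusters()

            encoder = cluster_encoder(xmeans_instance.get_cluster_encoding(), clusters, sample)
            encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
            labels = encoder.get_clusters()    # one cluster label per point in 'sample'
        @endcode
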
        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __improve_parameters(self, centers, available_indexes=None):
        """!
        @brief Performs k-means clustering in the specified region.

        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering; if None, then all points are used.

        @return (tuple) List of allocated clusters and list of centers of these clusters, where each cluster contains indexes of objects from the input data.

        """

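        # When 'available_indexes' is provided, k-means is run only on that subset of points (a local region of the
        # data set) and the resulting cluster indexes are mapped back to global indexes afterwards.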
        if available_indexes and len(available_indexes) == 1:
            index_center = available_indexes[0]
            return [available_indexes], self.__pointer_data[index_center]

        local_data = self.__pointer_data
        if available_indexes:
            local_data = [self.__pointer_data[i] for i in available_indexes]

        local_centers = centers
        if centers is None:
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
        kmeans_instance.process()

        local_centers = kmeans_instance.get_centers()

        clusters = kmeans_instance.get_clusters()
        if available_indexes:
            clusters = self.__local_to_global_clusters(clusters, available_indexes)

        return clusters, local_centers


    def __local_to_global_clusters(self, local_clusters, available_indexes):
        """!
        @brief Converts clusters in the local region defined by 'available_indexes' to global clusters.

        @param[in] local_clusters (list): Local clusters in the specific region.
        @param[in] available_indexes (list): Map between local and global point indexes.

        @return Global clusters.

        """

        clusters = []
        for local_cluster in local_clusters:
            current_cluster = []
            for index_point in local_cluster:
                current_cluster.append(available_indexes[index_point])

            clusters.append(current_cluster)

        return clusters


    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and keeps the result that is better according to the splitting criterion.

        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from the data).
        @param[in] centers (list): Centers of clusters.

        @return (list) Allocated centers for clustering.

        """

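        # For every allocated cluster: run 2-means on its points, score the parent against the two children with the
        # configured splitting criterion and keep the split only if it improves the score and free centers remain.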
        allocated_centers = []
        amount_free_centers = self.__kmax - len(centers)

        for index_cluster in range(len(clusters)):
            # solve k-means problem for children where data of the parent are used.
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(None, clusters[index_cluster])

            # If it's possible to split the current data
            if len(parent_child_clusters) > 1:
                # Calculate splitting criterion
                parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
                child_scores = self.__splitting_criterion([parent_child_clusters[0], parent_child_clusters[1]], parent_child_centers)

                split_require = False

                # Reallocate number of centers (clusters) in line with scores
                if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
                    if parent_scores < child_scores:
                        split_require = True

                elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
                    # If the score of the split structure with two children is smaller than that of the parent structure,
                    # then representing the data samples with two clusters is more accurate than using a single parent cluster.
                    if parent_scores > child_scores:
                        split_require = True

                if (split_require is True) and (amount_free_centers > 0):
                    allocated_centers.append(parent_child_centers[0])
                    allocated_centers.append(parent_child_centers[1])

                    amount_free_centers -= 1
                else:
                    allocated_centers.append(centers[index_cluster])

            else:
                allocated_centers.append(centers[index_cluster])

        return allocated_centers


    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion. Interpretation depends on the criterion type: for the Bayesian information
                 criterion a higher value means a better structure, for the minimum noiseless description length a lower value means a better structure.

        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)

        """

        if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
            return self.__bayesian_information_criterion(clusters, centers)

        elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
            return self.__minimum_noiseless_description_length(clusters, centers)

        else:
            assert 0


    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the minimum noiseless description length criterion.

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the minimum noiseless description length criterion.
                 A low value of the splitting criterion means that the current structure is better.

        @see __bayesian_information_criterion(clusters, centers)

        """

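        # Notation used below: K - number of clusters, N - total number of points, W - accumulated average
        # within-cluster distortion, sigma_sqrt - pooled estimate of the distortion variance,
        # alpha and betta - sensitivity constants of the criterion, fixed to 0.9 in this implementation.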
        scores = float('inf')

        W = 0.0
        K = len(clusters)
        N = 0.0

        sigma_sqrt = 0.0

        alpha = 0.9
        betta = 0.9

        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster])
            if Ni == 0:
                return float('inf')

            Wi = 0.0
            for index_object in clusters[index_cluster]:
                # euclidean_distance_square should be used in line with the paper, but in this case results are
                # very poor, therefore the square root is used to improve them.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])

            sigma_sqrt += Wi
            W += Wi / Ni
            N += Ni

        if N - K > 0:
            sigma_sqrt /= (N - K)
            sigma = sigma_sqrt ** 0.5

            Kw = (1.0 - K / N) * sigma_sqrt
            Ks = (2.0 * alpha * sigma / (N ** 0.5)) * ((alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0) ** 0.5

            scores = sigma_sqrt * (2 * K) ** 0.5 * ((2 * K) ** 0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha ** 0.5 * sigma_sqrt / N

        return scores


    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the Bayesian information criterion.

        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the Bayesian information criterion.
                 A high value of the splitting criterion means that the current structure is better.

        @see __minimum_noiseless_description_length(clusters, centers)

        """

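        # The per-cluster score below is the Bayesian information criterion for a spherical Gaussian model,
        # as used by X-Means: score = L - p/2 * ln(N), where L is the log-likelihood of the cluster,
        # p = (K - 1) + dimension * K + 1 is the number of free parameters (cluster probabilities, center
        # coordinates and the shared variance estimate) and N is the total number of points.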
        scores = [float('inf')] * len(clusters)     # splitting criterion
        dimension = len(self.__pointer_data[0])

        # estimation of the noise variance in the data set
        sigma_sqrt = 0.0
        K = len(clusters)
        N = 0.0

        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster])

            N += len(clusters[index_cluster])

        if N - K > 0:
            sigma_sqrt /= (N - K)
            p = (K - 1) + dimension * K + 1

            # in case of the same points, sigma_sqrt can be zero (issue: #407)
            sigma_multiplier = 0.0
            if sigma_sqrt <= 0.0:
                sigma_multiplier = float('-inf')
            else:
                sigma_multiplier = dimension * 0.5 * log(sigma_sqrt)

            # splitting criterion
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster])

                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5

                # BIC calculation
                scores[index_cluster] = L - p * 0.5 * log(N)

        return sum(scores)