"""!

@brief Cluster analysis algorithm: X-Means
@details Implementation based on paper @cite article::xmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2019
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import random

from enum import IntEnum

from math import log

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

from pyclustering.core.wrapper import ccore_library

import pyclustering.core.xmeans_wrapper as wrapper

from pyclustering.utils import euclidean_distance_square, euclidean_distance


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used for cluster splitting in the X-Means algorithm.

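    The criterion is selected via the 'criterion' argument of the xmeans constructor. A minimal sketch
    (illustrative only; 'sample' and 'initial_centers' are assumed to be prepared as in the example of the
    xmeans class below):
    @code
        from pyclustering.cluster.xmeans import xmeans, splitting_type

        xmeans_mndl = xmeans(sample, initial_centers, 20,
                             criterion=splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH)
        xmeans_mndl.process()
    @endcode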
    """

    ## Bayesian information criterion (BIC) is used as the splitting criterion: a higher score means a better structure.
    BAYESIAN_INFORMATION_CRITERION = 0

    ## Minimum noiseless description length (MNDL) is used as the splitting criterion: a lower score means a better structure.
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1


class xmeans:
    """!
    @brief Class represents the clustering algorithm X-Means.
    @details The X-Means clustering method starts with the assumption of having a minimum number of clusters,
              and then dynamically increases them. X-Means uses the specified splitting criterion to control
              the process of splitting clusters. The K-Means++ method can be used to calculate the initial centers.

              The CCORE implementation of the algorithm uses a thread pool to parallelize the clustering process.

    Here is an example of how to perform cluster analysis using the X-Means algorithm:
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.xmeans import xmeans
        from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read sample 'simple3' from file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Prepare initial centers - the amount of initial centers defines the amount of clusters from which X-Means
        # starts analysis.
        amount_initial_centers = 2
        initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()

        # Create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters, the maximum
        # number of clusters that can be allocated is 20.
        xmeans_instance = xmeans(sample, initial_centers, 20)
        xmeans_instance.process()

        # Extract clustering results: clusters and their centers
        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.append_cluster(centers, None, marker='*', markersize=10)
        visualizer.show()
    @endcode

    Visualization of the clustering results obtained by the code above, where the X-Means algorithm allocates four clusters:
    @image html xmeans_clustering_simple3.png "Fig. 1. X-Means clustering results (data 'Simple3')."

    @see center_initializer

    """

    def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True):
        """!
        @brief Constructor of clustering algorithm X-Means.

        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers that are represented by a list: [center1, center2, ...],
                    if it is not specified then X-Means starts from a single random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of the cluster centers is less than the tolerance, the algorithm stops processing.
        @param[in] criterion (splitting_type): Splitting criterion that is used to control cluster splitting.
        @param[in] ccore (bool): Defines whether the CCORE (C++ pyclustering library) implementation should be used instead of the Python code.

        """

        self.__pointer_data = data
        self.__clusters = []

        if initial_centers is not None:
            self.__centers = initial_centers[:]
        else:
            self.__centers = [[random.random() for _ in range(len(data[0]))]]

        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion

        self.__ccore = ccore
        if self.__ccore:
            self.__ccore = ccore_library.workable()


    def process(self):
        """!
        @brief Performs cluster analysis in line with the rules of the X-Means algorithm.

        @remark Results of clustering can be obtained using the corresponding getter methods.

        @see get_clusters()
        @see get_centers()

        """

        if self.__ccore is True:
            self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)

        else:
            self.__clusters = []
            while len(self.__centers) <= self.__kmax:
                current_cluster_number = len(self.__centers)

                self.__clusters, self.__centers = self.__improve_parameters(self.__centers)
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers)

                if current_cluster_number == len(allocated_centers):
                    #if ( (current_cluster_number == len(allocated_centers)) or (len(allocated_centers) > self.__kmax) ):
                    break
                else:
                    self.__centers = allocated_centers

            self.__clusters, self.__centers = self.__improve_parameters(self.__centers)


    def get_clusters(self):
        """!
        @brief Returns a list of allocated clusters; each cluster contains indexes of objects from the input data.

        @return (list) List of allocated clusters.

        @see process()
        @see get_centers()

        """

        return self.__clusters


    def get_centers(self):
        """!
        @brief Returns a list of centers of the allocated clusters.

        @return (list) List of centers of the allocated clusters.

        @see process()
        @see get_clusters()

        """

        return self.__centers


    def get_cluster_encoding(self):
        """!
        @brief Returns the clustering result representation type that indicates how the clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

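        The index-list representation returned by get_clusters() can be converted to other representations.
        A minimal sketch, assuming the cluster_encoder API from pyclustering.cluster.encoder and that
        'xmeans_instance' has already processed 'sample':
        @code
            from pyclustering.cluster.encoder import cluster_encoder, type_encoding

            encoder = cluster_encoder(xmeans_instance.get_cluster_encoding(),
                                      xmeans_instance.get_clusters(), sample)
            encoder.set_encoding(type_encoding.CLUSTER_OBJECT_LIST_SEPARATION)
            clusters_as_points = encoder.get_clusters()
        @endcode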
        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __improve_parameters(self, centers, available_indexes=None):
        """!
        @brief Performs k-means clustering in the specified region.

        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering, if None then all points are used.

        @return (tuple) Allocated clusters and their centers: (clusters, centers), where each cluster contains indexes of objects from the data.

        """

        if available_indexes and len(available_indexes) == 1:
            index_center = available_indexes[0]
            return [available_indexes], self.__pointer_data[index_center]

        local_data = self.__pointer_data
        if available_indexes:
            local_data = [self.__pointer_data[i] for i in available_indexes]

        local_centers = centers
        if centers is None:
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
        kmeans_instance.process()

        local_centers = kmeans_instance.get_centers()

        clusters = kmeans_instance.get_clusters()
        if available_indexes:
            clusters = self.__local_to_global_clusters(clusters, available_indexes)

        return clusters, local_centers


    def __local_to_global_clusters(self, local_clusters, available_indexes):
        """!
        @brief Converts clusters in the local region defined by 'available_indexes' to global clusters.

        @param[in] local_clusters (list): Local clusters in the specific region.
        @param[in] available_indexes (list): Map between local and global point indexes.

        @return Global clusters.

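        For example, the local clusters [[0, 2], [1]] together with available_indexes [5, 7, 9]
        are converted to the global clusters [[5, 9], [7]].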
        """

        clusters = []
        for local_cluster in local_clusters:
            current_cluster = []
            for index_point in local_cluster:
                current_cluster.append(available_indexes[index_point])

            clusters.append(current_cluster)

        return clusters


    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and checks for the best result using the splitting criterion.

        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from the data).
        @param[in] centers (list): Centers of clusters.

        @return (list) Allocated centers for clustering.

        """

        allocated_centers = []
        amount_free_centers = self.__kmax - len(centers)

        for index_cluster in range(len(clusters)):
            # solve k-means problem for children where data of parent are used.
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(None, clusters[index_cluster])

            # If it's possible to split current data
            if len(parent_child_clusters) > 1:
                # Calculate splitting criterion
                parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
                child_scores = self.__splitting_criterion([parent_child_clusters[0], parent_child_clusters[1]], parent_child_centers)

                split_require = False

                # Reallocate number of centers (clusters) in line with scores
                if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
                    if parent_scores < child_scores: split_require = True

                elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
                    # If the score for the split structure with two children is smaller than that for the parent structure,
                    # then representing the data samples with two clusters is more accurate in comparison to a single parent cluster.
                    if parent_scores > child_scores: split_require = True

                if (split_require is True) and (amount_free_centers > 0):
                    allocated_centers.append(parent_child_centers[0])
                    allocated_centers.append(parent_child_centers[1])

                    amount_free_centers -= 1
                else:
                    allocated_centers.append(centers[index_cluster])


            else:
                allocated_centers.append(centers[index_cluster])

        return allocated_centers


    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for the input clusters.

        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion. Its interpretation depends on the criterion type: for the Bayesian
                  information criterion a higher value means a better structure, for the minimum noiseless
                  description length a lower value means a better structure.

        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)

        """

        if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
            return self.__bayesian_information_criterion(clusters, centers)

        elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
            return self.__minimum_noiseless_description_length(clusters, centers)

        else:
            assert 0


    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for the input clusters using the minimum noiseless description length (MNDL) criterion.

        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the minimum noiseless description length criterion.
                  A low value of the splitting criterion means that the current structure is better.

        @see __bayesian_information_criterion(clusters, centers)

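        For reference, the value computed by the implementation below can be summarized as follows
        (the notation is introduced here only to describe this implementation):
        \f[
        J = \sigma^2 \sqrt{2K} (\sqrt{2K} + \beta) / N + W - \sigma^2 + K_s + 2 \sqrt{\alpha} \sigma^2 / N,
        \f]
        \f[
        K_s = (2 \alpha \sigma / \sqrt{N}) \sqrt{\alpha^2 \sigma^2 / N + W - K_w / 2}, \qquad
        K_w = (1 - K / N) \sigma^2,
        \f]
        where \f$ \alpha = \beta = 0.9 \f$, \f$ W_j \f$ is the sum of Euclidean distances from the points of
        cluster \f$ j \f$ to its center, \f$ W = \sum_j (W_j / n_j) \f$, \f$ \sigma^2 = (\sum_j W_j) / (N - K) \f$,
        \f$ N \f$ is the total number of points and \f$ K \f$ is the number of clusters.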
        """

        scores = float('inf')

        W = 0.0
        K = len(clusters)
        N = 0.0

        sigma_sqrt = 0.0

        alpha = 0.9
        betta = 0.9

        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster])
            if Ni == 0:
                return float('inf')

            Wi = 0.0
            for index_object in clusters[index_cluster]:
                # euclidean_distance_square should be used in line with the paper, but in this case the results are
                # very poor, therefore the square root (plain Euclidean distance) is used to improve them.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])

            sigma_sqrt += Wi
            W += Wi / Ni
            N += Ni

        if N - K > 0:
            sigma_sqrt /= (N - K)
            sigma = sigma_sqrt ** 0.5

            Kw = (1.0 - K / N) * sigma_sqrt
            Ks = (2.0 * alpha * sigma / (N ** 0.5)) * ((alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0) ** 0.5

            scores = sigma_sqrt * (2 * K) ** 0.5 * ((2 * K) ** 0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha ** 0.5 * sigma_sqrt / N

        return scores


    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for the input clusters using the Bayesian information criterion (BIC).

        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.

        @return (double) Splitting criterion in line with the Bayesian information criterion.
                  A high value of the splitting criterion means that the current structure is better.

        @see __minimum_noiseless_description_length(clusters, centers)

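        For reference, the returned value is the sum over the clusters of the per-cluster scores computed below
        (the notation is introduced here only to describe this implementation):
        \f[
        L_j = n_j \ln(n_j) - n_j \ln(N) - 0.5 n_j \ln(2 \pi) - 0.5 n_j d \ln(\sigma^2) - 0.5 (n_j - K), \qquad
        BIC_j = L_j - 0.5 p \ln(N),
        \f]
        where \f$ n_j \f$ is the size of cluster \f$ j \f$, \f$ N \f$ is the total number of points, \f$ K \f$ is the
        number of clusters, \f$ d \f$ is the data dimension, \f$ \sigma^2 = \sum_i \| x_i - c(x_i) \|^2 / (N - K) \f$
        is the pooled variance estimate and \f$ p = (K - 1) + d K + 1 \f$ is the number of free parameters.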
        """

        scores = [float('inf')] * len(clusters)     # splitting criterion
        dimension = len(self.__pointer_data[0])

        # estimation of the noise variance in the data set
        sigma_sqrt = 0.0
        K = len(clusters)
        N = 0.0

        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster])

            N += len(clusters[index_cluster])

        if N - K > 0:
            sigma_sqrt /= (N - K)
            p = (K - 1) + dimension * K + 1

            # in case of the same points, sigma_sqrt can be zero (issue: #407)
            sigma_multiplier = 0.0
            if sigma_sqrt <= 0.0:
                sigma_multiplier = float('-inf')
            else:
                sigma_multiplier = dimension * 0.5 * log(sigma_sqrt)

            # splitting criterion
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster])

                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5

                # BIC calculation
                scores[index_cluster] = L - p * 0.5 * log(N)

        return sum(scores)