kmeans.py
1 """!
2
3 @brief Cluster analysis algorithm: K-Means
4 @details Implementation based on paper @cite inproceedings::kmeans::1.
5
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
9
11  PyClustering is free software: you can redistribute it and/or modify
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24
25 """
26
27
28 import numpy
29 import warnings
30
31 try:
32  import matplotlib.pyplot as plt
33  import matplotlib.animation as animation
34 except Exception as error_instance:
35  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
36  "functionality is not available (details: '%s')." % str(error_instance))
37
38 import pyclustering.core.kmeans_wrapper as wrapper
39
40 from pyclustering.core.wrapper import ccore_library
41 from pyclustering.core.metric_wrapper import metric_wrapper
42
43 from pyclustering.cluster.encoder import type_encoding
44 from pyclustering.cluster import cluster_visualizer
45
46 from pyclustering.utils.metric import distance_metric, type_metric
47
48
50  """!
51  @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
52
53  @see kmeans
54
55  """
56
57  def __init__(self):
58  """!
59  @brief Initializer of observer of K-Means algorithm.
60
61  """
62  self.__evolution_clusters = []
63  self.__evolution_centers = []
64  self.__initial_centers = []
65
66
67  def __len__(self):
68  """!
69  @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
70
71  """
72  return len(self.__evolution_clusters)
73
74
75  def notify(self, clusters, centers):
76  """!
77  @brief This method is called by K-Means algorithm to notify about changes.
78
79  @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
80  @param[in] centers (array_like): Allocated centers by K-Means algorithm.
81
82  """
83  self.__evolution_clusters.append(clusters)
84  self.__evolution_centers.append(centers)
85
86
87  def set_evolution_centers(self, evolution_centers):
88  """!
89  @brief Set evolution of changes of centers during clustering process.
90
91  @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
92
93  """
94  self.__evolution_centers = evolution_centers
95
96
97  def get_centers(self, iteration):
98  """!
99  @brief Get method to return centers at specific iteration of clustering process.
100
101  @param[in] iteration (uint): Clustering process iteration at which centers are required.
102
103  @return (array_like) Centers at specific iteration.
104
105  """
106  return self.__evolution_centers[iteration]
107
108
109  def set_evolution_clusters(self, evolution_clusters):
110  """!
111  @brief Set evolution of changes of centers during clustering process.
112
113  @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
114
115  """
116  self.__evolution_clusters = evolution_clusters
117
118
119  def get_clusters(self, iteration):
120  """!
121  @brief Get method to return allocated clusters at specific iteration of clustering process.
122
123  @param[in] iteration (uint): Clustering process iteration at which clusters are required.
124
125  @return (array_like) Clusters at specific iteration.
126
127  """
128  return self.__evolution_clusters[iteration]
129
130
131
133  """!
134  @brief Visualizer of K-Means algorithm's results.
135  @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
136
137  """
138
139  __default_2d_marker_size = 15
140  __default_3d_marker_size = 70
141
142
143  @staticmethod
144  def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
145  """!
146  @brief Display K-Means clustering results.
147
148  @param[in] sample (list): Dataset that was used for clustering.
149  @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
150  @param[in] centers (array_like): Centers that were allocated by the algorithm.
151  @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
152  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
153
154  <b>Keyword Args:</b><br>
155  - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
156  - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
157  - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
158
159  @return (figure) Figure where clusters were drawn.
160
161  """
162
163  visualizer = cluster_visualizer()
164  visualizer.append_clusters(clusters, sample)
165
166  offset = kwargs.get('offset', 0)
167  figure = kwargs.get('figure', None)
168  display = kwargs.get('display', True)
169
170  if figure is None:
171  figure = visualizer.show(display = False)
172  else:
173  visualizer.show(figure = figure, display = False)
174
175  kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
176  kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
177
178  if display is True:
179  plt.show()
180
181  return figure
182
183
184  @staticmethod
185  def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
186  ax = figure.get_axes()[offset]
187
188  for index_cluster in range(len(clusters)):
189  color = visualizer.get_cluster_color(index_cluster, 0)
190  kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
191
192
193  @staticmethod
194  def __draw_cluster_rays(ax, color, sample, cluster, center):
195  dimension = len(sample[0])
196
197  for index_point in cluster:
198  point = sample[index_point]
199  if dimension == 1:
200  ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5)
201  elif dimension == 2:
202  ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5)
203  elif dimension == 3:
204  ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
205
206
207  @staticmethod
208  def __draw_center(ax, center, color, marker, alpha):
209  dimension = len(center)
210
211  if dimension == 1:
212  ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
213  elif dimension == 2:
214  ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
215  elif dimension == 3:
216  ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
217
218
219  @staticmethod
220  def __draw_centers(figure, offset, visualizer, centers, initial_centers):
221  ax = figure.get_axes()[offset]
222
223  for index_center in range(len(centers)):
224  color = visualizer.get_cluster_color(index_center, 0)
225  kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0)
226
227  if initial_centers is not None:
228  kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)
229
230
231  @staticmethod
232  def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
233  """!
234  @brief Animates clustering process that is performed by K-Means algorithm.
235
236  @param[in] data (list): Dataset that is used for clustering.
237  @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
238  @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
239  @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
240  @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
241
242  """
243  figure = plt.figure()
244
245  def init_frame():
246  return frame_generation(0)
247
248  def frame_generation(index_iteration):
249  figure.clf()
250
251  figure.suptitle("K-Means algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')
252
253  clusters = observer.get_clusters(index_iteration)
254  centers = observer.get_centers(index_iteration)
255  kmeans_visualizer.show_clusters(data, clusters, centers, None, figure=figure, display=False)
256
258
259  return [figure.gca()]
260
261  iterations = len(observer)
262  cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
263  init_func=init_frame, repeat_delay=5000)
264
265  if save_movie is not None:
266  cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)
267  else:
268  plt.show()
269
270
271
272 class kmeans:
273  """!
274  @brief Class represents K-Means clustering algorithm.
275  @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
276
277  CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
278
279  K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization
280  initial centers from module 'pyclustering.cluster.center_initializer'.
281
282  @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
283
284  Example #1 - Trivial clustering:
285  @code
286  # load list of points for cluster analysis
288
289  # create instance of K-Means algorithm
290  kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ])
291
292  # run cluster analysis and obtain results
293  kmeans_instance.process()
294  clusters = kmeans_instance.get_clusters()
295  @endcode
296
297  Example #2 - Clustering using K-Means++ for center initialization:
298  @code
299  # load list of points for cluster analysis
301
302  # initialize initial centers using K-Means++ method
303  initial_centers = kmeans_plusplus_initializer(sample, 3).initialize()
304
305  # create instance of K-Means algorithm with prepared centers
306  kmeans_instance = kmeans(sample, initial_centers)
307
308  # run cluster analysis and obtain results
309  kmeans_instance.process()
310  clusters = kmeans_instance.get_clusters()
311  final_centers = kmeans_instance.get_centers()
312  @endcode
313
314  @see center_initializer
315
316  """
317
318  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
319  """!
320  @brief Constructor of clustering algorithm K-Means.
321  @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
322
323  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
324  @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
325  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
326  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
327  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
328
329  <b>Keyword Args:</b><br>
330  - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
331  - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
332  - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
333
334  @see center_initializer
335
336  """
337  self.__pointer_data = numpy.array(data)
338  self.__clusters = []
339  self.__centers = numpy.array(initial_centers)
340  self.__tolerance = tolerance
341  self.__total_wce = 0
342
343  self.__observer = kwargs.get('observer', None)
344  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
345  self.__maxiter = kwargs.get('maxiter', 200)
346
347  if self.__metric.get_type() != type_metric.USER_DEFINED:
348  self.__metric.enable_numpy_usage()
349  else:
350  self.__metric.disable_numpy_usage()
351
352  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
353  if self.__ccore is True:
354  self.__ccore = ccore_library.workable()
355
356
357  def process(self):
358  """!
359  @brief Performs cluster analysis in line with rules of K-Means algorithm.
360
361  @return (kmeans) Returns itself (K-Means instance).
362
363  @remark Results of clustering can be obtained using corresponding get methods.
364
365  @see get_clusters()
366  @see get_centers()
367
368  """
369
370  if len(self.__pointer_data[0]) != len(self.__centers[0]):
371  raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")
372
373  if self.__ccore is True:
374  self.__process_by_ccore()
375  else:
376  self.__process_by_python()
377
378  return self
379
380
381  def __process_by_ccore(self):
382  """!
383  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
384
385  """
386  ccore_metric = metric_wrapper.create_instance(self.__metric)
387
388  results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, (self.__observer is not None), ccore_metric.get_pointer())
389  self.__clusters = results[0]
390  self.__centers = results[1]
391
392  if self.__observer is not None:
393  self.__observer.set_evolution_clusters(results[2])
394  self.__observer.set_evolution_centers(results[3])
395
396  self.__total_wce = results[4][0]
397
398
399  def __process_by_python(self):
400  """!
401  @brief Performs cluster analysis using python code.
402
403  """
404
405  maximum_change = float('inf')
406  stop_condition = self.__tolerance * self.__tolerance
407  iteration = 0
408
409  if self.__observer is not None:
410  initial_clusters = self.__update_clusters()
411  self.__observer.notify(initial_clusters, self.__centers.tolist())
412
413  while maximum_change > stop_condition and iteration < self.__maxiter:
414  self.__clusters = self.__update_clusters()
415  updated_centers = self.__update_centers() # changes should be calculated before assignment
416
417  if self.__observer is not None:
418  self.__observer.notify(self.__clusters, updated_centers.tolist())
419
420  maximum_change = self.__calculate_changes(updated_centers)
421
422  self.__centers = updated_centers # assign center after change calculation
423  iteration += 1
424
425  self.__calculate_total_wce()
426
427
428  def get_clusters(self):
429  """!
430  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
431
432  @see process()
433  @see get_centers()
434
435  """
436
437  return self.__clusters
438
439
440  def get_centers(self):
441  """!
442  @brief Returns list of centers of allocated clusters.
443
444  @see process()
445  @see get_clusters()
446
447  """
448
449  if isinstance(self.__centers, list):
450  return self.__centers
451
452  return self.__centers.tolist()
453
454
455  def get_total_wce(self):
456  """!
457  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
458  @details Sum of metric errors is calculated using distance between point and its center:
459  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
460
461  @see process()
462  @see get_clusters()
463
464  """
465
466  return self.__total_wce
467
468
470  """!
471  @brief Returns clustering result representation type that indicate how clusters are encoded.
472
473  @return (type_encoding) Clustering result representation.
474
475  @see get_clusters()
476
477  """
478
479  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
480
481
482  def __update_clusters(self):
483  """!
484  @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
485
486  @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
487
488  """
489
490  clusters = [[] for _ in range(len(self.__centers))]
491
492  dataset_differences = self.__calculate_dataset_difference(len(clusters))
493
494  optimum_indexes = numpy.argmin(dataset_differences, axis=0)
495  for index_point in range(len(optimum_indexes)):
496  index_cluster = optimum_indexes[index_point]
497  clusters[index_cluster].append(index_point)
498
499  clusters = [cluster for cluster in clusters if len(cluster) > 0]
500
501  return clusters
502
503
504  def __update_centers(self):
505  """!
506  @brief Calculate centers of clusters in line with contained objects.
507
508  @return (numpy.matrix) Updated centers as list of centers.
509
510  """
511
512  dimension = self.__pointer_data.shape[1]
513  centers = numpy.zeros((len(self.__clusters), dimension))
514
515  for index in range(len(self.__clusters)):
516  cluster_points = self.__pointer_data[self.__clusters[index], :]
517  centers[index] = cluster_points.mean(axis=0)
518
519  return numpy.array(centers)
520
521
522  def __calculate_total_wce(self):
523  """!
524  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.
525
526  """
527
528  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
529
530  self.__total_wce = 0
531  for index_cluster in range(len(self.__clusters)):
532  for index_point in self.__clusters[index_cluster]:
533  self.__total_wce += dataset_differences[index_cluster][index_point]
534
535
536  def __calculate_dataset_difference(self, amount_clusters):
537  """!
538  @brief Calculate distance from each point to each cluster center.
539
540  """
541  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
542  for index_center in range(amount_clusters):
543  if self.__metric.get_type() != type_metric.USER_DEFINED:
544  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__centers[index_center])
545  else:
546  dataset_differences[index_center] = [ self.__metric(point, self.__centers[index_center])
547  for point in self.__pointer_data ]
548
549  return dataset_differences
550
551
552  def __calculate_changes(self, updated_centers):
553  """!
554  @brief Calculates changes estimation between previous and current iteration using centers for that purpose.
555
556  @param[in] updated_centers (array_like): New cluster centers.
557
558  @return (float) Maximum changes between centers.
559
560  """
561  if len(self.__centers) != len(updated_centers):
562  maximum_change = float('inf')
563
564  else:
565  changes = self.__metric(self.__centers, updated_centers)
566  maximum_change = numpy.max(changes)
567
568  return maximum_change
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def notify(self, clusters, centers)
This method is called by K-Means algorithm to notify about changes.
Definition: kmeans.py:75
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: kmeans.py:440
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmeans.py:536
def process(self)
Performs cluster analysis in line with rules of K-Means algorithm.
Definition: kmeans.py:357
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def get_clusters(self, iteration)
Get method to return allocated clusters at specific iteration of clustering process.
Definition: kmeans.py:119
def __calculate_changes(self, updated_centers)
Calculates changes estimation between previous and current iteration using centers for that purpose...
Definition: kmeans.py:552
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:58
Observer of K-Means algorithm that is used to collect information about clustering process on each it...
Definition: kmeans.py:49
def __update_centers(self)
Calculate centers of clusters in line with contained objects.
Definition: kmeans.py:504
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmeans.py:455
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272
def __init__(self)
Initializer of observer of K-Means algorithm.
Definition: kmeans.py:57
Visualizer of K-Means algorithm&#39;s results.
Definition: kmeans.py:132
def get_centers(self, iteration)
Get method to return centers at specific iteration of clustering process.
Definition: kmeans.py:97
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm...
Definition: kmeans.py:522
def set_evolution_centers(self, evolution_centers)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:87
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Means.
Definition: kmeans.py:318
def __len__(self)
Returns amount of steps that were observer during clustering process in K-Means algorithm.
Definition: kmeans.py:67
def set_evolution_clusters(self, evolution_clusters)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:109
def show_clusters(sample, clusters, centers, initial_centers=None, kwargs)
Display K-Means clustering results.
Definition: kmeans.py:144
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmeans.py:428
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: kmeans.py:381
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmeans.py:469
def __update_clusters(self)
Calculate Euclidean distance to each point from the each cluster.
Definition: kmeans.py:482
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: kmeans.py:399
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None)
Animates clustering process that is performed by K-Means algorithm.
Definition: kmeans.py:232