kmeans.py
1 """!
2
3 @brief Cluster analysis algorithm: K-Means
4 @details Implementation based on paper @cite inproceedings::kmeans::1.
5
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
9
11  PyClustering is free software: you can redistribute it and/or modify
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24
25 """
26
27
28 import numpy
29 import warnings
30
31 try:
32  import matplotlib.pyplot as plt
33  import matplotlib.animation as animation
34 except Exception as error_instance:
35  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
36  "functionality is not available (details: '%s')." % str(error_instance))
37
38 import pyclustering.core.kmeans_wrapper as wrapper
39
40 from pyclustering.core.wrapper import ccore_library
41 from pyclustering.core.metric_wrapper import metric_wrapper
42
43 from pyclustering.cluster.encoder import type_encoding
44 from pyclustering.cluster import cluster_visualizer
45
46 from pyclustering.utils.metric import distance_metric, type_metric
47
48
50  """!
51  @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
52
53  @see kmeans
54
55  """
56
57  def __init__(self):
58  """!
59  @brief Initializer of observer of K-Means algorithm.
60
61  """
62  self.__evolution_clusters = []
63  self.__evolution_centers = []
64  self.__initial_centers = []
65
66
67  def __len__(self):
68  """!
69  @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
70
71  """
72  return len(self.__evolution_clusters)
73
74
75  def notify(self, clusters, centers):
76  """!
77  @brief This method is called by K-Means algorithm to notify about changes.
78
79  @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
80  @param[in] centers (array_like): Allocated centers by K-Means algorithm.
81
82  """
83  self.__evolution_clusters.append(clusters)
84  self.__evolution_centers.append(centers)
85
86
87  def set_evolution_centers(self, evolution_centers):
88  """!
89  @brief Set evolution of changes of centers during clustering process.
90
91  @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
92
93  """
94  self.__evolution_centers = evolution_centers
95
96
97  def get_centers(self, iteration):
98  """!
99  @brief Get method to return centers at specific iteration of clustering process.
100
101  @param[in] iteration (uint): Clustering process iteration at which centers are required.
102
103  @return (array_like) Centers at specific iteration.
104
105  """
106  return self.__evolution_centers[iteration]
107
108
109  def set_evolution_clusters(self, evolution_clusters):
110  """!
111  @brief Set evolution of changes of centers during clustering process.
112
113  @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
114
115  """
116  self.__evolution_clusters = evolution_clusters
117
118
119  def get_clusters(self, iteration):
120  """!
121  @brief Get method to return allocated clusters at specific iteration of clustering process.
122
123  @param[in] iteration (uint): Clustering process iteration at which clusters are required.
124
125  @return (array_like) Clusters at specific iteration.
126
127  """
128  return self.__evolution_clusters[iteration]
129
130
131
133  """!
134  @brief Visualizer of K-Means algorithm's results.
135  @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
136
137  """
138
139  __default_2d_marker_size = 15
140  __default_3d_marker_size = 70
141
142
143  @staticmethod
144  def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
145  """!
146  @brief Display K-Means clustering results.
147
148  @param[in] sample (list): Dataset that was used for clustering.
149  @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
150  @param[in] centers (array_like): Centers that were allocated by the algorithm.
151  @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
152  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
153
154  <b>Keyword Args:</b><br>
155  - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
156  - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
157  - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
158
159  @return (figure) Figure where clusters were drawn.
160
161  """
162
163  visualizer = cluster_visualizer()
164  visualizer.append_clusters(clusters, sample)
165
166  offset = kwargs.get('offset', 0)
167  figure = kwargs.get('figure', None)
168  display = kwargs.get('display', True)
169
170  if figure is None:
171  figure = visualizer.show(display = False)
172  else:
173  visualizer.show(figure = figure, display = False)
174
175  kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
176  kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
177
178  if display is True:
179  plt.show()
180
181  return figure
182
183
184  @staticmethod
185  def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
186  ax = figure.get_axes()[offset]
187
188  for index_cluster in range(len(clusters)):
189  color = visualizer.get_cluster_color(index_cluster, 0)
190  kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
191
192
193  @staticmethod
194  def __draw_cluster_rays(ax, color, sample, cluster, center):
195  dimension = len(sample[0])
196
197  for index_point in cluster:
198  point = sample[index_point]
199  if dimension == 1:
200  ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5)
201  elif dimension == 2:
202  ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5)
203  elif dimension == 3:
204  ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
205
206
207  @staticmethod
208  def __draw_center(ax, center, color, marker, alpha):
209  dimension = len(center)
210
211  if dimension == 1:
212  ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
213  elif dimension == 2:
214  ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
215  elif dimension == 3:
216  ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
217
218
219  @staticmethod
220  def __draw_centers(figure, offset, visualizer, centers, initial_centers):
221  ax = figure.get_axes()[offset]
222
223  for index_center in range(len(centers)):
224  color = visualizer.get_cluster_color(index_center, 0)
225  kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0)
226
227  if initial_centers is not None:
228  kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)
229
230
231  @staticmethod
232  def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
233  """!
234  @brief Animates clustering process that is performed by K-Means algorithm.
235
236  @param[in] data (list): Dataset that is used for clustering.
237  @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
238  @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
239  @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
240  @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
241
242  """
243  figure = plt.figure()
244
245  def init_frame():
246  return frame_generation(0)
247
248  def frame_generation(index_iteration):
249  figure.clf()
250
251  figure.suptitle("K-Means algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')
252
253  clusters = observer.get_clusters(index_iteration)
254  centers = observer.get_centers(index_iteration)
255  kmeans_visualizer.show_clusters(data, clusters, centers, None, figure=figure, display=False)
256
258
259  return [figure.gca()]
260
261  iterations = len(observer)
262  cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
263  init_func=init_frame, repeat_delay=5000)
264
265  if save_movie is not None:
266  cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)
267  else:
268  plt.show()
269
270
271
272 class kmeans:
273  """!
274  @brief Class represents K-Means clustering algorithm.
275  @details CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
276
277  K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization
278  initial centers from module 'pyclustering.cluster.center_initializer'.
279
280  @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
281
282  Example #1 - Clustering using K-Means++ for center initialization:
283  @code
284  from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
285  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
286  from pyclustering.samples.definitions import FCPS_SAMPLES
288
289  # Load list of points for cluster analysis.
291
292  # Prepare initial centers using K-Means++ method.
293  initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
294
295  # Create instance of K-Means algorithm with prepared centers.
296  kmeans_instance = kmeans(sample, initial_centers)
297
298  # Run cluster analysis and obtain results.
299  kmeans_instance.process()
300  clusters = kmeans_instance.get_clusters()
301  final_centers = kmeans_instance.get_centers()
302
303  # Visualize obtained results
304  kmeans_visualizer.show_clusters(sample, clusters, final_centers)
305  @endcode
306
307  Example #2 - Clustering using specific distance metric, for example, Manhattan distance:
308  @code
309  # prepare input data and initial centers for cluster analysis using K-Means
310
311  # create metric that will be used for clustering
312  manhattan_metric = distance_metric(type_metric.MANHATTAN)
313
314  # create instance of K-Means using specific distance metric:
315  kmeans_instance = kmeans(sample, initial_centers, metric=manhattan_metric)
316
317  # run cluster analysis and obtain results
318  kmeans_instance.process()
319  clusters = kmeans_instance.get_clusters()
320  @endcode
321
322  @see center_initializer
323
324  """
325
326  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
327  """!
328  @brief Constructor of clustering algorithm K-Means.
329  @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
330
331  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
332  @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
333  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
334  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
335  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
336
337  <b>Keyword Args:</b><br>
338  - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
339  - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
340  - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
341
342  @see center_initializer
343
344  """
345  self.__pointer_data = numpy.array(data)
346  self.__clusters = []
347  self.__centers = numpy.array(initial_centers)
348  self.__tolerance = tolerance
349  self.__total_wce = 0
350
351  self.__observer = kwargs.get('observer', None)
352  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
353  self.__itermax = kwargs.get('itermax', 100)
354
355  if self.__metric.get_type() != type_metric.USER_DEFINED:
356  self.__metric.enable_numpy_usage()
357  else:
358  self.__metric.disable_numpy_usage()
359
360  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
361  if self.__ccore is True:
362  self.__ccore = ccore_library.workable()
363
364
365  def process(self):
366  """!
367  @brief Performs cluster analysis in line with rules of K-Means algorithm.
368
369  @return (kmeans) Returns itself (K-Means instance).
370
371  @remark Results of clustering can be obtained using corresponding get methods.
372
373  @see get_clusters()
374  @see get_centers()
375
376  """
377
378  if len(self.__pointer_data[0]) != len(self.__centers[0]):
379  raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")
380
381  if self.__ccore is True:
382  self.__process_by_ccore()
383  else:
384  self.__process_by_python()
385
386  return self
387
388
389  def __process_by_ccore(self):
390  """!
391  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
392
393  """
394  ccore_metric = metric_wrapper.create_instance(self.__metric)
395
396  results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, self.__itermax, (self.__observer is not None), ccore_metric.get_pointer())
397  self.__clusters = results[0]
398  self.__centers = results[1]
399
400  if self.__observer is not None:
401  self.__observer.set_evolution_clusters(results[2])
402  self.__observer.set_evolution_centers(results[3])
403
404  self.__total_wce = results[4][0]
405
406
407  def __process_by_python(self):
408  """!
409  @brief Performs cluster analysis using python code.
410
411  """
412
413  maximum_change = float('inf')
414  iteration = 0
415
416  if self.__observer is not None:
417  initial_clusters = self.__update_clusters()
418  self.__observer.notify(initial_clusters, self.__centers.tolist())
419
420  while maximum_change > self.__tolerance and iteration < self.__itermax:
421  self.__clusters = self.__update_clusters()
422  updated_centers = self.__update_centers() # changes should be calculated before assignment
423
424  if self.__observer is not None:
425  self.__observer.notify(self.__clusters, updated_centers.tolist())
426
427  maximum_change = self.__calculate_changes(updated_centers)
428
429  self.__centers = updated_centers # assign center after change calculation
430  iteration += 1
431
432  self.__calculate_total_wce()
433
434
435  def predict(self, points):
436  """!
437  @brief Calculates the closest cluster to each point.
438
439  @param[in] points (array_like): Points for which closest clusters are calculated.
440
441  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
442  collection if 'process()' method was not called.
443
444  """
445
446  nppoints = numpy.array(points)
447  if len(self.__clusters) == 0:
448  return []
449
450  differences = numpy.zeros((len(nppoints), len(self.__centers)))
451  for index_point in range(len(nppoints)):
452  if self.__metric.get_type() != type_metric.USER_DEFINED:
453  differences[index_point] = self.__metric(nppoints[index_point], self.__centers)
454  else:
455  differences[index_point] = [ self.__metric(nppoints[index_point], center) for center in self.__centers ]
456
457  return numpy.argmin(differences, axis=1)
458
459
460  def get_clusters(self):
461  """!
462  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
463
464  @see process()
465  @see get_centers()
466
467  """
468
469  return self.__clusters
470
471
472  def get_centers(self):
473  """!
474  @brief Returns list of centers of allocated clusters.
475
476  @see process()
477  @see get_clusters()
478
479  """
480
481  if isinstance(self.__centers, list):
482  return self.__centers
483
484  return self.__centers.tolist()
485
486
487  def get_total_wce(self):
488  """!
489  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
490  @details Sum of metric errors is calculated using distance between point and its center:
491  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
492
493  @see process()
494  @see get_clusters()
495
496  """
497
498  return self.__total_wce
499
500
502  """!
503  @brief Returns clustering result representation type that indicate how clusters are encoded.
504
505  @return (type_encoding) Clustering result representation.
506
507  @see get_clusters()
508
509  """
510
511  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
512
513
514  def __update_clusters(self):
515  """!
516  @brief Calculate distance (in line with specified metric) to each point from the each cluster. Nearest points
517  are captured by according clusters and as a result clusters are updated.
518
519  @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
520
521  """
522
523  clusters = [[] for _ in range(len(self.__centers))]
524
525  dataset_differences = self.__calculate_dataset_difference(len(clusters))
526
527  optimum_indexes = numpy.argmin(dataset_differences, axis=0)
528  for index_point in range(len(optimum_indexes)):
529  index_cluster = optimum_indexes[index_point]
530  clusters[index_cluster].append(index_point)
531
532  clusters = [cluster for cluster in clusters if len(cluster) > 0]
533
534  return clusters
535
536
537  def __update_centers(self):
538  """!
539  @brief Calculate centers of clusters in line with contained objects.
540
541  @return (numpy.array) Updated centers.
542
543  """
544
545  dimension = self.__pointer_data.shape[1]
546  centers = numpy.zeros((len(self.__clusters), dimension))
547
548  for index in range(len(self.__clusters)):
549  cluster_points = self.__pointer_data[self.__clusters[index], :]
550  centers[index] = cluster_points.mean(axis=0)
551
552  return numpy.array(centers)
553
554
555  def __calculate_total_wce(self):
556  """!
557  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.
558
559  """
560
561  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
562
563  self.__total_wce = 0
564  for index_cluster in range(len(self.__clusters)):
565  for index_point in self.__clusters[index_cluster]:
566  self.__total_wce += dataset_differences[index_cluster][index_point]
567
568
569  def __calculate_dataset_difference(self, amount_clusters):
570  """!
571  @brief Calculate distance from each point to each cluster center.
572
573  """
574  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
575  for index_center in range(amount_clusters):
576  if self.__metric.get_type() != type_metric.USER_DEFINED:
577  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__centers[index_center])
578  else:
579  dataset_differences[index_center] = [ self.__metric(point, self.__centers[index_center])
580  for point in self.__pointer_data ]
581
582  return dataset_differences
583
584
585  def __calculate_changes(self, updated_centers):
586  """!
587  @brief Calculates changes estimation between previous and current iteration using centers for that purpose.
588
589  @param[in] updated_centers (array_like): New cluster centers.
590
591  @return (float) Maximum changes between centers.
592
593  """
594  if len(self.__centers) != len(updated_centers):
595  maximum_change = float('inf')
596
597  else:
598  changes = self.__metric(self.__centers, updated_centers)
599  maximum_change = numpy.max(changes)
600
601  return maximum_change
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def notify(self, clusters, centers)
This method is called by K-Means algorithm to notify about changes.
Definition: kmeans.py:75
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: kmeans.py:472
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmeans.py:569
def process(self)
Performs cluster analysis in line with rules of K-Means algorithm.
Definition: kmeans.py:365
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def get_clusters(self, iteration)
Get method to return allocated clusters at specific iteration of clustering process.
Definition: kmeans.py:119
def __calculate_changes(self, updated_centers)
Calculates changes estimation between previous and current iteration using centers for that purpose...
Definition: kmeans.py:585
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
Observer of K-Means algorithm that is used to collect information about clustering process on each it...
Definition: kmeans.py:49
def __update_centers(self)
Calculate centers of clusters in line with contained objects.
Definition: kmeans.py:537
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmeans.py:487
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272
def __init__(self)
Initializer of observer of K-Means algorithm.
Definition: kmeans.py:57
Visualizer of K-Means algorithm&#39;s results.
Definition: kmeans.py:132
def get_centers(self, iteration)
Get method to return centers at specific iteration of clustering process.
Definition: kmeans.py:97
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm...
Definition: kmeans.py:555
def set_evolution_centers(self, evolution_centers)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:87
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Means.
Definition: kmeans.py:326
def __len__(self)
Returns amount of steps that were observer during clustering process in K-Means algorithm.
Definition: kmeans.py:67
def set_evolution_clusters(self, evolution_clusters)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:109
def show_clusters(sample, clusters, centers, initial_centers=None, kwargs)
Display K-Means clustering results.
Definition: kmeans.py:144
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmeans.py:460
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: kmeans.py:389
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmeans.py:501
def __update_clusters(self)
Calculate distance (in line with specified metric) to each point from the each cluster.
Definition: kmeans.py:514
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: kmeans.py:407
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None)
Animates clustering process that is performed by K-Means algorithm.
Definition: kmeans.py:232
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmeans.py:435