kmeans.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Means
4 @details Implementation based on paper @cite inproceedings::kmeans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import warnings
30 
31 try:
32  import matplotlib.pyplot as plt
33  import matplotlib.animation as animation
34 except Exception as error_instance:
35  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
36  "functionality is not available (details: '%s')." % str(error_instance))
37 
38 import pyclustering.core.kmeans_wrapper as wrapper
39 
40 from pyclustering.core.wrapper import ccore_library
41 from pyclustering.core.metric_wrapper import metric_wrapper
42 
43 from pyclustering.cluster.encoder import type_encoding
44 from pyclustering.cluster import cluster_visualizer
45 
46 from pyclustering.utils.metric import distance_metric, type_metric
47 
48 
50  """!
51  @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
52 
53  @see kmeans
54 
55  """
56 
57  def __init__(self):
58  """!
59  @brief Initializer of observer of K-Means algorithm.
60 
61  """
62  self.__evolution_clusters = []
63  self.__evolution_centers = []
64  self.__initial_centers = []
65 
66 
67  def __len__(self):
68  """!
69  @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
70 
71  """
72  return len(self.__evolution_clusters)
73 
74 
75  def notify(self, clusters, centers):
76  """!
77  @brief This method is called by K-Means algorithm to notify about changes.
78 
79  @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
80  @param[in] centers (array_like): Allocated centers by K-Means algorithm.
81 
82  """
83  self.__evolution_clusters.append(clusters)
84  self.__evolution_centers.append(centers)
85 
86 
87  def set_evolution_centers(self, evolution_centers):
88  """!
89  @brief Set evolution of changes of centers during clustering process.
90 
91  @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
92 
93  """
94  self.__evolution_centers = evolution_centers
95 
96 
97  def get_centers(self, iteration):
98  """!
99  @brief Get method to return centers at specific iteration of clustering process.
100 
101  @param[in] iteration (uint): Clustering process iteration at which centers are required.
102 
103  @return (array_like) Centers at specific iteration.
104 
105  """
106  return self.__evolution_centers[iteration]
107 
108 
109  def set_evolution_clusters(self, evolution_clusters):
110  """!
111  @brief Set evolution of changes of centers during clustering process.
112 
113  @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
114 
115  """
116  self.__evolution_clusters = evolution_clusters
117 
118 
119  def get_clusters(self, iteration):
120  """!
121  @brief Get method to return allocated clusters at specific iteration of clustering process.
122 
123  @param[in] iteration (uint): Clustering process iteration at which clusters are required.
124 
125  @return (array_like) Clusters at specific iteration.
126 
127  """
128  return self.__evolution_clusters[iteration]
129 
130 
131 
133  """!
134  @brief Visualizer of K-Means algorithm's results.
135  @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
136 
137  """
138 
139  __default_2d_marker_size = 15
140  __default_3d_marker_size = 70
141 
142 
143  @staticmethod
144  def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
145  """!
146  @brief Display K-Means clustering results.
147 
148  @param[in] sample (list): Dataset that was used for clustering.
149  @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
150  @param[in] centers (array_like): Centers that were allocated by the algorithm.
151  @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
152  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
153 
154  <b>Keyword Args:</b><br>
155  - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
156  - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
157  - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
158 
159  @return (figure) Figure where clusters were drawn.
160 
161  """
162 
163  visualizer = cluster_visualizer()
164  visualizer.append_clusters(clusters, sample)
165 
166  offset = kwargs.get('offset', 0)
167  figure = kwargs.get('figure', None)
168  display = kwargs.get('display', True)
169 
170  if figure is None:
171  figure = visualizer.show(display = False)
172  else:
173  visualizer.show(figure = figure, display = False)
174 
175  kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
176  kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
177 
178  if display is True:
179  plt.show()
180 
181  return figure
182 
183 
184  @staticmethod
185  def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
186  ax = figure.get_axes()[offset]
187 
188  for index_cluster in range(len(clusters)):
189  color = visualizer.get_cluster_color(index_cluster, 0)
190  kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
191 
192 
193  @staticmethod
194  def __draw_cluster_rays(ax, color, sample, cluster, center):
195  dimension = len(sample[0])
196 
197  for index_point in cluster:
198  point = sample[index_point]
199  if dimension == 1:
200  ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5)
201  elif dimension == 2:
202  ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5)
203  elif dimension == 3:
204  ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
205 
206 
207  @staticmethod
208  def __draw_center(ax, center, color, marker, alpha):
209  dimension = len(center)
210 
211  if dimension == 1:
212  ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
213  elif dimension == 2:
214  ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
215  elif dimension == 3:
216  ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
217 
218 
219  @staticmethod
220  def __draw_centers(figure, offset, visualizer, centers, initial_centers):
221  ax = figure.get_axes()[offset]
222 
223  for index_center in range(len(centers)):
224  color = visualizer.get_cluster_color(index_center, 0)
225  kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0)
226 
227  if initial_centers is not None:
228  kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)
229 
230 
231  @staticmethod
232  def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
233  """!
234  @brief Animates clustering process that is performed by K-Means algorithm.
235 
236  @param[in] data (list): Dataset that is used for clustering.
237  @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
238  @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
239  @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
240  @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
241 
242  """
243  figure = plt.figure()
244 
245  def init_frame():
246  return frame_generation(0)
247 
248  def frame_generation(index_iteration):
249  figure.clf()
250 
251  figure.suptitle("K-Means algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')
252 
253  clusters = observer.get_clusters(index_iteration)
254  centers = observer.get_centers(index_iteration)
255  kmeans_visualizer.show_clusters(data, clusters, centers, None, figure=figure, display=False)
256 
257  figure.subplots_adjust(top=0.85)
258 
259  return [figure.gca()]
260 
261  iterations = len(observer)
262  cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
263  init_func=init_frame, repeat_delay=5000)
264 
265  if save_movie is not None:
266  cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)
267  else:
268  plt.show()
269 
270 
271 
272 class kmeans:
273  """!
274  @brief Class represents K-Means clustering algorithm.
275  @details CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
276 
277  K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization
278  initial centers from module 'pyclustering.cluster.center_initializer'.
279 
280  @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
281 
282  Example #1 - Clustering using K-Means++ for center initialization:
283  @code
284  from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
285  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
286  from pyclustering.samples.definitions import FCPS_SAMPLES
287  from pyclustering.utils import read_sample
288 
289  # Load list of points for cluster analysis.
290  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
291 
292  # Prepare initial centers using K-Means++ method.
293  initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
294 
295  # Create instance of K-Means algorithm with prepared centers.
296  kmeans_instance = kmeans(sample, initial_centers)
297 
298  # Run cluster analysis and obtain results.
299  kmeans_instance.process()
300  clusters = kmeans_instance.get_clusters()
301  final_centers = kmeans_instance.get_centers()
302 
303  # Visualize obtained results
304  kmeans_visualizer.show_clusters(sample, clusters, final_centers)
305  @endcode
306 
307  Example #2 - Clustering using specific distance metric, for example, Manhattan distance:
308  @code
309  # prepare input data and initial centers for cluster analysis using K-Means
310 
311  # create metric that will be used for clustering
312  manhattan_metric = distance_metric(type_metric.MANHATTAN)
313 
314  # create instance of K-Means using specific distance metric:
315  kmeans_instance = kmeans(sample, initial_centers, metric=manhattan_metric)
316 
317  # run cluster analysis and obtain results
318  kmeans_instance.process()
319  clusters = kmeans_instance.get_clusters()
320  @endcode
321 
322  @see center_initializer
323 
324  """
325 
326  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
327  """!
328  @brief Constructor of clustering algorithm K-Means.
329  @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
330 
331  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
332  @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
333  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
334  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
335  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
336 
337  <b>Keyword Args:</b><br>
338  - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
339  - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
340  - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
341 
342  @see center_initializer
343 
344  """
345  self.__pointer_data = numpy.array(data)
346  self.__clusters = []
347  self.__centers = numpy.array(initial_centers)
348  self.__tolerance = tolerance
349  self.__total_wce = 0
350 
351  self.__observer = kwargs.get('observer', None)
352  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
353  self.__itermax = kwargs.get('itermax', 100)
354 
355  if self.__metric.get_type() != type_metric.USER_DEFINED:
356  self.__metric.enable_numpy_usage()
357  else:
358  self.__metric.disable_numpy_usage()
359 
360  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
361  if self.__ccore is True:
362  self.__ccore = ccore_library.workable()
363 
364 
365  def process(self):
366  """!
367  @brief Performs cluster analysis in line with rules of K-Means algorithm.
368 
369  @return (kmeans) Returns itself (K-Means instance).
370 
371  @remark Results of clustering can be obtained using corresponding get methods.
372 
373  @see get_clusters()
374  @see get_centers()
375 
376  """
377 
378  if len(self.__pointer_data[0]) != len(self.__centers[0]):
379  raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")
380 
381  if self.__ccore is True:
382  self.__process_by_ccore()
383  else:
384  self.__process_by_python()
385 
386  return self
387 
388 
389  def __process_by_ccore(self):
390  """!
391  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
392 
393  """
394  ccore_metric = metric_wrapper.create_instance(self.__metric)
395 
396  results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, self.__itermax, (self.__observer is not None), ccore_metric.get_pointer())
397  self.__clusters = results[0]
398  self.__centers = results[1]
399 
400  if self.__observer is not None:
401  self.__observer.set_evolution_clusters(results[2])
402  self.__observer.set_evolution_centers(results[3])
403 
404  self.__total_wce = results[4][0]
405 
406 
407  def __process_by_python(self):
408  """!
409  @brief Performs cluster analysis using python code.
410 
411  """
412 
413  maximum_change = float('inf')
414  iteration = 0
415 
416  if self.__observer is not None:
417  initial_clusters = self.__update_clusters()
418  self.__observer.notify(initial_clusters, self.__centers.tolist())
419 
420  while maximum_change > self.__tolerance and iteration < self.__itermax:
421  self.__clusters = self.__update_clusters()
422  updated_centers = self.__update_centers() # changes should be calculated before assignment
423 
424  if self.__observer is not None:
425  self.__observer.notify(self.__clusters, updated_centers.tolist())
426 
427  maximum_change = self.__calculate_changes(updated_centers)
428 
429  self.__centers = updated_centers # assign center after change calculation
430  iteration += 1
431 
432  self.__calculate_total_wce()
433 
434 
435  def predict(self, points):
436  """!
437  @brief Calculates the closest cluster to each point.
438 
439  @param[in] points (array_like): Points for which closest clusters are calculated.
440 
441  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
442  collection if 'process()' method was not called.
443 
444  """
445 
446  nppoints = numpy.array(points)
447  if len(self.__clusters) == 0:
448  return []
449 
450  differences = numpy.zeros((len(nppoints), len(self.__centers)))
451  for index_point in range(len(nppoints)):
452  if self.__metric.get_type() != type_metric.USER_DEFINED:
453  differences[index_point] = self.__metric(nppoints[index_point], self.__centers)
454  else:
455  differences[index_point] = [ self.__metric(nppoints[index_point], center) for center in self.__centers ]
456 
457  return numpy.argmin(differences, axis=1)
458 
459 
460  def get_clusters(self):
461  """!
462  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
463 
464  @see process()
465  @see get_centers()
466 
467  """
468 
469  return self.__clusters
470 
471 
472  def get_centers(self):
473  """!
474  @brief Returns list of centers of allocated clusters.
475 
476  @see process()
477  @see get_clusters()
478 
479  """
480 
481  if isinstance(self.__centers, list):
482  return self.__centers
483 
484  return self.__centers.tolist()
485 
486 
487  def get_total_wce(self):
488  """!
489  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
490  @details Sum of metric errors is calculated using distance between point and its center:
491  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
492 
493  @see process()
494  @see get_clusters()
495 
496  """
497 
498  return self.__total_wce
499 
500 
502  """!
503  @brief Returns clustering result representation type that indicate how clusters are encoded.
504 
505  @return (type_encoding) Clustering result representation.
506 
507  @see get_clusters()
508 
509  """
510 
511  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
512 
513 
514  def __update_clusters(self):
515  """!
516  @brief Calculate distance (in line with specified metric) to each point from the each cluster. Nearest points
517  are captured by according clusters and as a result clusters are updated.
518 
519  @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
520 
521  """
522 
523  clusters = [[] for _ in range(len(self.__centers))]
524 
525  dataset_differences = self.__calculate_dataset_difference(len(clusters))
526 
527  optimum_indexes = numpy.argmin(dataset_differences, axis=0)
528  for index_point in range(len(optimum_indexes)):
529  index_cluster = optimum_indexes[index_point]
530  clusters[index_cluster].append(index_point)
531 
532  clusters = [cluster for cluster in clusters if len(cluster) > 0]
533 
534  return clusters
535 
536 
537  def __update_centers(self):
538  """!
539  @brief Calculate centers of clusters in line with contained objects.
540 
541  @return (numpy.array) Updated centers.
542 
543  """
544 
545  dimension = self.__pointer_data.shape[1]
546  centers = numpy.zeros((len(self.__clusters), dimension))
547 
548  for index in range(len(self.__clusters)):
549  cluster_points = self.__pointer_data[self.__clusters[index], :]
550  centers[index] = cluster_points.mean(axis=0)
551 
552  return numpy.array(centers)
553 
554 
555  def __calculate_total_wce(self):
556  """!
557  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.
558 
559  """
560 
561  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
562 
563  self.__total_wce = 0
564  for index_cluster in range(len(self.__clusters)):
565  for index_point in self.__clusters[index_cluster]:
566  self.__total_wce += dataset_differences[index_cluster][index_point]
567 
568 
569  def __calculate_dataset_difference(self, amount_clusters):
570  """!
571  @brief Calculate distance from each point to each cluster center.
572 
573  """
574  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
575  for index_center in range(amount_clusters):
576  if self.__metric.get_type() != type_metric.USER_DEFINED:
577  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__centers[index_center])
578  else:
579  dataset_differences[index_center] = [ self.__metric(point, self.__centers[index_center])
580  for point in self.__pointer_data ]
581 
582  return dataset_differences
583 
584 
585  def __calculate_changes(self, updated_centers):
586  """!
587  @brief Calculates changes estimation between previous and current iteration using centers for that purpose.
588 
589  @param[in] updated_centers (array_like): New cluster centers.
590 
591  @return (float) Maximum changes between centers.
592 
593  """
594  if len(self.__centers) != len(updated_centers):
595  maximum_change = float('inf')
596 
597  else:
598  changes = self.__metric(self.__centers, updated_centers)
599  maximum_change = numpy.max(changes)
600 
601  return maximum_change
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def notify(self, clusters, centers)
This method is called by K-Means algorithm to notify about changes.
Definition: kmeans.py:75
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: kmeans.py:472
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmeans.py:569
def process(self)
Performs cluster analysis in line with rules of K-Means algorithm.
Definition: kmeans.py:365
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def get_clusters(self, iteration)
Get method to return allocated clusters at specific iteration of clustering process.
Definition: kmeans.py:119
def __calculate_changes(self, updated_centers)
Calculates changes estimation between previous and current iteration using centers for that purpose...
Definition: kmeans.py:585
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
Observer of K-Means algorithm that is used to collect information about clustering process on each it...
Definition: kmeans.py:49
def __update_centers(self)
Calculate centers of clusters in line with contained objects.
Definition: kmeans.py:537
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmeans.py:487
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272
def __init__(self)
Initializer of observer of K-Means algorithm.
Definition: kmeans.py:57
Visualizer of K-Means algorithm&#39;s results.
Definition: kmeans.py:132
def get_centers(self, iteration)
Get method to return centers at specific iteration of clustering process.
Definition: kmeans.py:97
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm...
Definition: kmeans.py:555
def set_evolution_centers(self, evolution_centers)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:87
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Means.
Definition: kmeans.py:326
def __len__(self)
Returns amount of steps that were observer during clustering process in K-Means algorithm.
Definition: kmeans.py:67
def set_evolution_clusters(self, evolution_clusters)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:109
def show_clusters(sample, clusters, centers, initial_centers=None, kwargs)
Display K-Means clustering results.
Definition: kmeans.py:144
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmeans.py:460
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: kmeans.py:389
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmeans.py:501
def __update_clusters(self)
Calculate distance (in line with specified metric) to each point from the each cluster.
Definition: kmeans.py:514
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: kmeans.py:407
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None)
Animates clustering process that is performed by K-Means algorithm.
Definition: kmeans.py:232
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmeans.py:435