kmeans.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Means
4 @details Implementation based on paper @cite inproceedings::kmeans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import warnings
30 
31 try:
32  import matplotlib.pyplot as plt
33  import matplotlib.animation as animation
34 except Exception as error_instance:
35  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
36  "functionality is not available (details: '%s')." % str(error_instance))
37 
38 import pyclustering.core.kmeans_wrapper as wrapper
39 
40 from pyclustering.core.wrapper import ccore_library
41 from pyclustering.core.metric_wrapper import metric_wrapper
42 
43 from pyclustering.cluster.encoder import type_encoding
44 from pyclustering.cluster import cluster_visualizer
45 
46 from pyclustering.utils.metric import distance_metric, type_metric
47 
48 
50  """!
51  @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
52 
53  @see kmeans
54 
55  """
56 
57  def __init__(self):
58  """!
59  @brief Initializer of observer of K-Means algorithm.
60 
61  """
62  self.__evolution_clusters = []
63  self.__evolution_centers = []
64  self.__initial_centers = []
65 
66 
67  def __len__(self):
68  """!
69  @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
70 
71  """
72  return len(self.__evolution_clusters)
73 
74 
75  def notify(self, clusters, centers):
76  """!
77  @brief This method is called by K-Means algorithm to notify about changes.
78 
79  @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
80  @param[in] centers (array_like): Allocated centers by K-Means algorithm.
81 
82  """
83  self.__evolution_clusters.append(clusters)
84  self.__evolution_centers.append(centers)
85 
86 
87  def set_evolution_centers(self, evolution_centers):
88  """!
89  @brief Set evolution of changes of centers during clustering process.
90 
91  @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
92 
93  """
94  self.__evolution_centers = evolution_centers
95 
96 
97  def get_centers(self, iteration):
98  """!
99  @brief Get method to return centers at specific iteration of clustering process.
100 
101  @param[in] iteration (uint): Clustering process iteration at which centers are required.
102 
103  @return (array_like) Centers at specific iteration.
104 
105  """
106  return self.__evolution_centers[iteration]
107 
108 
109  def set_evolution_clusters(self, evolution_clusters):
110  """!
111  @brief Set evolution of changes of centers during clustering process.
112 
113  @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
114 
115  """
116  self.__evolution_clusters = evolution_clusters
117 
118 
119  def get_clusters(self, iteration):
120  """!
121  @brief Get method to return allocated clusters at specific iteration of clustering process.
122 
123  @param[in] iteration (uint): Clustering process iteration at which clusters are required.
124 
125  @return (array_like) Clusters at specific iteration.
126 
127  """
128  return self.__evolution_clusters[iteration]
129 
130 
131 
133  """!
134  @brief Visualizer of K-Means algorithm's results.
135  @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
136 
137  """
138 
139  __default_2d_marker_size = 15
140  __default_3d_marker_size = 70
141 
142 
143  @staticmethod
144  def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
145  """!
146  @brief Display K-Means clustering results.
147 
148  @param[in] sample (list): Dataset that was used for clustering.
149  @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
150  @param[in] centers (array_like): Centers that were allocated by the algorithm.
151  @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
152  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
153 
154  <b>Keyword Args:</b><br>
155  - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
156  - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
157  - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
158 
159  @return (figure) Figure where clusters were drawn.
160 
161  """
162 
163  visualizer = cluster_visualizer()
164  visualizer.append_clusters(clusters, sample)
165 
166  offset = kwargs.get('offset', 0)
167  figure = kwargs.get('figure', None)
168  display = kwargs.get('display', True)
169 
170  if figure is None:
171  figure = visualizer.show(display = False)
172  else:
173  visualizer.show(figure = figure, display = False)
174 
175  kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
176  kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
177 
178  if display is True:
179  plt.show()
180 
181  return figure
182 
183 
184  @staticmethod
185  def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
186  ax = figure.get_axes()[offset]
187 
188  for index_cluster in range(len(clusters)):
189  color = visualizer.get_cluster_color(index_cluster, 0)
190  kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
191 
192 
193  @staticmethod
194  def __draw_cluster_rays(ax, color, sample, cluster, center):
195  dimension = len(sample[0])
196 
197  for index_point in cluster:
198  point = sample[index_point]
199  if dimension == 1:
200  ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5)
201  elif dimension == 2:
202  ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5)
203  elif dimension == 3:
204  ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
205 
206 
207  @staticmethod
208  def __draw_center(ax, center, color, marker, alpha):
209  dimension = len(center)
210 
211  if dimension == 1:
212  ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
213  elif dimension == 2:
214  ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
215  elif dimension == 3:
216  ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
217 
218 
219  @staticmethod
220  def __draw_centers(figure, offset, visualizer, centers, initial_centers):
221  ax = figure.get_axes()[offset]
222 
223  for index_center in range(len(centers)):
224  color = visualizer.get_cluster_color(index_center, 0)
225  kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0)
226 
227  if initial_centers is not None:
228  kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)
229 
230 
231  @staticmethod
232  def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
233  """!
234  @brief Animates clustering process that is performed by K-Means algorithm.
235 
236  @param[in] data (list): Dataset that is used for clustering.
237  @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
238  @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
239  @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
240  @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
241 
242  """
243  figure = plt.figure()
244 
245  def init_frame():
246  return frame_generation(0)
247 
248  def frame_generation(index_iteration):
249  figure.clf()
250 
251  figure.suptitle("K-Means algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')
252 
253  clusters = observer.get_clusters(index_iteration)
254  centers = observer.get_centers(index_iteration)
255  kmeans_visualizer.show_clusters(data, clusters, centers, None, figure=figure, display=False)
256 
257  figure.subplots_adjust(top=0.85)
258 
259  return [figure.gca()]
260 
261  iterations = len(observer)
262  cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
263  init_func=init_frame, repeat_delay=5000)
264 
265  if save_movie is not None:
266  cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)
267  else:
268  plt.show()
269 
270 
271 
272 class kmeans:
273  """!
274  @brief Class represents K-Means clustering algorithm.
275  @details CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
276 
277  K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization
278  initial centers from module 'pyclustering.cluster.center_initializer'.
279 
280  @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
281 
282  Example #1 - Clustering using K-Means++ for center initialization:
283  @code
284  from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
285  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
286  from pyclustering.samples.definitions import FCPS_SAMPLES
287  from pyclustering.utils import read_sample
288 
289  # Load list of points for cluster analysis.
290  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
291 
292  # Prepare initial centers using K-Means++ method.
293  initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
294 
295  # Create instance of K-Means algorithm with prepared centers.
296  kmeans_instance = kmeans(sample, initial_centers)
297 
298  # Run cluster analysis and obtain results.
299  kmeans_instance.process()
300  clusters = kmeans_instance.get_clusters()
301  final_centers = kmeans_instance.get_centers()
302 
303  # Visualize obtained results
304  kmeans_visualizer.show_clusters(sample, clusters, final_centers)
305  @endcode
306 
307  Example #2 - Clustering using specific distance metric, for example, Manhattan distance:
308  @code
309  # prepare input data and initial centers for cluster analysis using K-Means
310 
311  # create metric that will be used for clustering
312  manhattan_metric = distance_metric(type_metric.MANHATTAN)
313 
314  # create instance of K-Means using specific distance metric:
315  kmeans_instance = kmeans(sample, initial_centers, metric=manhattan_metric)
316 
317  # run cluster analysis and obtain results
318  kmeans_instance.process()
319  clusters = kmeans_instance.get_clusters()
320  @endcode
321 
322  @see center_initializer
323 
324  """
325 
326  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
327  """!
328  @brief Constructor of clustering algorithm K-Means.
329  @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
330 
331  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
332  @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
333  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
334  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
335  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
336 
337  <b>Keyword Args:</b><br>
338  - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
339  - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
340  - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
341 
342  @see center_initializer
343 
344  """
345  self.__pointer_data = numpy.array(data)
346  self.__clusters = []
347  self.__centers = numpy.array(initial_centers)
348  self.__tolerance = tolerance
349  self.__total_wce = 0
350 
351  self.__observer = kwargs.get('observer', None)
352  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
353  self.__itermax = kwargs.get('itermax', 100)
354 
355  if self.__metric.get_type() != type_metric.USER_DEFINED:
356  self.__metric.enable_numpy_usage()
357  else:
358  self.__metric.disable_numpy_usage()
359 
360  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
361  if self.__ccore is True:
362  self.__ccore = ccore_library.workable()
363 
364 
365  def process(self):
366  """!
367  @brief Performs cluster analysis in line with rules of K-Means algorithm.
368 
369  @return (kmeans) Returns itself (K-Means instance).
370 
371  @remark Results of clustering can be obtained using corresponding get methods.
372 
373  @see get_clusters()
374  @see get_centers()
375 
376  """
377 
378  if len(self.__pointer_data[0]) != len(self.__centers[0]):
379  raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")
380 
381  if self.__ccore is True:
382  self.__process_by_ccore()
383  else:
384  self.__process_by_python()
385 
386  return self
387 
388 
389  def __process_by_ccore(self):
390  """!
391  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
392 
393  """
394  ccore_metric = metric_wrapper.create_instance(self.__metric)
395 
396  results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, self.__itermax, (self.__observer is not None), ccore_metric.get_pointer())
397  self.__clusters = results[0]
398  self.__centers = results[1]
399 
400  if self.__observer is not None:
401  self.__observer.set_evolution_clusters(results[2])
402  self.__observer.set_evolution_centers(results[3])
403 
404  self.__total_wce = results[4][0]
405 
406 
407  def __process_by_python(self):
408  """!
409  @brief Performs cluster analysis using python code.
410 
411  """
412 
413  maximum_change = float('inf')
414  iteration = 0
415 
416  if self.__observer is not None:
417  initial_clusters = self.__update_clusters()
418  self.__observer.notify(initial_clusters, self.__centers.tolist())
419 
420  while maximum_change > self.__tolerance and iteration < self.__itermax:
421  self.__clusters = self.__update_clusters()
422  updated_centers = self.__update_centers() # changes should be calculated before assignment
423 
424  if self.__observer is not None:
425  self.__observer.notify(self.__clusters, updated_centers.tolist())
426 
427  maximum_change = self.__calculate_changes(updated_centers)
428 
429  self.__centers = updated_centers # assign center after change calculation
430  iteration += 1
431 
432  self.__calculate_total_wce()
433 
434 
435  def get_clusters(self):
436  """!
437  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
438 
439  @see process()
440  @see get_centers()
441 
442  """
443 
444  return self.__clusters
445 
446 
447  def get_centers(self):
448  """!
449  @brief Returns list of centers of allocated clusters.
450 
451  @see process()
452  @see get_clusters()
453 
454  """
455 
456  if isinstance(self.__centers, list):
457  return self.__centers
458 
459  return self.__centers.tolist()
460 
461 
462  def get_total_wce(self):
463  """!
464  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
465  @details Sum of metric errors is calculated using distance between point and its center:
466  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
467 
468  @see process()
469  @see get_clusters()
470 
471  """
472 
473  return self.__total_wce
474 
475 
477  """!
478  @brief Returns clustering result representation type that indicate how clusters are encoded.
479 
480  @return (type_encoding) Clustering result representation.
481 
482  @see get_clusters()
483 
484  """
485 
486  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
487 
488 
489  def __update_clusters(self):
490  """!
491  @brief Calculate distance (in line with specified metric) to each point from the each cluster. Nearest points
492  are captured by according clusters and as a result clusters are updated.
493 
494  @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
495 
496  """
497 
498  clusters = [[] for _ in range(len(self.__centers))]
499 
500  dataset_differences = self.__calculate_dataset_difference(len(clusters))
501 
502  optimum_indexes = numpy.argmin(dataset_differences, axis=0)
503  for index_point in range(len(optimum_indexes)):
504  index_cluster = optimum_indexes[index_point]
505  clusters[index_cluster].append(index_point)
506 
507  clusters = [cluster for cluster in clusters if len(cluster) > 0]
508 
509  return clusters
510 
511 
512  def __update_centers(self):
513  """!
514  @brief Calculate centers of clusters in line with contained objects.
515 
516  @return (numpy.array) Updated centers.
517 
518  """
519 
520  dimension = self.__pointer_data.shape[1]
521  centers = numpy.zeros((len(self.__clusters), dimension))
522 
523  for index in range(len(self.__clusters)):
524  cluster_points = self.__pointer_data[self.__clusters[index], :]
525  centers[index] = cluster_points.mean(axis=0)
526 
527  return numpy.array(centers)
528 
529 
530  def __calculate_total_wce(self):
531  """!
532  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.
533 
534  """
535 
536  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
537 
538  self.__total_wce = 0
539  for index_cluster in range(len(self.__clusters)):
540  for index_point in self.__clusters[index_cluster]:
541  self.__total_wce += dataset_differences[index_cluster][index_point]
542 
543 
544  def __calculate_dataset_difference(self, amount_clusters):
545  """!
546  @brief Calculate distance from each point to each cluster center.
547 
548  """
549  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
550  for index_center in range(amount_clusters):
551  if self.__metric.get_type() != type_metric.USER_DEFINED:
552  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__centers[index_center])
553  else:
554  dataset_differences[index_center] = [ self.__metric(point, self.__centers[index_center])
555  for point in self.__pointer_data ]
556 
557  return dataset_differences
558 
559 
560  def __calculate_changes(self, updated_centers):
561  """!
562  @brief Calculates changes estimation between previous and current iteration using centers for that purpose.
563 
564  @param[in] updated_centers (array_like): New cluster centers.
565 
566  @return (float) Maximum changes between centers.
567 
568  """
569  if len(self.__centers) != len(updated_centers):
570  maximum_change = float('inf')
571 
572  else:
573  changes = self.__metric(self.__centers, updated_centers)
574  maximum_change = numpy.max(changes)
575 
576  return maximum_change
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def notify(self, clusters, centers)
This method is called by K-Means algorithm to notify about changes.
Definition: kmeans.py:75
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: kmeans.py:447
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmeans.py:544
def process(self)
Performs cluster analysis in line with rules of K-Means algorithm.
Definition: kmeans.py:365
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def get_clusters(self, iteration)
Get method to return allocated clusters at specific iteration of clustering process.
Definition: kmeans.py:119
def __calculate_changes(self, updated_centers)
Calculates changes estimation between previous and current iteration using centers for that purpose...
Definition: kmeans.py:560
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
Observer of K-Means algorithm that is used to collect information about clustering process on each it...
Definition: kmeans.py:49
def __update_centers(self)
Calculate centers of clusters in line with contained objects.
Definition: kmeans.py:512
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmeans.py:462
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272
def __init__(self)
Initializer of observer of K-Means algorithm.
Definition: kmeans.py:57
Visualizer of K-Means algorithm&#39;s results.
Definition: kmeans.py:132
def get_centers(self, iteration)
Get method to return centers at specific iteration of clustering process.
Definition: kmeans.py:97
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm...
Definition: kmeans.py:530
def set_evolution_centers(self, evolution_centers)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:87
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Means.
Definition: kmeans.py:326
def __len__(self)
Returns amount of steps that were observer during clustering process in K-Means algorithm.
Definition: kmeans.py:67
def set_evolution_clusters(self, evolution_clusters)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:109
def show_clusters(sample, clusters, centers, initial_centers=None, kwargs)
Display K-Means clustering results.
Definition: kmeans.py:144
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmeans.py:435
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: kmeans.py:389
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmeans.py:476
def __update_clusters(self)
Calculate distance (in line with specified metric) to each point from the each cluster.
Definition: kmeans.py:489
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: kmeans.py:407
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None)
Animates clustering process that is performed by K-Means algorithm.
Definition: kmeans.py:232