3 @brief The module contains K-Means algorithm and other related services.
4 @details Implementation based on paper @cite inproceedings::kmeans::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
16 import matplotlib.pyplot
as plt
17 import matplotlib.animation
as animation
19 import pyclustering.core.kmeans_wrapper
as wrapper
21 from pyclustering.core.wrapper
import ccore_library
22 from pyclustering.core.metric_wrapper
import metric_wrapper
32 @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
40 @brief Initializer of observer of K-Means algorithm.
50 @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
58 @brief This method is called by K-Means algorithm to notify about changes.
60 @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
61 @param[in] centers (array_like): Allocated centers by K-Means algorithm.
70 @brief Set evolution of changes of centers during clustering process.
72 @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
80 @brief Get method to return centers at specific iteration of clustering process.
82 @param[in] iteration (uint): Clustering process iteration at which centers are required.
84 @return (array_like) Centers at specific iteration.
92 @brief Set evolution of changes of centers during clustering process.
94 @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
102 @brief Get method to return allocated clusters at specific iteration of clustering process.
104 @param[in] iteration (uint): Clustering process iteration at which clusters are required.
106 @return (array_like) Clusters at specific iteration.
115 @brief Visualizer of K-Means algorithm's results.
116 @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
120 __default_2d_marker_size = 15
121 __default_3d_marker_size = 70
125 def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
127 @brief Display K-Means clustering results.
129 @param[in] sample (list): Dataset that was used for clustering.
130 @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
131 @param[in] centers (array_like): Centers that were allocated by the algorithm.
132 @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
133 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
135 <b>Keyword Args:</b><br>
136 - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
137 - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
138 - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
140 @return (figure) Figure where clusters were drawn.
145 visualizer.append_clusters(clusters, sample)
147 offset = kwargs.get(
'offset', 0)
148 figure = kwargs.get(
'figure',
None)
149 display = kwargs.get(
'display',
True)
152 figure = visualizer.show(display=
False)
154 visualizer.show(figure=figure, display=
False)
156 kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
157 kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
166 def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
167 ax = figure.get_axes()[offset]
169 for index_cluster
in range(len(clusters)):
170 color = visualizer.get_cluster_color(index_cluster, 0)
171 kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
175 def __draw_cluster_rays(ax, color, sample, cluster, center):
176 dimension = len(sample[0])
178 for index_point
in cluster:
179 point = sample[index_point]
181 ax.plot([point[0], center[0]], [0.0, 0.0],
'-', color=color, linewidth=0.5)
183 ax.plot([point[0], center[0]], [point[1], center[1]],
'-', color=color, linewidth=0.5)
185 ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]],
'-', color=color, linewidth=0.5)
189 def __draw_center(ax, center, color, marker, alpha):
190 dimension = len(center)
193 ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
195 ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
197 ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
201 def __draw_centers(figure, offset, visualizer, centers, initial_centers):
202 ax = figure.get_axes()[offset]
204 for index_center
in range(len(centers)):
205 color = visualizer.get_cluster_color(index_center, 0)
206 kmeans_visualizer.__draw_center(ax, centers[index_center], color,
'*', 1.0)
208 if initial_centers
is not None:
209 kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color,
'*', 0.4)
215 @brief Animates clustering process that is performed by K-Means algorithm.
217 @param[in] data (list): Dataset that is used for clustering.
218 @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
219 @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
220 @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
221 @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
224 figure = plt.figure()
227 return frame_generation(0)
229 def frame_generation(index_iteration):
232 figure.suptitle(
"K-Means algorithm (iteration: " + str(index_iteration) +
")", fontsize=18, fontweight=
'bold')
234 clusters = observer.get_clusters(index_iteration)
235 centers = observer.get_centers(index_iteration)
236 kmeans_visualizer.show_clusters(data, clusters, centers,
None, figure=figure, display=
False)
238 figure.subplots_adjust(top=0.85)
240 return [figure.gca()]
242 iterations = len(observer)
243 cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
244 init_func=init_frame, repeat_delay=5000)
246 if save_movie
is not None:
247 cluster_animation.save(save_movie, writer=
'ffmpeg', fps=movie_fps, bitrate=3000)
255 @brief Class implements K-Means clustering algorithm.
256 @details K-Means clustering aims to partition n observations into k clusters in which each observation belongs to
257 the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning
258 of the data space into Voronoi cells.
260 K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization of
261 initial centers - see module 'pyclustering.cluster.center_initializer'.
263 CCORE implementation (C/C++ part of the library) of the algorithm performs parallel processing to ensure maximum
266 Implementation based on the paper @cite inproceedings::kmeans::1.
268 @image html kmeans_example_clustering.png "Fig. 1. K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
270 Example #1 - Clustering using K-Means++ for center initialization:
272 from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
273 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
274 from pyclustering.samples.definitions import FCPS_SAMPLES
275 from pyclustering.utils import read_sample
277 # Load list of points for cluster analysis.
278 sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
280 # Prepare initial centers using K-Means++ method.
281 initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
283 # Create instance of K-Means algorithm with prepared centers.
284 kmeans_instance = kmeans(sample, initial_centers)
286 # Run cluster analysis and obtain results.
287 kmeans_instance.process()
288 clusters = kmeans_instance.get_clusters()
289 final_centers = kmeans_instance.get_centers()
291 # Visualize obtained results
292 kmeans_visualizer.show_clusters(sample, clusters, final_centers)
295 Example #2 - Clustering using specific distance metric, for example, Manhattan distance:
297 # prepare input data and initial centers for cluster analysis using K-Means
299 # create metric that will be used for clustering
300 manhattan_metric = distance_metric(type_metric.MANHATTAN)
302 # create instance of K-Means using specific distance metric:
303 kmeans_instance = kmeans(sample, initial_centers, metric=manhattan_metric)
305 # run cluster analysis and obtain results
306 kmeans_instance.process()
307 clusters = kmeans_instance.get_clusters()
310 @see center_initializer
314 def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
316 @brief Constructor of clustering algorithm K-Means.
317 @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
319 @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
320 @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
321 @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
322 @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
323 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
325 <b>Keyword Args:</b><br>
326 - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
327 - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
328 - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
330 @see center_initializer
335 self.
__centers = numpy.array(initial_centers)
339 self.
__observer = kwargs.get(
'observer',
None)
341 self.
__itermax = kwargs.get(
'itermax', 100)
343 if self.
__metric.get_type() != type_metric.USER_DEFINED:
348 self.
__ccore = ccore
and self.
__metric.get_type() != type_metric.USER_DEFINED
350 self.
__ccore = ccore_library.workable()
357 @brief Performs cluster analysis in line with rules of K-Means algorithm.
359 @return (kmeans) Returns itself (K-Means instance).
367 raise ValueError(
"Dimension of the input data and dimension of the initial cluster centers must be equal.")
377 def __process_by_ccore(self):
379 @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
382 ccore_metric = metric_wrapper.create_instance(self.
__metric)
385 (self.
__observer is not None), ccore_metric.get_pointer())
391 self.
__observer.set_evolution_clusters(results[2])
392 self.
__observer.set_evolution_centers(results[3])
397 def __process_by_python(self):
399 @brief Performs cluster analysis using python code.
403 maximum_change = float(
'inf')
427 @brief Calculates the closest cluster to each point.
429 @param[in] points (array_like): Points for which closest clusters are calculated.
431 @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
432 collection if 'process()' method was not called.
436 nppoints = numpy.array(points)
440 differences = numpy.zeros((len(nppoints), len(self.
__centers)))
441 for index_point
in range(len(nppoints)):
442 if self.
__metric.get_type() != type_metric.USER_DEFINED:
445 differences[index_point] = [self.
__metric(nppoints[index_point], center)
for center
in self.
__centers]
447 return numpy.argmin(differences, axis=1)
452 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
464 @brief Returns list of centers of allocated clusters.
479 @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
480 @details Sum of metric errors is calculated using distance between point and its center:
481 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
493 @brief Returns clustering result representation type that indicate how clusters are encoded.
495 @return (type_encoding) Clustering result representation.
501 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
504 def __update_clusters(self):
506 @brief Calculate distance (in line with specified metric) to each point from the each cluster. Nearest points
507 are captured by according clusters and as a result clusters are updated.
509 @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
513 clusters = [[]
for _
in range(len(self.
__centers))]
517 optimum_indexes = numpy.argmin(dataset_differences, axis=0)
518 for index_point
in range(len(optimum_indexes)):
519 index_cluster = optimum_indexes[index_point]
520 clusters[index_cluster].append(index_point)
522 clusters = [cluster
for cluster
in clusters
if len(cluster) > 0]
527 def __update_centers(self):
529 @brief Calculate centers of clusters in line with contained objects.
531 @return (numpy.array) Updated centers.
536 centers = numpy.zeros((len(self.
__clusters), dimension))
540 centers[index] = cluster_points.mean(axis=0)
542 return numpy.array(centers)
545 def __calculate_total_wce(self):
547 @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.
554 for index_cluster
in range(len(self.
__clusters)):
555 for index_point
in self.
__clusters[index_cluster]:
556 self.
__total_wce += dataset_differences[index_cluster][index_point]
559 def __calculate_dataset_difference(self, amount_clusters):
561 @brief Calculate distance from each point to each cluster center.
564 dataset_differences = numpy.zeros((amount_clusters, len(self.
__pointer_data)))
565 for index_center
in range(amount_clusters):
566 if self.
__metric.get_type() != type_metric.USER_DEFINED:
569 dataset_differences[index_center] = [self.
__metric(point, self.
__centers[index_center])
572 return dataset_differences
575 def __calculate_changes(self, updated_centers):
577 @brief Calculates changes estimation between previous and current iteration using centers for that purpose.
579 @param[in] updated_centers (array_like): New cluster centers.
581 @return (float) Maximum changes between centers.
584 if len(self.
__centers) != len(updated_centers):
585 maximum_change = float(
'inf')
588 if self.
__metric.get_type() != type_metric.USER_DEFINED:
591 changes = [self.
__metric(center, updated_center)
for center, updated_center
in zip(self.
__centers, updated_centers)]
593 maximum_change = numpy.max(changes)
595 return maximum_change
598 def __verify_arguments(self):
600 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
604 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
607 raise ValueError(
"Initial centers are empty (size: '%d')." % len(self.
__pointer_data))
610 raise ValueError(
"Tolerance (current value: '%d') should be greater or equal to 0." %
614 raise ValueError(
"Maximum iterations (current value: '%d') should be greater or equal to 0." %