3 @brief pyclustering module for cluster analysis.
5 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @copyright BSD-3-Clause
14 import matplotlib.pyplot
as plt
15 import matplotlib.gridspec
as gridspec
22 @brief Description of cluster for representation on canvas.
26 def __init__(self, cluster, data, marker, markersize, color):
28 @brief Constructor of cluster representation on the canvas.
30 @param[in] cluster (list): Single cluster that consists of objects or indexes from data.
31 @param[in] data (list): Objects that should be displayed, can be None if clusters consist of objects instead of indexes.
32 @param[in] marker (string): Type of marker that is used for drawing objects.
33 @param[in] markersize (uint): Size of marker that is used for drawing objects.
34 @param[in] color (string): Color of the marker that is used for drawing objects.
58 @brief Visualizer for cluster in multi-dimensional data.
59 @details This cluster visualizer is useful for clusters in data whose dimension is greater than 3. The
60 multidimensional visualizer helps to overcome 'cluster_visualizer' shortcoming - it is able to display
61 clusters only in 1D, 2D or 3D data space.
63 Example of clustering results visualization where 'Iris' is used:
65 from pyclustering.utils import read_sample
66 from pyclustering.samples.definitions import FAMOUS_SAMPLES
67 from pyclustering.cluster import cluster_visualizer_multidim
69 # load 4D data sample 'Iris'
70 sample_4d = read_sample(FAMOUS_SAMPLES.SAMPLE_IRIS)
72 # initialize 3 initial centers using K-Means++ algorithm
73 centers = kmeans_plusplus_initializer(sample_4d, 3).initialize()
75 # performs cluster analysis using X-Means
76 xmeans_instance = xmeans(sample_4d, centers)
77 xmeans_instance.process()
78 clusters = xmeans_instance.get_clusters()
80 # visualize obtained clusters in multi-dimensional space
81 visualizer = cluster_visualizer_multidim()
82 visualizer.append_clusters(clusters, sample_4d)
83 visualizer.show(max_row_size=3)
86 Visualized clustering results of 'Iris' data (multi-dimensional data):
87 @image html xmeans_clustering_famous_iris.png "Fig. 1. X-Means clustering results (data 'Iris')."
89 Sometimes there is no need to display results in all dimensions. Parameter 'pair_filter' can be used to display only
90 interesting coordinate pairs. Here is an example of visualization of coordinate pairs (x0, x1) and (x0, x2) for
91 previous clustering results:
93 visualizer = cluster_visualizer_multidim()
94 visualizer.append_clusters(clusters, sample_4d)
95 visualizer.show(pair_filter=[[0, 1], [0, 2]])
98 Visualized results of specified coordinate pairs:
99 @image html xmeans_clustering_famous_iris_filtered.png "Fig. 2. X-Means clustering results (x0, x1) and (x0, x2) (data 'Iris')."
105 @brief Constructs cluster visualizer for multidimensional data.
106 @details The visualizer is suitable for data whose dimension is bigger than 3.
114 def append_cluster(self, cluster, data = None, marker = '.', markersize = None, color = None):
116 @brief Appends cluster for visualization.
118 @param[in] cluster (list): cluster that may consist of indexes of objects from the data or object itself.
119 @param[in] data (list): If specified, each element of cluster is considered as an index of object from the data.
120 @param[in] marker (string): Marker that is used for displaying objects from cluster on the canvas.
121 @param[in] markersize (uint): Size of marker.
122 @param[in] color (string): Color of marker.
124 @return Returns index of cluster descriptor on the canvas.
127 if len(cluster) == 0:
128 raise ValueError(
"Empty cluster is provided.")
130 markersize = markersize
or 5
132 index_color = len(self.
__clusters) % len(color_list.TITLES)
133 color = color_list.TITLES[index_color]
141 @brief Appends list of cluster for visualization.
143 @param[in] clusters (list): List of clusters where each cluster may consist of indexes of objects from the data or object itself.
144 @param[in] data (list): If specified, each element of cluster is considered as an index of object from the data.
145 @param[in] marker (string): Marker that is used for displaying objects from clusters on the canvas.
146 @param[in] markersize (uint): Size of marker.
150 for cluster
in clusters:
154 def save(self, filename, **kwargs):
157 @brief Saves figure to the specified file.
159 @param[in] filename (string): File where the visualized clusters should be stored.
160 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'visible_axis', 'visible_labels', 'visible_grid', 'max_row_size').
162 <b>Keyword Args:</b><br>
163 - visible_axis (bool): Defines visibility of axes on each canvas, if True - axes are visible.
164 By default axis of each canvas are not displayed.
165 - visible_labels (bool): Defines visibility of labels on each canvas, if True - labels is displayed.
166 By default labels of each canvas are displayed.
167 - visible_grid (bool): Defines visibility of grid on each canvas, if True - grid is displayed.
168 By default grid of each canvas is displayed.
169 - max_row_size (uint): Maximum number of canvases on one row.
173 if len(filename) == 0:
174 raise ValueError(
"Impossible to save visualization to file: empty file path is specified.")
177 visible_axis=kwargs.get(
'visible_axis',
False),
178 visible_labels=kwargs.get(
'visible_labels',
True),
179 visible_grid=kwargs.get(
'visible_grid',
True),
180 max_row_size=kwargs.get(
'max_row_size', 4))
181 plt.savefig(filename)
184 def show(self, pair_filter=None, **kwargs):
186 @brief Shows clusters (visualize) in multi-dimensional space.
188 @param[in] pair_filter (list): List of coordinate pairs that should be displayed. This argument is used as a filter.
189 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'visible_axis', 'visible_labels', 'visible_grid', 'max_row_size', 'show').
191 <b>Keyword Args:</b><br>
192 - visible_axis (bool): Defines visibility of axes on each canvas, if True - axes are visible.
193 By default axis of each canvas are not displayed.
194 - visible_labels (bool): Defines visibility of labels on each canvas, if True - labels is displayed.
195 By default labels of each canvas are displayed.
196 - visible_grid (bool): Defines visibility of grid on each canvas, if True - grid is displayed.
197 By default grid of each canvas is displayed.
198 - max_row_size (uint): Maximum number of canvases on one row. By default the maximum value is 4.
199 - show (bool): If True - then displays visualized clusters. By default is `True`.
204 raise ValueError(
"There is no non-empty clusters for visualization.")
207 dimension = len(cluster_data[0])
209 acceptable_pairs = pair_filter
or []
216 amount_axis = len(pairs)
221 for index
in range(amount_axis):
223 axis_storage.append(ax)
228 if kwargs.get(
'show',
True):
def __create_grid_spec(self, amount_axis, max_row_size):
    """!
    @brief Create grid specification for figure to place canvases.
    @details One row holds at most `max_row_size` canvases; the amount of rows is the smallest one that is
              enough to place all `amount_axis` canvases.

    @param[in] amount_axis (uint): Amount of canvases that should be organized by the created grid specification.
    @param[in] max_row_size (uint): Maximum number of canvases on one row.

    @return (gridspec.GridSpec) Grid specification to place canvases on figure.

    @exception ValueError if `amount_axis` or `max_row_size` is less than one.

    """
    # A non-positive value would otherwise end up as a division by zero (or a negative grid size) below.
    if amount_axis < 1 or max_row_size < 1:
        raise ValueError("Amount of canvases and maximum row size should be positive "
                         "(amount_axis: '%d', max_row_size: '%d')." % (amount_axis, max_row_size))

    canvases_per_row = min(amount_axis, max_row_size)

    # Round up so that a partially filled last row still gets its own grid row.
    amount_rows = math.ceil(amount_axis / canvases_per_row)
    return gridspec.GridSpec(amount_rows, canvases_per_row)
def __create_pairs(self, dimension, acceptable_pairs):
    """!
    @brief Create coordinate pairs that should be displayed.
    @details A non-empty `acceptable_pairs` is returned as is (it is not checked against `dimension`), otherwise
              every pair of distinct coordinate indexes `(i, j)` where `i < j` is generated.

    @param[in] dimension (uint): Data-space dimension.
    @param[in] acceptable_pairs (list): List of coordinate pairs that should be displayed.

    @return (list) List of coordinate pairs that should be displayed.

    """
    if len(acceptable_pairs) > 0:
        return acceptable_pairs

    return list(itertools.combinations(range(dimension), 2))
266 def __create_canvas(self, dimension, pairs, position, **kwargs):
268 @brief Create new canvas with user defined parameters to display cluster or chunk of cluster on it.
270 @param[in] dimension (uint): Data-space dimension.
271 @param[in] pairs (list): Pair of coordinates that will be displayed on the canvas. If empty then label will not
272 be displayed on the canvas.
273 @param[in] position (uint): Index position of canvas on a grid.
274 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'visible_axis', 'visible_labels', 'visible_grid').
276 <b>Keyword Args:</b><br>
277 - visible_axis (bool): Defines visibility of axes on each canvas, if True - axes are visible.
278 By default axis are not displayed.
279 - visible_labels (bool): Defines visibility of labels on each canvas, if True - labels is displayed.
280 By default labels are displayed.
281 - visible_grid (bool): Defines visibility of grid on each canvas, if True - grid is displayed.
282 By default grid is displayed.
284 @return (matplotlib.Axis) Canvas to display cluster or chunk of cluster.
287 visible_grid = kwargs.get(
'visible_grid',
True)
288 visible_labels = kwargs.get(
'visible_labels',
True)
289 visible_axis = kwargs.get(
'visible_axis',
False)
295 ax.set_xlabel(
"x%d" % pairs[position][0])
296 ax.set_ylabel(
"x%d" % pairs[position][1])
298 ax.set_ylim(-0.5, 0.5)
299 ax.set_yticklabels([])
305 ax.set_yticklabels([])
306 ax.set_xticklabels([])
311 def __draw_canvas_cluster(self, axis_storage, cluster_descr, pairs):
313 @brief Draw clusters.
315 @param[in] axis_storage (list): List of matplotlib axis where cluster dimensional chunks are displayed.
316 @param[in] cluster_descr (canvas_cluster_descr): Canvas cluster descriptor that should be displayed.
317 @param[in] pairs (list): List of coordinates that should be displayed.
321 for index_axis
in range(len(axis_storage)):
322 for item
in cluster_descr.cluster:
def __draw_cluster_item_multi_dimension(self, ax, pair, item, cluster_descr):
    """!
    @brief Draw cluster chunk defined by pair coordinates in data space with dimension greater than 1.

    @param[in] ax (axis): Matplotlib axis that is used to display chunk of cluster point.
    @param[in] pair (list): Pair of coordinate indexes that select the values displayed along X and Y.
    @param[in] item (list|uint): Data point itself or index of data point in `cluster_descr.data`.
    @param[in] cluster_descr (canvas_cluster_descr): Cluster description whose point is visualized.

    """
    # Without attached data the cluster item is the point itself, otherwise it is an index of the point in the data.
    point = item if cluster_descr.data is None else cluster_descr.data[item]

    ax.plot(point[pair[0]], point[pair[1]],
            color=cluster_descr.color, marker=cluster_descr.marker, markersize=cluster_descr.markersize)
def __draw_cluster_item_one_dimension(self, ax, item, cluster_descr):
    """!
    @brief Draw cluster point in one dimensional data space.
    @details The point is placed on the horizontal axis: its single coordinate is used as X value and Y value
              is always 0.0.

    @param[in] ax (axis): Matplotlib axis that is used to display chunk of cluster point.
    @param[in] item (list|uint): Data point itself or index of data point in `cluster_descr.data`.
    @param[in] cluster_descr (canvas_cluster_descr): Cluster description whose point is visualized.

    """
    # Without attached data the cluster item is the point itself, otherwise it is an index of the point in the data.
    point = item if cluster_descr.data is None else cluster_descr.data[item]

    ax.plot(point[0], 0.0,
            color=cluster_descr.color, marker=cluster_descr.marker, markersize=cluster_descr.markersize)
372 @brief Common visualizer of clusters on 1D, 2D or 3D surface.
373 @details Use 'cluster_visualizer_multidim' visualizer in case of data dimension is greater than 3.
375 @see cluster_visualizer_multidim
379 def __init__(self, number_canvases=1, size_row=1, titles=None):
381 @brief Constructor of cluster visualizer.
383 @param[in] number_canvases (uint): Number of canvases that is used for visualization.
384 @param[in] size_row (uint): Amount of canvases that can be placed in one row.
385 @param[in] titles (list): List of canvas's titles.
389 # load 2D data sample
390 sample_2d = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1);
392 # load 3D data sample
393 sample_3d = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA);
395 # extract clusters from the first sample using DBSCAN algorithm
396 dbscan_instance = dbscan(sample_2d, 0.4, 2, False);
397 dbscan_instance.process();
398 clusters_sample_2d = dbscan_instance.get_clusters();
400 # extract clusters from the second sample using DBSCAN algorithm
401 dbscan_instance = dbscan(sample_3d, 1, 3, True);
402 dbscan_instance.process();
403 clusters_sample_3d = dbscan_instance.get_clusters();
405 # create plot with two canvases where each row contains 2 canvases.
408 visualizer = cluster_visualizer(size, row_size);
410 # place clustering result of sample_2d to the first canvas
411 visualizer.append_clusters(clusters_sample_2d, sample_2d, 0, markersize = 5);
413 # place clustering result of sample_3d to the second canvas
414 visualizer.append_clusters(clusters_sample_3d, sample_3d, 1, markersize = 30);
428 if titles
is not None:
435 def append_cluster(self, cluster, data=None, canvas=0, marker='.', markersize=None, color=None):
437 @brief Appends cluster to canvas for drawing.
439 @param[in] cluster (list): cluster that may consist of indexes of objects from the data or object itself.
440 @param[in] data (list): If specified, each element of cluster is considered as an index of object from the data.
441 @param[in] canvas (uint): Number of canvas that should be used for displaying cluster.
442 @param[in] marker (string): Marker that is used for displaying objects from cluster on the canvas.
443 @param[in] markersize (uint): Size of marker.
444 @param[in] color (string): Color of marker.
446 @return Returns index of cluster descriptor on the canvas.
450 if len(cluster) == 0:
# Both placeholders need a value: the arguments are passed as a tuple, otherwise '%' receives a single value and
# fails with TypeError before ValueError can be raised.
# NOTE(review): the upper bound assumes valid canvas indexes are 0..number_canvases-1 - confirm against the guarding condition.
raise ValueError("Canvas index '%d' is out of range [0; %d]." % (canvas, self.__number_canvases - 1))
458 color = color_list.TITLES[index_color]
464 dimension = len(cluster[0])
468 raise ValueError(
"Only clusters with the same dimension of objects can be displayed on canvas.")
471 dimension = len(data[0])
475 raise ValueError(
"Only clusters with the same dimension of objects can be displayed on canvas.")
477 if (dimension < 1)
or (dimension > 3):
478 raise ValueError(
"Only objects with size dimension 1 (1D plot), 2 (2D plot) or 3 (3D plot) "
479 "can be displayed. For multi-dimensional data use 'cluster_visualizer_multidim'.")
481 if markersize
is None:
482 if (dimension == 1)
or (dimension == 2):
492 @brief Append cluster attribute for cluster on specific canvas.
493 @details Attribute is data that is visualized for specific cluster using its color, and its marker and markersize if the last two are not specified.
495 @param[in] index_canvas (uint): Index canvas where cluster is located.
496 @param[in] index_cluster (uint): Index cluster whose attribute should be added.
497 @param[in] data (list): List of points (data) that represents attribute.
498 @param[in] marker (string): Marker that is used for displaying objects from cluster on the canvas.
499 @param[in] markersize (uint): Size of marker.
504 attribute_marker = marker
505 if attribute_marker
is None:
506 attribute_marker = cluster_descr.marker
508 attribure_markersize = markersize
509 if attribure_markersize
is None:
510 attribure_markersize = cluster_descr.markersize
512 attribute_color = cluster_descr.color
514 added_attribute_cluster_descriptor =
canvas_cluster_descr(data,
None, attribute_marker, attribure_markersize, attribute_color)
515 self.
__canvas_clusters[index_canvas][index_cluster].attributes.append(added_attribute_cluster_descriptor)
518 def append_clusters(self, clusters, data=None, canvas=0, marker='.', markersize=None):
520 @brief Appends list of cluster to canvas for drawing.
522 @param[in] clusters (list): List of clusters where each cluster may consist of indexes of objects from the data or object itself.
523 @param[in] data (list): If specified, each element of cluster is considered as an index of object from the data.
524 @param[in] canvas (uint): Number of canvas that should be used for displaying clusters.
525 @param[in] marker (string): Marker that is used for displaying objects from clusters on the canvas.
526 @param[in] markersize (uint): Size of marker.
530 for cluster
in clusters:
536 @brief Set title for specified canvas.
538 @param[in] text (string): Title for the canvas.
539 @param[in] canvas (uint): Index of the canvas where title should be displayed.
544 raise ValueError(
"Canvas with index '%d' does not exists (total amount of canvases: '%d')." %
552 @brief Returns cluster color on specified canvas.
def save(self, filename, **kwargs):
    """!
    @brief Saves figure to the specified file.
    @details Clusters are drawn by `show` on a new figure without displaying them on a screen and after that
              `plt.savefig(filename)` is called.

    @param[in] filename (string): File where the visualized clusters should be stored.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'invisible_axis', 'visible_grid').

    <b>Keyword Args:</b><br>
        - invisible_axis (bool): Defines visibility of axes on each canvas, if `True` - axes are invisible.
           By default axis are invisible.
        - visible_grid (bool): Defines visibility of grid on each canvas, if `True` - grid is displayed.
           By default grid of each canvas is displayed.

    @exception ValueError if `filename` is empty or `None`.

    There is an example how to save visualized clusters to the PNG file without showing them on a screen:
    @code
        from pyclustering.cluster import cluster_visualizer

        data = [[1.1], [1.7], [3.7], [5.3], [2.5], [-1.5], [-0.9], [6.3], [6.5], [8.1]]
        clusters = [[0, 1, 2, 4, 5, 6], [3, 7, 8, 9]]

        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, data)
        visualizer.save("1-dimensional-clustering.png")
    @endcode

    """
    # Truthiness instead of len(): objects without length (for example, pathlib.Path) are accepted,
    # and None is reported with the same clear error as an empty string.
    if not filename:
        raise ValueError("Impossible to save visualization to file: empty file path is specified.")

    self.show(figure=None,
              invisible_axis=kwargs.get('invisible_axis', True),
              visible_grid=kwargs.get('visible_grid', True),
              display=False)
    plt.savefig(filename)
596 def show(self, figure=None, invisible_axis=True, visible_grid=True, display=True, shift=None):
598 @brief Shows clusters (visualize).
600 @param[in] figure (fig): Defines requirement to use specified figure, if None - new figure is created for drawing clusters.
601 @param[in] invisible_axis (bool): Defines visibility of axes on each canvas, if True - axes are invisible.
602 @param[in] visible_grid (bool): Defines visibility of grid on each canvas, if True - grid is displayed.
603 @param[in] display (bool): Defines requirement to display clusters on a stage, if True - clusters are displayed,
604 if False - plt.show() should be called by user.
605 @param[in] shift (uint): Force canvas shift value - defines canvas index from which clusters should be visualized.
607 @return (fig) Figure where clusters are shown.
612 if canvas_shift
is None:
613 if figure
is not None:
614 canvas_shift = len(figure.get_axes())
618 if figure
is not None:
619 cluster_figure = figure
621 cluster_figure = plt.figure()
624 maximum_rows = math.ceil( (self.
__number_canvases + canvas_shift) / maximum_cols)
626 grid_spec = gridspec.GridSpec(maximum_rows, maximum_cols)
630 if len(canvas_data) == 0:
636 if (dimension == 1)
or (dimension == 2):
637 ax = cluster_figure.add_subplot(grid_spec[index_canvas + canvas_shift])
639 ax = cluster_figure.add_subplot(grid_spec[index_canvas + canvas_shift], projection=
'3d')
641 if len(canvas_data) == 0:
642 plt.setp(ax, visible=
False)
644 for cluster_descr
in canvas_data:
647 for attribute_descr
in cluster_descr.attributes:
650 if invisible_axis
is True:
651 ax.xaxis.set_ticklabels([])
652 ax.yaxis.set_ticklabels([])
655 ax.zaxis.set_ticklabels([])
660 ax.grid(visible_grid)
665 return cluster_figure
668 def __draw_canvas_cluster(self, ax, dimension, cluster_descr):
670 @brief Draw canvas cluster descriptor.
672 @param[in] ax (Axis): Axis of the canvas where canvas cluster descriptor should be displayed.
673 @param[in] dimension (uint): Canvas dimension.
674 @param[in] cluster_descr (canvas_cluster_descr): Canvas cluster descriptor that should be displayed.
676 @return (fig) Figure where clusters are shown.
680 cluster = cluster_descr.cluster
681 data = cluster_descr.data
682 marker = cluster_descr.marker
683 markersize = cluster_descr.markersize
684 color = cluster_descr.color
689 ax.plot(item[0], 0.0, color = color, marker = marker, markersize = markersize)
691 ax.plot(data[item][0], 0.0, color = color, marker = marker, markersize = markersize)
695 ax.plot(item[0], item[1], color = color, marker = marker, markersize = markersize)
697 ax.plot(data[item][0], data[item][1], color = color, marker = marker, markersize = markersize)
701 ax.scatter(item[0], item[1], item[2], c = color, marker = marker, s = markersize)
703 ax.scatter(data[item][0], data[item][1], data[item][2], c = color, marker = marker, s = markersize)