3 @brief Cluster analysis algorithm: CLIQUE
4 @details Implementation based on paper @cite article::clique::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
18 from pyclustering.core.wrapper
import ccore_library
20 import pyclustering.core.clique_wrapper
as wrapper
23 import matplotlib.gridspec
as gridspec
24 import matplotlib.pyplot
as plt
25 import matplotlib.patches
as patches
30 @brief Visualizer of CLIQUE algorithm's results.
31 @details CLIQUE visualizer provides visualization services that are specific for CLIQUE algorithm, for example,
32 to display grid and its density.
36 __maximum_density_alpha = 0.6
41 @brief Show CLIQUE blocks as a grid in data space.
42 @details Each block contains points and according to this density is displayed. CLIQUE grid helps to visualize
43 grid that was used for clustering process.
45 @param[in] cells (list): List of cells that is produced by CLIQUE algorithm.
46 @param[in] data (array_like): Input data that was used for clustering process.
49 dimension = cells[0].dimensions
53 amount_canvases = int(dimension * (dimension - 1) / 2)
56 grid_spec = gridspec.GridSpec(1, amount_canvases)
58 pairs = list(itertools.combinations(range(dimension), 2))
59 if len(pairs) == 0: pairs = [(0, 0)]
61 for index
in range(amount_canvases):
62 ax = figure.add_subplot(grid_spec[index])
63 clique_visualizer.__draw_cells(ax, cells, pairs[index])
64 clique_visualizer.__draw_two_dimension_data(ax, data, pairs[index])
72 @brief Display CLIQUE clustering results.
74 @param[in] data (list): Data that was used for clustering.
75 @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
76 @param[in] noise (array_like): Noise that were allocated by the algorithm.
80 visualizer.append_clusters(clusters, data)
81 visualizer.append_cluster(noise
or [], data, marker=
'x')
86 def __draw_two_dimension_data(ax, data, pair):
88 @brief Display data in two-dimensional canvas.
90 @param[in] ax (Axis): Canvas where data should be displayed.
91 @param[in] data (list): Data points that should be displayed.
92 @param[in] pair (tuple): Pair of dimension indexes.
95 ax.set_xlabel(
"x%d" % pair[0])
96 ax.set_ylabel(
"x%d" % pair[1])
100 ax.plot(point[pair[0]], point[pair[1]], color=
'red', marker=
'.')
102 ax.plot(point[pair[0]], 0, color=
'red', marker=
'.')
103 ax.yaxis.set_ticklabels([])
107 def __draw_cells(ax, cells, pair):
110 density_scale = max(len(cell.points)
for cell
in cells)
112 clique_visualizer.__draw_cell(ax, pair, cell, density_scale)
116 def __draw_cell(ax, pair, cell, density_scale):
117 max_corner, min_corner = clique_visualizer.__get_rectangle_description(cell, pair)
119 belong_cluster = (len(cell.points) > 0)
121 if density_scale != 0.0:
122 density_scale = clique_visualizer.__maximum_density_alpha * len(cell.points) / density_scale
124 face_color = matplotlib.colors.to_rgba(
'blue', alpha=density_scale)
125 edge_color = matplotlib.colors.to_rgba(
'black', alpha=1.0)
127 rect = patches.Rectangle(min_corner, max_corner[0] - min_corner[0], max_corner[1] - min_corner[1],
129 facecolor=face_color,
130 edgecolor=edge_color,
137 def __get_rectangle_description(cell, pair):
138 max_corner, min_corner = cell.spatial_location.get_corners()
140 max_corner = [max_corner[pair[0]], max_corner[pair[1]]]
141 min_corner = [min_corner[pair[0]], min_corner[pair[1]]]
144 max_corner[1], min_corner[1] = 1.0, -1.0
146 return max_corner, min_corner
152 @brief Geometrical description of CLIQUE block in data space.
153 @details Provides services related to spatial functionality.
161 @brief Creates spatial block in data space.
163 @param[in] max_corner (array_like): Maximum corner coordinates of the block.
164 @param[in] min_corner (array_like): Minimal corner coordinates of the block.
173 @brief Returns string block description.
175 @return String representation of the block.
183 @brief Point is considered as contained if it lies in block (belong to it).
185 @return (bool) True if point is in block, otherwise False.
188 for i
in range(len(point)):
197 @brief Return spatial description of current block.
199 @return (tuple) Pair of maximum and minimum corners (max_corner, min_corner).
208 @brief CLIQUE block contains information about its logical location in grid, spatial location in data space and
209 points that are covered by the block.
213 def __init__(self, logical_location=None, spatial_location=None, points=None, visited=False):
215 @brief Initializes CLIQUE block.
217 @param[in] logical_location (list): Logical location of the block in CLIQUE grid.
218 @param[in] spatial_location (spatial_block): Spatial location in data space.
219 @param[in] points (array_like): Points that belong to this block (can be obtained by method 'capture_points',
220 this parameter is used by CLIQUE in case of processing by C++ implementation when clustering
221 result are passed back to Python code.
222 @param[in] visited (bool): Marks if block is visited during clustering process.
232 @brief Returns string representation of the block using its logical location in CLIQUE grid.
239 @brief Returns string representation of the block using its logical location in CLIQUE grid.
247 @brief Logical location is represented by coordinates in CLIQUE grid, for example, in case of 2x2 grid blocks
248 may have following coordinates: [0, 0], [0, 1], [1, 0], [1, 1].
249 @return (list) Logical location of the block in CLIQUE grid.
254 @logical_location.setter
257 @brief Assign logical location to CLIQUE block.
259 @param[in] location (list): New logical location of the block in CLIQUE grid.
267 @brief Spatial location is represented by real data space coordinates.
268 @return (spatial_block) Spatial block that describes location in data space.
273 @spatial_location.setter
276 @brief Assign spatial location to CLIQUE block.
278 @param[in] location (spatial_block): New spatial location of the block.
286 @brief Amount of dimensions where CLIQUE block is located.
287 @return (uint) Amount of dimensions where CLIQUE block is located.
295 @brief Points that belong to the CLIQUE block.
296 @details Points are represented by indexes that correspond to points in input data space.
298 @return (array_like) Points that belong to the CLIQUE block.
308 @brief Defines whether block is visited during cluster analysis.
309 @details If cluster analysis has not been performed then value will False.
311 @return (bool) True if block has been visited during processing, False otherwise.
319 @brief Marks or unmarks block as a visited.
320 @details This setter is used by CLIQUE algorithm.
322 @param[in] visited (bool): New visited state for the CLIQUE block.
330 @brief Finds points that belong to this block using availability map to reduce computational complexity by
331 checking whether the point belongs to the block.
332 @details Algorithm complexity of this method is O(n).
334 @param[in] data (array_like): Data where points are represented as coordinates.
335 @param[in] point_availability (array_like): Contains boolean values that denote whether point is already belong
336 to another CLIQUE block.
339 for index_point
in range(len(data)):
340 if (point_availability[index_point]
is True)
and (data[index_point]
in self.
__spatial_location):
342 point_availability[index_point] =
False
347 @brief Forms list of logical location of each neighbor for this particular CLIQUE block.
349 @param[in] edge (uint): Amount of intervals in each dimension that is used for clustering process.
351 @return (list) Logical location of each neighbor for this particular CLIQUE block.
359 position[index_dimension] += 1
360 neighbors.append(position)
364 position[index_dimension] -= 1
365 neighbors.append(position)
373 @brief Coordinate iterator is used to generate logical location description for each CLIQUE block.
374 @details This class is used by CLIQUE algorithm for clustering process.
380 @brief Initializes coordinate iterator for CLIQUE algorithm.
382 @param[in] dimension (uint): Amount of dimensions in input data space.
383 @param[in] intervals (uint): Amount of intervals in each dimension.
393 @brief Returns current block coordinate.
401 @brief Forms logical location for next block.
417 @brief Class implements CLIQUE grid based clustering algorithm.
418 @details CLIQUE automatically finds subspaces with high-density clusters. It produces identical results
419 irrespective of the order in which the input records are presented and it does not presume any canonical
420 distribution for input data @cite article::clique::1.
422 Here is an example where data in two-dimensional space is clustered using CLIQUE algorithm:
424 from pyclustering.cluster.clique import clique, clique_visualizer
425 from pyclustering.utils import read_sample
426 from pyclustering.samples.definitions import FCPS_SAMPLES
428 # read two-dimensional input data 'Target'
429 data = read_sample(FCPS_SAMPLES.SAMPLE_TARGET)
431 # create CLIQUE algorithm for processing
432 intervals = 10 # defines amount of cells in grid in each dimension
433 threshold = 0 # lets consider each point as non-outlier
434 clique_instance = clique(data, intervals, threshold)
436 # start clustering process and obtain results
437 clique_instance.process()
438 clusters = clique_instance.get_clusters() # allocated clusters
439 noise = clique_instance.get_noise() # points that are considered as outliers (in this example should be empty)
440 cells = clique_instance.get_cells() # CLIQUE blocks that forms grid
442 print("Amount of clusters:", len(clusters))
444 # visualize clustering results
445 clique_visualizer.show_grid(cells, data) # show grid that has been formed by the algorithm
446 clique_visualizer.show_clusters(data, clusters, noise) # show clustering results
449 In this example 6 clusters are allocated including four small cluster where each such small cluster consists of
450 three points. There are visualized clustering results - grid that has been formed by CLIQUE algorithm with
451 density and clusters itself:
452 @image html clique_clustering_target.png "Fig. 1. CLIQUE clustering results (grid and clusters itself)."
454 Sometimes such small clusters should be considered as outliers taking into account fact that two clusters in the
455 central are relatively huge. To treat them as a noise threshold value should be increased:
458 threshold = 3 # block that contains 3 or less points is considered as a outlier as well as its points
459 clique_instance = clique(data, intervals, threshold)
462 Two clusters are allocated, but in this case some points in cluster-"circle" are also considered as outliers,
463 because CLIQUE operates with blocks, not with points:
464 @image html clique_clustering_with_noise.png "Fig. 2. Noise allocation by CLIQUE."
468 def __init__(self, data, amount_intervals, density_threshold, **kwargs):
470 @brief Create CLIQUE clustering algorithm.
472 @param[in] data (list): Input data (list of points) that should be clustered.
473 @param[in] amount_intervals (uint): Amount of intervals in each dimension that defines amount of CLIQUE blocks
474 as \f[N_{blocks} = intervals^{dimensions}\f].
475 @param[in] density_threshold (uint): Minimum number of points that should contain CLIQUE block to consider its
476 points as non-outliers.
477 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'ccore').
479 <b>Keyword Args:</b><br>
480 - ccore (bool): By default is True. If True then C++ implementation is used for cluster analysis, otherwise
481 Python implementation is used.
488 self.
__ccore = kwargs.get(
'ccore',
True)
490 self.
__ccore = ccore_library.workable()
503 @brief Performs clustering process in line with rules of CLIQUE clustering algorithm.
505 @return (clique) Returns itself (CLIQUE instance).
523 @brief Returns allocated clusters.
525 @remark Allocated clusters are returned only after data processing (method process()). Otherwise empty list is returned.
527 @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
538 @brief Returns allocated noise.
540 @remark Allocated noise is returned only after data processing (method process()). Otherwise empty list is returned.
542 @return (list) List of indexes that are marked as a noise.
553 @brief Returns CLIQUE blocks that are formed during clustering process.
554 @details CLIQUE blocks can be used for visualization purposes. Each CLIQUE block contain its logical location
555 in grid, spatial location in data space and points that belong to block.
557 @return (list) List of CLIQUE blocks.
565 @brief Returns clustering result representation type that indicate how clusters are encoded.
567 @return (type_encoding) Clustering result representation.
573 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
576 def __process_by_ccore(self):
578 @brief Performs cluster analysis using C++ implementation of CLIQUE algorithm that is used by default if
579 user's target platform is supported.
584 (self.
__clusters, self.
__noise, block_logical_locations, max_corners, min_corners, block_points) = result
586 amount_cells = len(block_logical_locations)
587 for i
in range(amount_cells):
594 def __process_by_python(self):
596 @brief Performs cluster analysis using Python implementation of CLIQUE algorithm.
605 def __validate_arguments(self):
607 @brief Check input arguments of CLIQUE algorithm and if one of them is not correct then appropriate exception
613 raise ValueError(
"Empty input data. Data should contain at least one point.")
616 raise ValueError(
"Incorrect amount of intervals '%d'. Amount of intervals value should be greater than 0." % self.
__amount_intervals)
619 raise ValueError(
"Incorrect density threshold '%f'. Density threshold should not be negative." % self.
__density_threshold)
622 def __allocate_clusters(self):
624 @brief Performs cluster analysis using formed CLIQUE blocks.
628 if cell.visited
is False:
632 def __expand_cluster(self, cell):
634 @brief Tries to expand cluster from specified cell.
635 @details During expanding points are marked as noise or append to new cluster.
637 @param[in] cell (clique_block): CLIQUE block from that cluster should be expanded.
643 if len(cell.points) > 0:
644 self.
__noise.extend(cell.points)
648 cluster = cell.points[:]
651 for neighbor
in neighbors:
653 cluster.extend(neighbor.points)
656 elif len(neighbor.points) > 0:
657 self.
__noise.extend(neighbor.points)
662 def __get_neighbors(self, cell):
664 @brief Returns neighbors for specified CLIQUE block as clique_block objects.
666 @return (list) Neighbors as clique_block objects.
672 for i
in range(len(location_neighbors)):
676 if not candidate_neighbor.visited:
677 candidate_neighbor.visited =
True
678 neighbors.append(candidate_neighbor)
683 def __create_grid(self):
685 @brief Creates CLIQUE grid that consists of CLIQUE blocks for clustering process.
689 dimension = len(self.
__data[0])
691 cell_sizes = [dimension_length / self.
__amount_intervals for dimension_length
in data_sizes]
696 point_availability = [
True] * len(self.
__data)
698 for index_cell
in range(len(self.
__cells)):
699 logical_location = iterator.get_coordinate()
702 self.
__cells[index_cell].logical_location = logical_location[:]
704 cur_max_corner, cur_min_corner = self.
__get_spatial_location(logical_location, min_corner, max_corner, cell_sizes)
707 self.
__cells[index_cell].capture_points(self.
__data, point_availability)
712 def __location_to_key(self, location):
714 @brief Forms key using logical location of a CLIQUE block.
716 @return (string) Key for CLIQUE block map.
719 return ''.join(str(e) +
'.' for e
in location)
722 def __get_spatial_location(self, logical_location, min_corner, max_corner, cell_sizes):
724 @brief Calculates spatial location for CLIQUE block with logical coordinates defined by logical_location.
726 @param[in] logical_location (list): Logical location of CLIQUE block for that spatial location should be calculated.
727 @param[in] min_corner (list): Minimum corner of an input data.
728 @param[in] max_corner (list): Maximum corner of an input data.
729 @param[in] cell_sizes (list): Size of CLIQUE block in each dimension.
731 @return (list, list): Maximum and minimum corners for the specified CLIQUE block.
734 cur_min_corner = min_corner[:]
735 cur_max_corner = min_corner[:]
736 dimension = len(self.
__data[0])
737 for index_dimension
in range(dimension):
738 cur_min_corner[index_dimension] += cell_sizes[index_dimension] * logical_location[index_dimension]
741 cur_max_corner[index_dimension] = max_corner[index_dimension]
743 cur_max_corner[index_dimension] = cur_min_corner[index_dimension] + cell_sizes[index_dimension]
745 return cur_max_corner, cur_min_corner
748 def __get_data_size_derscription(self):
750 @brief Calculates input data description that is required to create CLIQUE grid.
752 @return (list, list, list): Data size in each dimension, minimum and maximum corners.
755 min_corner = self.
__data[0][:]
756 max_corner = self.
__data[0][:]
758 dimension = len(self.
__data[0])
760 for index_point
in range(1, len(self.
__data)):
761 for index_dimension
in range(dimension):
762 coordinate = self.
__data[index_point][index_dimension]
763 if coordinate > max_corner[index_dimension]:
764 max_corner[index_dimension] = coordinate
766 if coordinate < min_corner[index_dimension]:
767 min_corner[index_dimension] = coordinate
769 data_sizes = [0.0] * dimension
770 for index_dimension
in range(dimension):
771 data_sizes[index_dimension] = max_corner[index_dimension] - min_corner[index_dimension]
773 return data_sizes, min_corner, max_corner