3 @brief Cluster analysis algorithm: CURE
4 @details Implementation based on paper @cite article::cure::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
21 from pyclustering.core.wrapper
import ccore_library
23 import pyclustering.core.cure_wrapper
as wrapper
28 @brief Represents data cluster in CURE term.
29 @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
35 @brief Constructor of CURE cluster.
37 @param[in] point (list): Point represented by list of coordinates.
38 @param[in] index (uint): Index point in dataset.
68 @brief Displays distance to closest cluster and points that are contained by current cluster.
76 @brief Class represents clustering algorithm CURE with KD-tree optimization.
77 @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
79 Here is an example how to perform cluster analysis of sample 'Lsun':
81 from pyclustering.cluster import cluster_visualizer;
82 from pyclustering.cluster.cure import cure;
83 from pyclustering.utils import read_sample;
84 from pyclustering.samples.definitions import FCPS_SAMPLES;
86 # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ].
87 input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
89 # Allocate three clusters.
90 cure_instance = cure(input_data, 3);
91 cure_instance.process();
92 clusters = cure_instance.get_clusters();
94 # Visualize allocated clusters.
95 visualizer = cluster_visualizer();
96 visualizer.append_clusters(clusters, input_data);
102 def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
104 @brief Constructor of clustering algorithm CURE.
106 @param[in] data (array_like): Input data that should be processed.
107 @param[in] number_cluster (uint): Number of clusters that should be allocated.
108 @param[in] number_represent_points (uint): Number of representative points for each cluster.
109 @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
110 @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving.
126 self.
__ccore = ccore_library.workable()
133 @brief Performs cluster analysis in line with rules of CURE algorithm.
135 @return (cure) Returns itself (CURE instance).
150 def __process_by_ccore(self):
152 @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
158 self.
__clusters = wrapper.cure_get_clusters(cure_data_pointer)
159 self.
__representors = wrapper.cure_get_representors(cure_data_pointer)
160 self.
__means = wrapper.cure_get_means(cure_data_pointer)
162 wrapper.cure_data_destroy(cure_data_pointer)
165 def __process_by_python(self):
167 @brief Performs cluster analysis using python code.
175 cluster2 = cluster1.closest
188 cluster_relocation_requests = []
192 merged_cluster.closest = self.
__queue[0]
193 merged_cluster.distance = self.
__cluster_distance(merged_cluster, merged_cluster.closest)
198 if distance < merged_cluster.distance:
199 merged_cluster.closest = item
200 merged_cluster.distance = distance
203 if (item.closest
is cluster1)
or (item.closest
is cluster2):
206 if item.distance < distance:
212 if item.closest
is None:
213 item.closest = merged_cluster
214 item.distance = distance
217 item.closest = merged_cluster
218 item.distance = distance
220 cluster_relocation_requests.append(item)
224 for item
in cluster_relocation_requests:
228 self.
__clusters = [cure_cluster_unit.indexes
for cure_cluster_unit
in self.
__queue]
230 self.
__means = [cure_cluster_unit.mean
for cure_cluster_unit
in self.
__queue]
235 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
237 @return (list) List of allocated clusters.
240 @see get_representors()
250 @brief Returns list of point-representors of each cluster.
251 @details Cluster index should be used for navigation between lists of point-representors.
253 @return (list) List of point-representors of each cluster.
265 @brief Returns list of mean values of each cluster.
266 @details Cluster index should be used for navigation between mean values.
268 @return (list) List of mean values of each cluster.
271 @see get_representors()
280 @brief Returns clustering result representation type that indicate how clusters are encoded.
282 @return (type_encoding) Clustering result representation.
288 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
291 def __prepare_data_points(self, sample):
293 @brief Prepare data points for clustering.
294 @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__.
296 @return (list) Returns sample in list format.
299 if isinstance(sample, numpy.ndarray):
300 return sample.tolist()
305 def __validate_arguments(self):
307 @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception
313 raise ValueError(
"Empty input data. Data should contain at least one point.")
316 raise ValueError(
"Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.
__number_cluster)
319 raise ValueError(
"Incorrect compression level '%f'. Compression should not be negative." % self.
__compression)
322 raise ValueError(
"Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.
__number_cluster)
325 def __insert_cluster(self, cluster):
327 @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
329 @param[in] cluster (cure_cluster): Cluster that should be inserted.
333 for index
in range(len(self.
__queue)):
334 if cluster.distance < self.
__queue[index].distance:
335 self.
__queue.insert(index, cluster)
341 def __relocate_cluster(self, cluster):
343 @brief Relocate cluster in list in line with distance order.
345 @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
353 def __closest_cluster(self, cluster, distance):
355 @brief Find closest cluster to the specified cluster in line with distance.
357 @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
358 @param[in] distance (double): Closest distance to the previous cluster.
360 @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
364 nearest_cluster =
None
365 nearest_distance = float(
'inf')
367 real_euclidean_distance = distance ** 0.5
369 for point
in cluster.rep:
371 nearest_nodes = self.
__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
372 for (candidate_distance, kdtree_node)
in nearest_nodes:
373 if (candidate_distance < nearest_distance)
and (kdtree_node
is not None)
and (kdtree_node.payload
is not cluster):
374 nearest_distance = candidate_distance
375 nearest_cluster = kdtree_node.payload
377 return (nearest_cluster, nearest_distance)
380 def __insert_represented_points(self, cluster):
382 @brief Insert representation points to the k-d tree.
384 @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
388 for point
in cluster.rep:
389 self.
__tree.insert(point, cluster)
392 def __delete_represented_points(self, cluster):
394 @brief Remove representation points of clusters from the k-d tree
396 @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
400 for point
in cluster.rep:
401 self.
__tree.remove(point, payload=cluster)
404 def __merge_clusters(self, cluster1, cluster2):
406 @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
408 @param[in] cluster1 (cure_cluster): Cluster that should be merged.
409 @param[in] cluster2 (cure_cluster): Cluster that should be merged.
411 @return (cure_cluster) New merged CURE cluster.
417 merged_cluster.points = cluster1.points + cluster2.points
418 merged_cluster.indexes = cluster1.indexes + cluster2.indexes
421 dimension = len(cluster1.mean)
422 merged_cluster.mean = [0] * dimension
423 if merged_cluster.points[1:] == merged_cluster.points[:-1]:
424 merged_cluster.mean = merged_cluster.points[0]
426 for index
in range(dimension):
427 merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
435 for point
in merged_cluster.points:
438 minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
441 minimal_distance = min([euclidean_distance_square(point, p)
for p
in temporary])
444 if minimal_distance >= maximal_distance:
445 maximal_distance = minimal_distance
446 maximal_point = point
448 if maximal_point
not in temporary:
449 temporary.append(maximal_point)
451 for point
in temporary:
452 representative_point = [0] * dimension
453 for index
in range(dimension):
454 representative_point[index] = point[index] + self.
__compression * (merged_cluster.mean[index] - point[index])
456 merged_cluster.rep.append(representative_point)
458 return merged_cluster
461 def __create_queue(self):
463 @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
465 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
467 @return (list) Create queue of sorted clusters by distance between them.
474 for i
in range(0, len(self.
__queue)):
475 minimal_distance = float(
'inf')
476 closest_index_cluster = -1
478 for k
in range(0, len(self.
__queue)):
481 if dist < minimal_distance:
482 minimal_distance = dist
483 closest_index_cluster = k
486 self.
__queue[i].distance = minimal_distance
489 self.
__queue.sort(key=
lambda x: x.distance, reverse =
False)
492 def __create_kdtree(self):
494 @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
496 @return (kdtree) k-d tree that consist of representative points of CURE clusters.
500 representatives, payloads = [], []
501 for current_cluster
in self.
__queue:
502 for representative_point
in current_cluster.rep:
503 representatives.append(representative_point)
504 payloads.append(current_cluster)
511 def __cluster_distance(self, cluster1, cluster2):
513 @brief Calculate minimal distance between clusters using representative points.
515 @param[in] cluster1 (cure_cluster): The first cluster.
516 @param[in] cluster2 (cure_cluster): The second cluster.
518 @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
522 distance = float(
'inf')
523 for i
in range(0, len(cluster1.rep)):
524 for k
in range(0, len(cluster2.rep)):
525 dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k])