3 @brief Cluster analysis algorithm: CURE 4 @details Implementation based on paper @cite article::cure::1. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 36 from pyclustering.core.wrapper
import ccore_library
38 import pyclustering.core.cure_wrapper
as wrapper
43 @brief Represents data cluster in CURE term. 44 @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center. 50 @brief Constructor of CURE cluster. 52 @param[in] point (list): Point represented by list of coordinates. 53 @param[in] index (uint): Index point in dataset. 83 @brief Displays distance to closest cluster and points that are contained by current cluster. 91 @brief Class represents clustering algorithm CURE with KD-tree optimization. 92 @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance. 94 Here is an example how to perform cluster analysis of sample 'Lsun': 96 from pyclustering.cluster import cluster_visualizer; 97 from pyclustering.cluster.cure import cure; 98 from pyclustering.utils import read_sample; 99 from pyclustering.samples.definitions import FCPS_SAMPLES; 101 # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ]. 102 input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN); 104 # Allocate three clusters. 105 cure_instance = cure(input_data, 3); 106 cure_instance.process(); 107 clusters = cure_instance.get_clusters(); 109 # Visualize allocated clusters. 110 visualizer = cluster_visualizer(); 111 visualizer.append_clusters(clusters, input_data); 117 def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
119 @brief Constructor of clustering algorithm CURE. 121 @param[in] data (array_like): Input data that should be processed. 122 @param[in] number_cluster (uint): Number of clusters that should be allocated. 123 @param[in] number_represent_points (uint): Number of representative points for each cluster. 124 @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1. 125 @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving. 141 self.
__ccore = ccore_library.workable()
148 @brief Performs cluster analysis in line with rules of CURE algorithm. 150 @remark Results of clustering can be obtained using corresponding get methods. 163 def __process_by_ccore(self):
165 @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library). 171 self.
__clusters = wrapper.cure_get_clusters(cure_data_pointer)
172 self.
__representors = wrapper.cure_get_representors(cure_data_pointer)
173 self.
__means = wrapper.cure_get_means(cure_data_pointer)
175 wrapper.cure_data_destroy(cure_data_pointer)
178 def __process_by_python(self):
180 @brief Performs cluster analysis using python code. 188 cluster2 = cluster1.closest
201 cluster_relocation_requests = []
205 merged_cluster.closest = self.
__queue[0]
206 merged_cluster.distance = self.
__cluster_distance(merged_cluster, merged_cluster.closest)
211 if distance < merged_cluster.distance:
212 merged_cluster.closest = item
213 merged_cluster.distance = distance
216 if (item.closest
is cluster1)
or (item.closest
is cluster2):
219 if item.distance < distance:
225 if item.closest
is None:
226 item.closest = merged_cluster
227 item.distance = distance
230 item.closest = merged_cluster
231 item.distance = distance
233 cluster_relocation_requests.append(item)
237 for item
in cluster_relocation_requests:
241 self.
__clusters = [cure_cluster_unit.indexes
for cure_cluster_unit
in self.
__queue]
243 self.
__means = [cure_cluster_unit.mean
for cure_cluster_unit
in self.
__queue]
248 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 250 @return (list) List of allocated clusters. 253 @see get_representors() 263 @brief Returns list of point-representors of each cluster. 264 @details Cluster index should be used for navigation between lists of point-representors. 266 @return (list) List of point-representors of each cluster. 278 @brief Returns list of mean values of each cluster. 279 @details Cluster index should be used for navigation between mean values. 281 @return (list) List of mean values of each cluster. 284 @see get_representors() 293 @brief Returns clustering result representation type that indicate how clusters are encoded. 295 @return (type_encoding) Clustering result representation. 301 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
304 def __prepare_data_points(self, sample):
306 @brief Prepare data points for clustering. 307 @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__. 309 @return (list) Returns sample in list format. 312 if isinstance(sample, numpy.ndarray):
313 return sample.tolist()
318 def __validate_arguments(self):
320 @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception 326 raise ValueError(
"Empty input data. Data should contain at least one point.")
329 raise ValueError(
"Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.
__number_cluster)
332 raise ValueError(
"Incorrect compression level '%f'. Compression should not be negative." % self.
__compression)
335 raise ValueError(
"Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.
__number_cluster)
338 def __insert_cluster(self, cluster):
340 @brief Insert cluster to the list (sorted queue) in line with sequence order (distance). 342 @param[in] cluster (cure_cluster): Cluster that should be inserted. 346 for index
in range(len(self.
__queue)):
347 if cluster.distance < self.
__queue[index].distance:
348 self.
__queue.insert(index, cluster)
354 def __relocate_cluster(self, cluster):
356 @brief Relocate cluster in list in line with distance order. 358 @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order. 366 def __closest_cluster(self, cluster, distance):
368 @brief Find closest cluster to the specified cluster in line with distance. 370 @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found. 371 @param[in] distance (double): Closest distance to the previous cluster. 373 @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned. 377 nearest_cluster =
None 378 nearest_distance = float(
'inf')
380 real_euclidean_distance = distance ** 0.5
382 for point
in cluster.rep:
384 nearest_nodes = self.
__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
385 for (candidate_distance, kdtree_node)
in nearest_nodes:
386 if (candidate_distance < nearest_distance)
and (kdtree_node
is not None)
and (kdtree_node.payload
is not cluster):
387 nearest_distance = candidate_distance
388 nearest_cluster = kdtree_node.payload
390 return (nearest_cluster, nearest_distance)
393 def __insert_represented_points(self, cluster):
395 @brief Insert representation points to the k-d tree. 397 @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted. 401 for point
in cluster.rep:
402 self.
__tree.insert(point, cluster)
405 def __delete_represented_points(self, cluster):
407 @brief Remove representation points of clusters from the k-d tree 409 @param[in] cluster (cure_cluster): Cluster whose representation points should be removed. 413 for point
in cluster.rep:
414 self.
__tree.remove(point, payload=cluster)
417 def __merge_clusters(self, cluster1, cluster2):
419 @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster. 421 @param[in] cluster1 (cure_cluster): Cluster that should be merged. 422 @param[in] cluster2 (cure_cluster): Cluster that should be merged. 424 @return (cure_cluster) New merged CURE cluster. 430 merged_cluster.points = cluster1.points + cluster2.points
431 merged_cluster.indexes = cluster1.indexes + cluster2.indexes
434 dimension = len(cluster1.mean)
435 merged_cluster.mean = [0] * dimension
436 if merged_cluster.points[1:] == merged_cluster.points[:-1]:
437 merged_cluster.mean = merged_cluster.points[0]
439 for index
in range(dimension):
440 merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
448 for point
in merged_cluster.points:
451 minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
454 minimal_distance = min([euclidean_distance_square(point, p)
for p
in temporary])
457 if minimal_distance >= maximal_distance:
458 maximal_distance = minimal_distance
459 maximal_point = point
461 if maximal_point
not in temporary:
462 temporary.append(maximal_point)
464 for point
in temporary:
465 representative_point = [0] * dimension
466 for index
in range(dimension):
467 representative_point[index] = point[index] + self.
__compression * (merged_cluster.mean[index] - point[index])
469 merged_cluster.rep.append(representative_point)
471 return merged_cluster
474 def __create_queue(self):
476 @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point. 478 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. 480 @return (list) Create queue of sorted clusters by distance between them. 487 for i
in range(0, len(self.
__queue)):
488 minimal_distance = float(
'inf')
489 closest_index_cluster = -1
491 for k
in range(0, len(self.
__queue)):
494 if dist < minimal_distance:
495 minimal_distance = dist
496 closest_index_cluster = k
499 self.
__queue[i].distance = minimal_distance
502 self.
__queue.sort(key =
lambda x: x.distance, reverse =
False)
505 def __create_kdtree(self):
507 @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set. 509 @return (kdtree) k-d tree that consist of representative points of CURE clusters. 514 for current_cluster
in self.
__queue:
515 for representative_point
in current_cluster.rep:
516 self.
__tree.insert(representative_point, current_cluster)
519 def __cluster_distance(self, cluster1, cluster2):
521 @brief Calculate minimal distance between clusters using representative points. 523 @param[in] cluster1 (cure_cluster): The first cluster. 524 @param[in] cluster2 (cure_cluster): The second cluster. 526 @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters. 530 distance = float(
'inf')
531 for i
in range(0, len(cluster1.rep)):
532 for k
in range(0, len(cluster2.rep)):
533 dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k]);
def __insert_represented_points(self, cluster)
Insert representation points to the k-d tree.
Class represents clustering algorithm CURE with KD-tree optimization.
Represents data cluster in CURE term.
rep
List of points that represents clusters.
def __validate_arguments(self)
Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception ...
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Utils that are used by modules of pyclustering.
def get_means(self)
Returns list of mean values of each cluster.
Module for representing clustering results.
indexes
Point indexes in dataset.
def __init__(self, point, index)
Constructor of CURE cluster.
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
def __process_by_python(self)
Performs cluster analysis using python code.
def __delete_represented_points(self, cluster)
Remove representation points of clusters from the k-d tree.
points
List of points that make up cluster.
closest
Pointer to the closest cluster.
def get_representors(self)
Returns list of point-representors of each cluster.
distance
Distance to the closest cluster.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __create_kdtree(self)
Create k-d tree in line with created clusters.
def process(self)
Performs cluster analysis in line with rules of CURE algorithm.
def __insert_cluster(self, cluster)
Insert cluster to the list (sorted queue) in line with sequence order (distance). ...
def __create_queue(self)
Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbo...
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
mean
Mean of points that make up cluster.
def __cluster_distance(self, cluster1, cluster2)
Calculate minimal distance between clusters using representative points.
def __closest_cluster(self, cluster, distance)
Find closest cluster to the specified cluster in line with distance.
def __relocate_cluster(self, cluster)
Relocate cluster in list in line with distance order.
def __merge_clusters(self, cluster1, cluster2)
Merges two clusters and returns new merged cluster.
def __repr__(self)
Displays distance to closest cluster and points that are contained by current cluster.
def __init__(self, data, number_cluster, number_represent_points=5, compression=0.5, ccore=True)
Constructor of clustering algorithm CURE.
__number_represent_points
def __prepare_data_points(self, sample)
Prepare data points for clustering.