3 @brief Cluster analysis algorithm: CURE 4 @details Implementation based on paper @cite article::cure::1. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 36 from pyclustering.core.wrapper
import ccore_library
38 import pyclustering.core.cure_wrapper
as wrapper
43 @brief Represents data cluster in CURE term. 44 @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center. 50 @brief Constructor of CURE cluster. 52 @param[in] point (list): Point represented by list of coordinates. 53 @param[in] index (uint): Index point in dataset. 83 @brief Displays distance to closest cluster and points that are contained by current cluster. 91 @brief Class represents clustering algorithm CURE with KD-tree optimization. 92 @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance. 94 Here is an example how to perform cluster analysis of sample 'Lsun': 96 from pyclustering.cluster import cluster_visualizer; 97 from pyclustering.cluster.cure import cure; 98 from pyclustering.utils import read_sample; 99 from pyclustering.samples.definitions import FCPS_SAMPLES; 101 # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ]. 102 input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN); 104 # Allocate three clusters. 105 cure_instance = cure(input_data, 3); 106 cure_instance.process(); 107 clusters = cure_instance.get_clusters(); 109 # Visualize allocated clusters. 110 visualizer = cluster_visualizer(); 111 visualizer.append_clusters(clusters, input_data); 117 def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
119 @brief Constructor of clustering algorithm CURE. 121 @param[in] data (array_like): Input data that should be processed. 122 @param[in] number_cluster (uint): Number of clusters that should be allocated. 123 @param[in] number_represent_points (uint): Number of representative points for each cluster. 124 @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1. 125 @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving. 141 self.
__ccore = ccore_library.workable()
148 @brief Performs cluster analysis in line with rules of CURE algorithm. 150 @return (cure) Returns itself (CURE instance). 165 def __process_by_ccore(self):
167 @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library). 173 self.
__clusters = wrapper.cure_get_clusters(cure_data_pointer)
174 self.
__representors = wrapper.cure_get_representors(cure_data_pointer)
175 self.
__means = wrapper.cure_get_means(cure_data_pointer)
177 wrapper.cure_data_destroy(cure_data_pointer)
180 def __process_by_python(self):
182 @brief Performs cluster analysis using python code. 190 cluster2 = cluster1.closest
203 cluster_relocation_requests = []
207 merged_cluster.closest = self.
__queue[0]
208 merged_cluster.distance = self.
__cluster_distance(merged_cluster, merged_cluster.closest)
213 if distance < merged_cluster.distance:
214 merged_cluster.closest = item
215 merged_cluster.distance = distance
218 if (item.closest
is cluster1)
or (item.closest
is cluster2):
221 if item.distance < distance:
227 if item.closest
is None:
228 item.closest = merged_cluster
229 item.distance = distance
232 item.closest = merged_cluster
233 item.distance = distance
235 cluster_relocation_requests.append(item)
239 for item
in cluster_relocation_requests:
243 self.
__clusters = [cure_cluster_unit.indexes
for cure_cluster_unit
in self.
__queue]
245 self.
__means = [cure_cluster_unit.mean
for cure_cluster_unit
in self.
__queue]
250 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 252 @return (list) List of allocated clusters. 255 @see get_representors() 265 @brief Returns list of point-representors of each cluster. 266 @details Cluster index should be used for navigation between lists of point-representors. 268 @return (list) List of point-representors of each cluster. 280 @brief Returns list of mean values of each cluster. 281 @details Cluster index should be used for navigation between mean values. 283 @return (list) List of mean values of each cluster. 286 @see get_representors() 295 @brief Returns clustering result representation type that indicate how clusters are encoded. 297 @return (type_encoding) Clustering result representation. 303 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
306 def __prepare_data_points(self, sample):
308 @brief Prepare data points for clustering. 309 @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__. 311 @return (list) Returns sample in list format. 314 if isinstance(sample, numpy.ndarray):
315 return sample.tolist()
320 def __validate_arguments(self):
322 @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception 328 raise ValueError(
"Empty input data. Data should contain at least one point.")
331 raise ValueError(
"Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.
__number_cluster)
334 raise ValueError(
"Incorrect compression level '%f'. Compression should not be negative." % self.
__compression)
337 raise ValueError(
"Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.
__number_cluster)
340 def __insert_cluster(self, cluster):
342 @brief Insert cluster to the list (sorted queue) in line with sequence order (distance). 344 @param[in] cluster (cure_cluster): Cluster that should be inserted. 348 for index
in range(len(self.
__queue)):
349 if cluster.distance < self.
__queue[index].distance:
350 self.
__queue.insert(index, cluster)
356 def __relocate_cluster(self, cluster):
358 @brief Relocate cluster in list in line with distance order. 360 @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order. 368 def __closest_cluster(self, cluster, distance):
370 @brief Find closest cluster to the specified cluster in line with distance. 372 @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found. 373 @param[in] distance (double): Closest distance to the previous cluster. 375 @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned. 379 nearest_cluster =
None 380 nearest_distance = float(
'inf')
382 real_euclidean_distance = distance ** 0.5
384 for point
in cluster.rep:
386 nearest_nodes = self.
__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
387 for (candidate_distance, kdtree_node)
in nearest_nodes:
388 if (candidate_distance < nearest_distance)
and (kdtree_node
is not None)
and (kdtree_node.payload
is not cluster):
389 nearest_distance = candidate_distance
390 nearest_cluster = kdtree_node.payload
392 return (nearest_cluster, nearest_distance)
395 def __insert_represented_points(self, cluster):
397 @brief Insert representation points to the k-d tree. 399 @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted. 403 for point
in cluster.rep:
404 self.
__tree.insert(point, cluster)
407 def __delete_represented_points(self, cluster):
409 @brief Remove representation points of clusters from the k-d tree 411 @param[in] cluster (cure_cluster): Cluster whose representation points should be removed. 415 for point
in cluster.rep:
416 self.
__tree.remove(point, payload=cluster)
419 def __merge_clusters(self, cluster1, cluster2):
421 @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster. 423 @param[in] cluster1 (cure_cluster): Cluster that should be merged. 424 @param[in] cluster2 (cure_cluster): Cluster that should be merged. 426 @return (cure_cluster) New merged CURE cluster. 432 merged_cluster.points = cluster1.points + cluster2.points
433 merged_cluster.indexes = cluster1.indexes + cluster2.indexes
436 dimension = len(cluster1.mean)
437 merged_cluster.mean = [0] * dimension
438 if merged_cluster.points[1:] == merged_cluster.points[:-1]:
439 merged_cluster.mean = merged_cluster.points[0]
441 for index
in range(dimension):
442 merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
450 for point
in merged_cluster.points:
453 minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
456 minimal_distance = min([euclidean_distance_square(point, p)
for p
in temporary])
459 if minimal_distance >= maximal_distance:
460 maximal_distance = minimal_distance
461 maximal_point = point
463 if maximal_point
not in temporary:
464 temporary.append(maximal_point)
466 for point
in temporary:
467 representative_point = [0] * dimension
468 for index
in range(dimension):
469 representative_point[index] = point[index] + self.
__compression * (merged_cluster.mean[index] - point[index])
471 merged_cluster.rep.append(representative_point)
473 return merged_cluster
476 def __create_queue(self):
478 @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point. 480 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. 482 @return (list) Create queue of sorted clusters by distance between them. 489 for i
in range(0, len(self.
__queue)):
490 minimal_distance = float(
'inf')
491 closest_index_cluster = -1
493 for k
in range(0, len(self.
__queue)):
496 if dist < minimal_distance:
497 minimal_distance = dist
498 closest_index_cluster = k
501 self.
__queue[i].distance = minimal_distance
504 self.
__queue.sort(key =
lambda x: x.distance, reverse =
False)
507 def __create_kdtree(self):
509 @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set. 511 @return (kdtree) k-d tree that consist of representative points of CURE clusters. 516 for current_cluster
in self.
__queue:
517 for representative_point
in current_cluster.rep:
518 self.
__tree.insert(representative_point, current_cluster)
521 def __cluster_distance(self, cluster1, cluster2):
523 @brief Calculate minimal distance between clusters using representative points. 525 @param[in] cluster1 (cure_cluster): The first cluster. 526 @param[in] cluster2 (cure_cluster): The second cluster. 528 @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters. 532 distance = float(
'inf')
533 for i
in range(0, len(cluster1.rep)):
534 for k
in range(0, len(cluster2.rep)):
535 dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k]);
def __insert_represented_points(self, cluster)
Insert representation points to the k-d tree.
Class represents clustering algorithm CURE with KD-tree optimization.
Represents data cluster in CURE term.
rep
List of points that represents clusters.
def __validate_arguments(self)
Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception ...
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Utils that are used by modules of pyclustering.
def get_means(self)
Returns list of mean values of each cluster.
Module for representing clustering results.
indexes
Point indexes in dataset.
def __init__(self, point, index)
Constructor of CURE cluster.
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
def __process_by_python(self)
Performs cluster analysis using python code.
def __delete_represented_points(self, cluster)
Remove representation points of clusters from the k-d tree.
points
List of points that make up cluster.
closest
Pointer to the closest cluster.
def get_representors(self)
Returns list of point-representors of each cluster.
distance
Distance to the closest cluster.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __create_kdtree(self)
Create k-d tree in line with created clusters.
def process(self)
Performs cluster analysis in line with rules of CURE algorithm.
def __insert_cluster(self, cluster)
Insert cluster to the list (sorted queue) in line with sequence order (distance). ...
def __create_queue(self)
Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbo...
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
mean
Mean of points that make up cluster.
def __cluster_distance(self, cluster1, cluster2)
Calculate minimal distance between clusters using representative points.
def __closest_cluster(self, cluster, distance)
Find closest cluster to the specified cluster in line with distance.
def __relocate_cluster(self, cluster)
Relocate cluster in list in line with distance order.
def __merge_clusters(self, cluster1, cluster2)
Merges two clusters and returns new merged cluster.
def __repr__(self)
Displays distance to closest cluster and points that are contained by current cluster.
def __init__(self, data, number_cluster, number_represent_points=5, compression=0.5, ccore=True)
Constructor of clustering algorithm CURE.
__number_represent_points
def __prepare_data_points(self, sample)
Prepare data points for clustering.