pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
cure.py
1 """!
2 
3 @brief Cluster analysis algorithm: CURE
4 @details Implementation based on paper @cite article::cure::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 import numpy
14 
15 from pyclustering.cluster.encoder import type_encoding
16 
17 from pyclustering.utils import euclidean_distance_square
18 
19 from pyclustering.container.kdtree import kdtree
20 
21 from pyclustering.core.wrapper import ccore_library
22 
23 import pyclustering.core.cure_wrapper as wrapper
24 
25 
27  """!
28  @brief Represents data cluster in CURE term.
29  @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
30 
31  """
32 
33  def __init__(self, point, index):
34  """!
35  @brief Constructor of CURE cluster.
36 
37  @param[in] point (list): Point represented by list of coordinates.
38  @param[in] index (uint): Index point in dataset.
39 
40  """
41 
42 
43  self.points = [ ]
44 
45 
46  self.indexes = -1
47 
48 
49  self.mean = None
50 
51 
52  self.rep = [ ]
53 
54  if point is not None:
55  self.points = [ point ]
56  self.indexes = [ index ]
57  self.mean = point
58  self.rep = [ point ]
59 
60 
61  self.closest = None
62 
63 
64  self.distance = float('inf') # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
65 
66  def __repr__(self):
67  """!
68  @brief Displays distance to closest cluster and points that are contained by current cluster.
69 
70  """
71  return "%s, %s" % (self.distance, self.points)
72 
73 
74 class cure:
75  """!
76  @brief Class represents clustering algorithm CURE with KD-tree optimization.
77  @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
78 
79  Here is an example how to perform cluster analysis of sample 'Lsun':
80  @code
81  from pyclustering.cluster import cluster_visualizer;
82  from pyclustering.cluster.cure import cure;
83  from pyclustering.utils import read_sample;
84  from pyclustering.samples.definitions import FCPS_SAMPLES;
85 
86  # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ].
87  input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
88 
89  # Allocate three clusters.
90  cure_instance = cure(input_data, 3);
91  cure_instance.process();
92  clusters = cure_instance.get_clusters();
93 
94  # Visualize allocated clusters.
95  visualizer = cluster_visualizer();
96  visualizer.append_clusters(clusters, input_data);
97  visualizer.show();
98  @endcode
99 
100  """
101 
102  def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
103  """!
104  @brief Constructor of clustering algorithm CURE.
105 
106  @param[in] data (array_like): Input data that should be processed.
107  @param[in] number_cluster (uint): Number of clusters that should be allocated.
108  @param[in] number_represent_points (uint): Number of representative points for each cluster.
109  @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
110  @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving.
111 
112  """
113 
114  self.__pointer_data = self.__prepare_data_points(data)
115 
116  self.__clusters = None
117  self.__representors = None
118  self.__means = None
119 
120  self.__number_cluster = number_cluster
121  self.__number_represent_points = number_represent_points
122  self.__compression = compression
123 
124  self.__ccore = ccore
125  if self.__ccore:
126  self.__ccore = ccore_library.workable()
127 
128  self.__validate_arguments()
129 
130 
131  def process(self):
132  """!
133  @brief Performs cluster analysis in line with rules of CURE algorithm.
134 
135  @return (cure) Returns itself (CURE instance).
136 
137  @see get_clusters()
138 
139  """
140 
141  if self.__ccore is True:
142  self.__process_by_ccore()
143 
144  else:
145  self.__process_by_python()
146 
147  return self
148 
149 
150  def __process_by_ccore(self):
151  """!
152  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
153 
154  """
155  cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster,
157 
158  self.__clusters = wrapper.cure_get_clusters(cure_data_pointer)
159  self.__representors = wrapper.cure_get_representors(cure_data_pointer)
160  self.__means = wrapper.cure_get_means(cure_data_pointer)
161 
162  wrapper.cure_data_destroy(cure_data_pointer)
163 
164 
165  def __process_by_python(self):
166  """!
167  @brief Performs cluster analysis using python code.
168 
169  """
170  self.__create_queue() # queue
171  self.__create_kdtree() # create k-d tree
172 
173  while len(self.__queue) > self.__number_cluster:
174  cluster1 = self.__queue[0] # cluster that has nearest neighbor.
175  cluster2 = cluster1.closest # closest cluster.
176 
177  self.__queue.remove(cluster1)
178  self.__queue.remove(cluster2)
179 
180  self.__delete_represented_points(cluster1)
181  self.__delete_represented_points(cluster2)
182 
183  merged_cluster = self.__merge_clusters(cluster1, cluster2)
184 
185  self.__insert_represented_points(merged_cluster)
186 
187  # Pointers to clusters that should be relocated is stored here.
188  cluster_relocation_requests = []
189 
190  # Check for the last cluster
191  if len(self.__queue) > 0:
192  merged_cluster.closest = self.__queue[0] # arbitrary cluster from queue
193  merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest)
194 
195  for item in self.__queue:
196  distance = self.__cluster_distance(merged_cluster, item)
197  # Check if distance between new cluster and current is the best than now.
198  if distance < merged_cluster.distance:
199  merged_cluster.closest = item
200  merged_cluster.distance = distance
201 
202  # Check if current cluster has removed neighbor.
203  if (item.closest is cluster1) or (item.closest is cluster2):
204  # If previous distance was less then distance to new cluster then nearest cluster should
205  # be found in the tree.
206  if item.distance < distance:
207  (item.closest, item.distance) = self.__closest_cluster(item, distance)
208 
209  # TODO: investigation is required. There is assumption that itself and merged cluster
210  # should be always in list of neighbors in line with specified radius. But merged cluster
211  # may not be in list due to error calculation, therefore it should be added manually.
212  if item.closest is None:
213  item.closest = merged_cluster
214  item.distance = distance
215 
216  else:
217  item.closest = merged_cluster
218  item.distance = distance
219 
220  cluster_relocation_requests.append(item)
221 
222  # New cluster and updated clusters should relocated in queue
223  self.__insert_cluster(merged_cluster)
224  for item in cluster_relocation_requests:
225  self.__relocate_cluster(item)
226 
227  # Change cluster representation
228  self.__clusters = [cure_cluster_unit.indexes for cure_cluster_unit in self.__queue]
229  self.__representors = [cure_cluster_unit.rep for cure_cluster_unit in self.__queue]
230  self.__means = [cure_cluster_unit.mean for cure_cluster_unit in self.__queue]
231 
232 
233  def get_clusters(self):
234  """!
235  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
236 
237  @return (list) List of allocated clusters.
238 
239  @see process()
240  @see get_representors()
241  @see get_means()
242 
243  """
244 
245  return self.__clusters
246 
247 
248  def get_representors(self):
249  """!
250  @brief Returns list of point-representors of each cluster.
251  @details Cluster index should be used for navigation between lists of point-representors.
252 
253  @return (list) List of point-representors of each cluster.
254 
255  @see get_clusters()
256  @see get_means()
257 
258  """
259 
260  return self.__representors
261 
262 
263  def get_means(self):
264  """!
265  @brief Returns list of mean values of each cluster.
266  @details Cluster index should be used for navigation between mean values.
267 
268  @return (list) List of mean values of each cluster.
269 
270  @see get_clusters()
271  @see get_representors()
272 
273  """
274 
275  return self.__means
276 
277 
279  """!
280  @brief Returns clustering result representation type that indicate how clusters are encoded.
281 
282  @return (type_encoding) Clustering result representation.
283 
284  @see get_clusters()
285 
286  """
287 
288  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
289 
290 
291  def __prepare_data_points(self, sample):
292  """!
293  @brief Prepare data points for clustering.
294  @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__.
295 
296  @return (list) Returns sample in list format.
297 
298  """
299  if isinstance(sample, numpy.ndarray):
300  return sample.tolist()
301 
302  return sample
303 
304 
305  def __validate_arguments(self):
306  """!
307  @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception
308  is thrown.
309 
310  """
311 
312  if len(self.__pointer_data) == 0:
313  raise ValueError("Empty input data. Data should contain at least one point.")
314 
315  if self.__number_cluster <= 0:
316  raise ValueError("Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.__number_cluster)
317 
318  if self.__compression < 0:
319  raise ValueError("Incorrect compression level '%f'. Compression should not be negative." % self.__compression)
320 
321  if self.__number_represent_points <= 0:
322  raise ValueError("Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.__number_cluster)
323 
324 
325  def __insert_cluster(self, cluster):
326  """!
327  @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
328 
329  @param[in] cluster (cure_cluster): Cluster that should be inserted.
330 
331  """
332 
333  for index in range(len(self.__queue)):
334  if cluster.distance < self.__queue[index].distance:
335  self.__queue.insert(index, cluster)
336  return
337 
338  self.__queue.append(cluster)
339 
340 
341  def __relocate_cluster(self, cluster):
342  """!
343  @brief Relocate cluster in list in line with distance order.
344 
345  @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
346 
347  """
348 
349  self.__queue.remove(cluster)
350  self.__insert_cluster(cluster)
351 
352 
353  def __closest_cluster(self, cluster, distance):
354  """!
355  @brief Find closest cluster to the specified cluster in line with distance.
356 
357  @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
358  @param[in] distance (double): Closest distance to the previous cluster.
359 
360  @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
361 
362  """
363 
364  nearest_cluster = None
365  nearest_distance = float('inf')
366 
367  real_euclidean_distance = distance ** 0.5
368 
369  for point in cluster.rep:
370  # Nearest nodes should be returned (at least it will return itself).
371  nearest_nodes = self.__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
372  for (candidate_distance, kdtree_node) in nearest_nodes:
373  if (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster):
374  nearest_distance = candidate_distance
375  nearest_cluster = kdtree_node.payload
376 
377  return (nearest_cluster, nearest_distance)
378 
379 
380  def __insert_represented_points(self, cluster):
381  """!
382  @brief Insert representation points to the k-d tree.
383 
384  @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
385 
386  """
387 
388  for point in cluster.rep:
389  self.__tree.insert(point, cluster)
390 
391 
392  def __delete_represented_points(self, cluster):
393  """!
394  @brief Remove representation points of clusters from the k-d tree
395 
396  @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
397 
398  """
399 
400  for point in cluster.rep:
401  self.__tree.remove(point, payload=cluster)
402 
403 
404  def __merge_clusters(self, cluster1, cluster2):
405  """!
406  @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
407 
408  @param[in] cluster1 (cure_cluster): Cluster that should be merged.
409  @param[in] cluster2 (cure_cluster): Cluster that should be merged.
410 
411  @return (cure_cluster) New merged CURE cluster.
412 
413  """
414 
415  merged_cluster = cure_cluster(None, None)
416 
417  merged_cluster.points = cluster1.points + cluster2.points
418  merged_cluster.indexes = cluster1.indexes + cluster2.indexes
419 
420  # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
421  dimension = len(cluster1.mean)
422  merged_cluster.mean = [0] * dimension
423  if merged_cluster.points[1:] == merged_cluster.points[:-1]:
424  merged_cluster.mean = merged_cluster.points[0]
425  else:
426  for index in range(dimension):
427  merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
428 
429  temporary = list()
430 
431  for index in range(self.__number_represent_points):
432  maximal_distance = 0
433  maximal_point = None
434 
435  for point in merged_cluster.points:
436  minimal_distance = 0
437  if index == 0:
438  minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
439  #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
440  else:
441  minimal_distance = min([euclidean_distance_square(point, p) for p in temporary])
442  #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
443 
444  if minimal_distance >= maximal_distance:
445  maximal_distance = minimal_distance
446  maximal_point = point
447 
448  if maximal_point not in temporary:
449  temporary.append(maximal_point)
450 
451  for point in temporary:
452  representative_point = [0] * dimension
453  for index in range(dimension):
454  representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index])
455 
456  merged_cluster.rep.append(representative_point)
457 
458  return merged_cluster
459 
460 
461  def __create_queue(self):
462  """!
463  @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
464 
465  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
466 
467  @return (list) Create queue of sorted clusters by distance between them.
468 
469  """
470 
471  self.__queue = [cure_cluster(self.__pointer_data[index_point], index_point) for index_point in range(len(self.__pointer_data))]
472 
473  # set closest clusters
474  for i in range(0, len(self.__queue)):
475  minimal_distance = float('inf')
476  closest_index_cluster = -1
477 
478  for k in range(0, len(self.__queue)):
479  if i != k:
480  dist = self.__cluster_distance(self.__queue[i], self.__queue[k])
481  if dist < minimal_distance:
482  minimal_distance = dist
483  closest_index_cluster = k
484 
485  self.__queue[i].closest = self.__queue[closest_index_cluster]
486  self.__queue[i].distance = minimal_distance
487 
488  # sort clusters
489  self.__queue.sort(key=lambda x: x.distance, reverse = False)
490 
491 
492  def __create_kdtree(self):
493  """!
494  @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
495 
496  @return (kdtree) k-d tree that consist of representative points of CURE clusters.
497 
498  """
499 
500  representatives, payloads = [], []
501  for current_cluster in self.__queue:
502  for representative_point in current_cluster.rep:
503  representatives.append(representative_point)
504  payloads.append(current_cluster)
505 
506  # initialize it using constructor to have balanced tree at the beginning to ensure the highest performance
507  # when we have the biggest amount of nodes in the tree.
508  self.__tree = kdtree(representatives, payloads)
509 
510 
511  def __cluster_distance(self, cluster1, cluster2):
512  """!
513  @brief Calculate minimal distance between clusters using representative points.
514 
515  @param[in] cluster1 (cure_cluster): The first cluster.
516  @param[in] cluster2 (cure_cluster): The second cluster.
517 
518  @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
519 
520  """
521 
522  distance = float('inf')
523  for i in range(0, len(cluster1.rep)):
524  for k in range(0, len(cluster2.rep)):
525  dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k]) # Fast mode
526  if dist < distance:
527  distance = dist
528 
529  return distance
pyclustering.cluster.cure.cure
Class represents clustering algorithm CURE with KD-tree optimization.
Definition: cure.py:74
pyclustering.container.kdtree
Data Structure: KD-Tree.
Definition: kdtree.py:1
pyclustering.cluster.cure.cure.get_representors
def get_representors(self)
Returns list of point-representors of each cluster.
Definition: cure.py:248
pyclustering.cluster.cure.cure.__delete_represented_points
def __delete_represented_points(self, cluster)
Remove representation points of clusters from the k-d tree.
Definition: cure.py:392
pyclustering.cluster.cure.cure.__queue
__queue
Definition: cure.py:471
pyclustering.cluster.cure.cure.__compression
__compression
Definition: cure.py:122
pyclustering.cluster.cure.cure_cluster.points
points
List of points that make up cluster.
Definition: cure.py:43
pyclustering.cluster.cure.cure.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: cure.py:278
pyclustering.cluster.cure.cure.__init__
def __init__(self, data, number_cluster, number_represent_points=5, compression=0.5, ccore=True)
Constructor of clustering algorithm CURE.
Definition: cure.py:102
pyclustering.cluster.cure.cure_cluster.__init__
def __init__(self, point, index)
Constructor of CURE cluster.
Definition: cure.py:33
pyclustering.container.kdtree.kdtree
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
Definition: kdtree.py:503
pyclustering.cluster.cure.cure.__prepare_data_points
def __prepare_data_points(self, sample)
Prepare data points for clustering.
Definition: cure.py:291
pyclustering.cluster.cure.cure.__pointer_data
__pointer_data
Definition: cure.py:114
pyclustering.cluster.cure.cure_cluster.mean
mean
Mean of points that make up cluster.
Definition: cure.py:49
pyclustering.cluster.cure.cure.__representors
__representors
Definition: cure.py:117
pyclustering.cluster.cure.cure_cluster.__repr__
def __repr__(self)
Displays distance to closest cluster and points that are contained by current cluster.
Definition: cure.py:66
pyclustering.cluster.cure.cure.__validate_arguments
def __validate_arguments(self)
Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception ...
Definition: cure.py:305
pyclustering.cluster.cure.cure.__process_by_python
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: cure.py:165
pyclustering.cluster.cure.cure.__closest_cluster
def __closest_cluster(self, cluster, distance)
Find closest cluster to the specified cluster in line with distance.
Definition: cure.py:353
pyclustering.cluster.cure.cure.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
Definition: cure.py:233
pyclustering.cluster.cure.cure.process
def process(self)
Performs cluster analysis in line with rules of CURE algorithm.
Definition: cure.py:131
pyclustering.cluster.cure.cure.__means
__means
Definition: cure.py:118
pyclustering.cluster.cure.cure_cluster.indexes
indexes
Point indexes in dataset.
Definition: cure.py:46
pyclustering.cluster.cure.cure.__merge_clusters
def __merge_clusters(self, cluster1, cluster2)
Merges two clusters and returns new merged cluster.
Definition: cure.py:404
pyclustering.cluster.cure.cure.__tree
__tree
Definition: cure.py:508
pyclustering.cluster.cure.cure.__clusters
__clusters
Definition: cure.py:116
pyclustering.cluster.cure.cure.__process_by_ccore
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: cure.py:150
pyclustering.cluster.cure.cure_cluster.distance
distance
Distance to the closest cluster.
Definition: cure.py:64
pyclustering.cluster.cure.cure.__cluster_distance
def __cluster_distance(self, cluster1, cluster2)
Calculate minimal distance between clusters using representative points.
Definition: cure.py:511
pyclustering.cluster.cure.cure.__create_queue
def __create_queue(self)
Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbo...
Definition: cure.py:461
pyclustering.cluster.cure.cure_cluster
Represents data cluster in CURE term.
Definition: cure.py:26
pyclustering.cluster.cure.cure.__insert_cluster
def __insert_cluster(self, cluster)
Insert cluster to the list (sorted queue) in line with sequence order (distance).
Definition: cure.py:325
pyclustering.cluster.cure.cure.__relocate_cluster
def __relocate_cluster(self, cluster)
Relocate cluster in list in line with distance order.
Definition: cure.py:341
pyclustering.cluster.cure.cure.__insert_represented_points
def __insert_represented_points(self, cluster)
Insert representation points to the k-d tree.
Definition: cure.py:380
pyclustering.cluster.cure.cure.__number_cluster
__number_cluster
Definition: cure.py:120
pyclustering.cluster.cure.cure.get_means
def get_means(self)
Returns list of mean values of each cluster.
Definition: cure.py:263
pyclustering.cluster.cure.cure.__ccore
__ccore
Definition: cure.py:124
pyclustering.utils
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
pyclustering.cluster.cure.cure_cluster.closest
closest
Pointer to the closest cluster.
Definition: cure.py:61
pyclustering.cluster.cure.cure_cluster.rep
rep
List of points that represents clusters.
Definition: cure.py:52
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.cluster.cure.cure.__number_represent_points
__number_represent_points
Definition: cure.py:121
pyclustering.cluster.cure.cure.__create_kdtree
def __create_kdtree(self)
Create k-d tree in line with created clusters.
Definition: cure.py:492