cure.py
1 """!
2 
3 @brief Cluster analysis algorithm: CURE
4 @details Implementation based on paper @cite article::cure::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import euclidean_distance_square
33 
34 from pyclustering.container.kdtree import kdtree
35 
36 from pyclustering.core.wrapper import ccore_library
37 
38 import pyclustering.core.cure_wrapper as wrapper
39 
40 
42  """!
43  @brief Represents data cluster in CURE term.
44  @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
45 
46  """
47 
48  def __init__(self, point, index):
49  """!
50  @brief Constructor of CURE cluster.
51 
52  @param[in] point (list): Point represented by list of coordinates.
53  @param[in] index (uint): Index point in dataset.
54 
55  """
56 
57 
58  self.points = [ ]
59 
60 
61  self.indexes = -1
62 
63 
64  self.mean = None
65 
66 
67  self.rep = [ ]
68 
69  if point is not None:
70  self.points = [ point ]
71  self.indexes = [ index ]
72  self.mean = point
73  self.rep = [ point ]
74 
75 
76  self.closest = None
77 
78 
79  self.distance = float('inf') # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
80 
81  def __repr__(self):
82  """!
83  @brief Displays distance to closest cluster and points that are contained by current cluster.
84 
85  """
86  return "%s, %s" % (self.distance, self.points)
87 
88 
89 class cure:
90  """!
91  @brief Class represents clustering algorithm CURE with KD-tree optimization.
92  @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
93 
94  Here is an example how to perform cluster analysis of sample 'Lsun':
95  @code
96  from pyclustering.cluster import cluster_visualizer;
97  from pyclustering.cluster.cure import cure;
98  from pyclustering.utils import read_sample;
99  from pyclustering.samples.definitions import FCPS_SAMPLES;
100 
101  # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ].
102  input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
103 
104  # Allocate three clusters.
105  cure_instance = cure(input_data, 3);
106  cure_instance.process();
107  clusters = cure_instance.get_clusters();
108 
109  # Visualize allocated clusters.
110  visualizer = cluster_visualizer();
111  visualizer.append_clusters(clusters, input_data);
112  visualizer.show();
113  @endcode
114 
115  """
116 
117  def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
118  """!
119  @brief Constructor of clustering algorithm CURE.
120 
121  @param[in] data (array_like): Input data that should be processed.
122  @param[in] number_cluster (uint): Number of clusters that should be allocated.
123  @param[in] number_represent_points (uint): Number of representative points for each cluster.
124  @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
125  @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving.
126 
127  """
128 
129  self.__pointer_data = self.__prepare_data_points(data)
130 
131  self.__clusters = None
132  self.__representors = None
133  self.__means = None
134 
135  self.__number_cluster = number_cluster
136  self.__number_represent_points = number_represent_points
137  self.__compression = compression
138 
139  self.__ccore = ccore
140  if self.__ccore:
141  self.__ccore = ccore_library.workable()
142 
143  self.__validate_arguments()
144 
145 
146  def process(self):
147  """!
148  @brief Performs cluster analysis in line with rules of CURE algorithm.
149 
150  @remark Results of clustering can be obtained using corresponding get methods.
151 
152  @see get_clusters()
153 
154  """
155 
156  if self.__ccore is True:
157  self.__process_by_ccore()
158 
159  else:
160  self.__process_by_python()
161 
162 
163  def __process_by_ccore(self):
164  """!
165  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
166 
167  """
168  cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster,
170 
171  self.__clusters = wrapper.cure_get_clusters(cure_data_pointer)
172  self.__representors = wrapper.cure_get_representors(cure_data_pointer)
173  self.__means = wrapper.cure_get_means(cure_data_pointer)
174 
175  wrapper.cure_data_destroy(cure_data_pointer)
176 
177 
178  def __process_by_python(self):
179  """!
180  @brief Performs cluster analysis using python code.
181 
182  """
183  self.__create_queue() # queue
184  self.__create_kdtree() # create k-d tree
185 
186  while len(self.__queue) > self.__number_cluster:
187  cluster1 = self.__queue[0] # cluster that has nearest neighbor.
188  cluster2 = cluster1.closest # closest cluster.
189 
190  self.__queue.remove(cluster1)
191  self.__queue.remove(cluster2)
192 
193  self.__delete_represented_points(cluster1)
194  self.__delete_represented_points(cluster2)
195 
196  merged_cluster = self.__merge_clusters(cluster1, cluster2)
197 
198  self.__insert_represented_points(merged_cluster)
199 
200  # Pointers to clusters that should be relocated is stored here.
201  cluster_relocation_requests = []
202 
203  # Check for the last cluster
204  if len(self.__queue) > 0:
205  merged_cluster.closest = self.__queue[0] # arbitrary cluster from queue
206  merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest)
207 
208  for item in self.__queue:
209  distance = self.__cluster_distance(merged_cluster, item)
210  # Check if distance between new cluster and current is the best than now.
211  if distance < merged_cluster.distance:
212  merged_cluster.closest = item
213  merged_cluster.distance = distance
214 
215  # Check if current cluster has removed neighbor.
216  if (item.closest is cluster1) or (item.closest is cluster2):
217  # If previous distance was less then distance to new cluster then nearest cluster should
218  # be found in the tree.
219  if item.distance < distance:
220  (item.closest, item.distance) = self.__closest_cluster(item, distance)
221 
222  # TODO: investigation is required. There is assumption that itself and merged cluster
223  # should be always in list of neighbors in line with specified radius. But merged cluster
224  # may not be in list due to error calculation, therefore it should be added manually.
225  if item.closest is None:
226  item.closest = merged_cluster
227  item.distance = distance
228 
229  else:
230  item.closest = merged_cluster
231  item.distance = distance
232 
233  cluster_relocation_requests.append(item)
234 
235  # New cluster and updated clusters should relocated in queue
236  self.__insert_cluster(merged_cluster)
237  for item in cluster_relocation_requests:
238  self.__relocate_cluster(item)
239 
240  # Change cluster representation
241  self.__clusters = [cure_cluster_unit.indexes for cure_cluster_unit in self.__queue]
242  self.__representors = [cure_cluster_unit.rep for cure_cluster_unit in self.__queue]
243  self.__means = [cure_cluster_unit.mean for cure_cluster_unit in self.__queue]
244 
245 
246  def get_clusters(self):
247  """!
248  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
249 
250  @return (list) List of allocated clusters.
251 
252  @see process()
253  @see get_representors()
254  @see get_means()
255 
256  """
257 
258  return self.__clusters
259 
260 
261  def get_representors(self):
262  """!
263  @brief Returns list of point-representors of each cluster.
264  @details Cluster index should be used for navigation between lists of point-representors.
265 
266  @return (list) List of point-representors of each cluster.
267 
268  @see get_clusters()
269  @see get_means()
270 
271  """
272 
273  return self.__representors
274 
275 
276  def get_means(self):
277  """!
278  @brief Returns list of mean values of each cluster.
279  @details Cluster index should be used for navigation between mean values.
280 
281  @return (list) List of mean values of each cluster.
282 
283  @see get_clusters()
284  @see get_representors()
285 
286  """
287 
288  return self.__means
289 
290 
292  """!
293  @brief Returns clustering result representation type that indicate how clusters are encoded.
294 
295  @return (type_encoding) Clustering result representation.
296 
297  @see get_clusters()
298 
299  """
300 
301  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
302 
303 
304  def __prepare_data_points(self, sample):
305  """!
306  @brief Prepare data points for clustering.
307  @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__.
308 
309  @return (list) Returns sample in list format.
310 
311  """
312  if isinstance(sample, numpy.ndarray):
313  return sample.tolist()
314 
315  return sample
316 
317 
318  def __validate_arguments(self):
319  """!
320  @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception
321  is thrown.
322 
323  """
324 
325  if len(self.__pointer_data) == 0:
326  raise ValueError("Empty input data. Data should contain at least one point.")
327 
328  if self.__number_cluster <= 0:
329  raise ValueError("Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.__number_cluster)
330 
331  if self.__compression < 0:
332  raise ValueError("Incorrect compression level '%f'. Compression should not be negative." % self.__compression)
333 
334  if self.__number_represent_points <= 0:
335  raise ValueError("Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.__number_cluster)
336 
337 
338  def __insert_cluster(self, cluster):
339  """!
340  @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
341 
342  @param[in] cluster (cure_cluster): Cluster that should be inserted.
343 
344  """
345 
346  for index in range(len(self.__queue)):
347  if cluster.distance < self.__queue[index].distance:
348  self.__queue.insert(index, cluster)
349  return
350 
351  self.__queue.append(cluster)
352 
353 
354  def __relocate_cluster(self, cluster):
355  """!
356  @brief Relocate cluster in list in line with distance order.
357 
358  @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
359 
360  """
361 
362  self.__queue.remove(cluster)
363  self.__insert_cluster(cluster)
364 
365 
366  def __closest_cluster(self, cluster, distance):
367  """!
368  @brief Find closest cluster to the specified cluster in line with distance.
369 
370  @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
371  @param[in] distance (double): Closest distance to the previous cluster.
372 
373  @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
374 
375  """
376 
377  nearest_cluster = None
378  nearest_distance = float('inf')
379 
380  real_euclidean_distance = distance ** 0.5
381 
382  for point in cluster.rep:
383  # Nearest nodes should be returned (at least it will return itself).
384  nearest_nodes = self.__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
385  for (candidate_distance, kdtree_node) in nearest_nodes:
386  if (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster):
387  nearest_distance = candidate_distance
388  nearest_cluster = kdtree_node.payload
389 
390  return (nearest_cluster, nearest_distance)
391 
392 
393  def __insert_represented_points(self, cluster):
394  """!
395  @brief Insert representation points to the k-d tree.
396 
397  @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
398 
399  """
400 
401  for point in cluster.rep:
402  self.__tree.insert(point, cluster)
403 
404 
405  def __delete_represented_points(self, cluster):
406  """!
407  @brief Remove representation points of clusters from the k-d tree
408 
409  @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
410 
411  """
412 
413  for point in cluster.rep:
414  self.__tree.remove(point, payload=cluster)
415 
416 
417  def __merge_clusters(self, cluster1, cluster2):
418  """!
419  @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
420 
421  @param[in] cluster1 (cure_cluster): Cluster that should be merged.
422  @param[in] cluster2 (cure_cluster): Cluster that should be merged.
423 
424  @return (cure_cluster) New merged CURE cluster.
425 
426  """
427 
428  merged_cluster = cure_cluster(None, None)
429 
430  merged_cluster.points = cluster1.points + cluster2.points
431  merged_cluster.indexes = cluster1.indexes + cluster2.indexes
432 
433  # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
434  dimension = len(cluster1.mean)
435  merged_cluster.mean = [0] * dimension
436  if merged_cluster.points[1:] == merged_cluster.points[:-1]:
437  merged_cluster.mean = merged_cluster.points[0]
438  else:
439  for index in range(dimension):
440  merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
441 
442  temporary = list()
443 
444  for index in range(self.__number_represent_points):
445  maximal_distance = 0
446  maximal_point = None
447 
448  for point in merged_cluster.points:
449  minimal_distance = 0
450  if index == 0:
451  minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
452  #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
453  else:
454  minimal_distance = min([euclidean_distance_square(point, p) for p in temporary])
455  #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
456 
457  if minimal_distance >= maximal_distance:
458  maximal_distance = minimal_distance
459  maximal_point = point
460 
461  if maximal_point not in temporary:
462  temporary.append(maximal_point)
463 
464  for point in temporary:
465  representative_point = [0] * dimension
466  for index in range(dimension):
467  representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index])
468 
469  merged_cluster.rep.append(representative_point)
470 
471  return merged_cluster
472 
473 
474  def __create_queue(self):
475  """!
476  @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
477 
478  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
479 
480  @return (list) Create queue of sorted clusters by distance between them.
481 
482  """
483 
484  self.__queue = [cure_cluster(self.__pointer_data[index_point], index_point) for index_point in range(len(self.__pointer_data))]
485 
486  # set closest clusters
487  for i in range(0, len(self.__queue)):
488  minimal_distance = float('inf')
489  closest_index_cluster = -1
490 
491  for k in range(0, len(self.__queue)):
492  if i != k:
493  dist = self.__cluster_distance(self.__queue[i], self.__queue[k])
494  if dist < minimal_distance:
495  minimal_distance = dist
496  closest_index_cluster = k
497 
498  self.__queue[i].closest = self.__queue[closest_index_cluster]
499  self.__queue[i].distance = minimal_distance
500 
501  # sort clusters
502  self.__queue.sort(key = lambda x: x.distance, reverse = False)
503 
504 
505  def __create_kdtree(self):
506  """!
507  @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
508 
509  @return (kdtree) k-d tree that consist of representative points of CURE clusters.
510 
511  """
512 
513  self.__tree = kdtree()
514  for current_cluster in self.__queue:
515  for representative_point in current_cluster.rep:
516  self.__tree.insert(representative_point, current_cluster)
517 
518 
519  def __cluster_distance(self, cluster1, cluster2):
520  """!
521  @brief Calculate minimal distance between clusters using representative points.
522 
523  @param[in] cluster1 (cure_cluster): The first cluster.
524  @param[in] cluster2 (cure_cluster): The second cluster.
525 
526  @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
527 
528  """
529 
530  distance = float('inf')
531  for i in range(0, len(cluster1.rep)):
532  for k in range(0, len(cluster2.rep)):
533  dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k]); # Fast mode
534  #dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]) # Slow mode
535  if dist < distance:
536  distance = dist
537 
538  return distance
def __insert_represented_points(self, cluster)
Insert representation points to the k-d tree.
Definition: cure.py:393
Class represents clustering algorithm CURE with KD-tree optimization.
Definition: cure.py:89
Represents data cluster in CURE term.
Definition: cure.py:41
rep
List of points that represents clusters.
Definition: cure.py:67
def __validate_arguments(self)
Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception ...
Definition: cure.py:318
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: cure.py:163
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def get_means(self)
Returns list of mean values of each cluster.
Definition: cure.py:276
Module for representing clustering results.
Definition: encoder.py:1
indexes
Point indexes in dataset.
Definition: cure.py:61
def __init__(self, point, index)
Constructor of CURE cluster.
Definition: cure.py:48
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
Definition: kdtree.py:157
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: cure.py:178
def __delete_represented_points(self, cluster)
Remove representation points of clusters from the k-d tree.
Definition: cure.py:405
points
List of points that make up cluster.
Definition: cure.py:58
closest
Pointer to the closest cluster.
Definition: cure.py:76
def get_representors(self)
Returns list of point-representors of each cluster.
Definition: cure.py:261
distance
Distance to the closest cluster.
Definition: cure.py:79
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: cure.py:246
def __create_kdtree(self)
Create k-d tree in line with created clusters.
Definition: cure.py:505
def process(self)
Performs cluster analysis in line with rules of CURE algorithm.
Definition: cure.py:146
def __insert_cluster(self, cluster)
Insert cluster to the list (sorted queue) in line with sequence order (distance). ...
Definition: cure.py:338
def __create_queue(self)
Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbo...
Definition: cure.py:474
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: cure.py:291
mean
Mean of points that make up cluster.
Definition: cure.py:64
def __cluster_distance(self, cluster1, cluster2)
Calculate minimal distance between clusters using representative points.
Definition: cure.py:519
def __closest_cluster(self, cluster, distance)
Find closest cluster to the specified cluster in line with distance.
Definition: cure.py:366
def __relocate_cluster(self, cluster)
Relocate cluster in list in line with distance order.
Definition: cure.py:354
def __merge_clusters(self, cluster1, cluster2)
Merges two clusters and returns new merged cluster.
Definition: cure.py:417
def __repr__(self)
Displays distance to closest cluster and points that are contained by current cluster.
Definition: cure.py:81
def __init__(self, data, number_cluster, number_represent_points=5, compression=0.5, ccore=True)
Constructor of clustering algorithm CURE.
Definition: cure.py:117
def __prepare_data_points(self, sample)
Prepare data points for clustering.
Definition: cure.py:304
Data Structure: KD-Tree.
Definition: kdtree.py:1