cure.py
1 """!
2 
3 @brief Cluster analysis algorithm: CURE
4 @details Implementation based on paper @cite article::cure::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import euclidean_distance_square
33 
34 from pyclustering.container.kdtree import kdtree
35 
36 from pyclustering.core.wrapper import ccore_library
37 
38 import pyclustering.core.cure_wrapper as wrapper
39 
40 
42  """!
43  @brief Represents data cluster in CURE term.
44  @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
45 
46  """
47 
48  def __init__(self, point, index):
49  """!
50  @brief Constructor of CURE cluster.
51 
52  @param[in] point (list): Point represented by list of coordinates.
53  @param[in] index (uint): Index point in dataset.
54 
55  """
56 
57 
58  self.points = [ ]
59 
60 
61  self.indexes = -1
62 
63 
64  self.mean = None
65 
66 
67  self.rep = [ ]
68 
69  if point is not None:
70  self.points = [ point ]
71  self.indexes = [ index ]
72  self.mean = point
73  self.rep = [ point ]
74 
75 
76  self.closest = None
77 
78 
79  self.distance = float('inf') # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
80 
81  def __repr__(self):
82  """!
83  @brief Displays distance to closest cluster and points that are contained by current cluster.
84 
85  """
86  return "%s, %s" % (self.distance, self.points)
87 
88 
89 class cure:
90  """!
91  @brief Class represents clustering algorithm CURE with KD-tree optimization.
92  @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
93 
94  Example:
95  @code
96  # read data for clustering from some file
97  sample = read_sample(path_to_data);
98 
99  # create instance of cure algorithm for cluster analysis
100  # request for allocation of two clusters.
101  cure_instance = cure(sample, 2, 5, 0.5, True);
102 
103  # run cluster analysis
104  cure_instance.process();
105 
106  # get results of clustering
107  clusters = cure_instance.get_clusters();
108  @endcode
109 
110  """
111 
112  def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
113  """!
114  @brief Constructor of clustering algorithm CURE.
115 
116  @param[in] data (array_like): Input data that should be processed.
117  @param[in] number_cluster (uint): Number of clusters that should be allocated.
118  @param[in] number_represent_points (uint): Number of representative points for each cluster.
119  @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
120  @param[in] ccore (bool): If True then CCORE (C++ solution) will be used for solving.
121 
122  """
123 
124  self.__pointer_data = self.__prepare_data_points(data)
125 
126  self.__clusters = None
127  self.__representors = None
128  self.__means = None
129 
130  self.__number_cluster = number_cluster
131  self.__number_represent_points = number_represent_points
132  self.__compression = compression
133 
134  self.__ccore = ccore
135  if self.__ccore:
136  self.__ccore = ccore_library.workable()
137 
138  self.__validate_arguments()
139 
140 
141  def process(self):
142  """!
143  @brief Performs cluster analysis in line with rules of CURE algorithm.
144 
145  @remark Results of clustering can be obtained using corresponding get methods.
146 
147  @see get_clusters()
148 
149  """
150 
151  if self.__ccore is True:
152  self.__process_by_ccore()
153 
154  else:
155  self.__process_by_python()
156 
157 
158  def __process_by_ccore(self):
159  """!
160  @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
161 
162  """
163  cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster,
165 
166  self.__clusters = wrapper.cure_get_clusters(cure_data_pointer)
167  self.__representors = wrapper.cure_get_representors(cure_data_pointer)
168  self.__means = wrapper.cure_get_means(cure_data_pointer)
169 
170  wrapper.cure_data_destroy(cure_data_pointer)
171 
172 
173  def __process_by_python(self):
174  """!
175  @brief Performs cluster analysis using python code.
176 
177  """
178  self.__create_queue() # queue
179  self.__create_kdtree() # create k-d tree
180 
181  while len(self.__queue) > self.__number_cluster:
182  cluster1 = self.__queue[0] # cluster that has nearest neighbor.
183  cluster2 = cluster1.closest # closest cluster.
184 
185  self.__queue.remove(cluster1)
186  self.__queue.remove(cluster2)
187 
188  self.__delete_represented_points(cluster1)
189  self.__delete_represented_points(cluster2)
190 
191  merged_cluster = self.__merge_clusters(cluster1, cluster2)
192 
193  self.__insert_represented_points(merged_cluster)
194 
195  # Pointers to clusters that should be relocated is stored here.
196  cluster_relocation_requests = []
197 
198  # Check for the last cluster
199  if len(self.__queue) > 0:
200  merged_cluster.closest = self.__queue[0] # arbitrary cluster from queue
201  merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest)
202 
203  for item in self.__queue:
204  distance = self.__cluster_distance(merged_cluster, item)
205  # Check if distance between new cluster and current is the best than now.
206  if distance < merged_cluster.distance:
207  merged_cluster.closest = item
208  merged_cluster.distance = distance
209 
210  # Check if current cluster has removed neighbor.
211  if (item.closest is cluster1) or (item.closest is cluster2):
212  # If previous distance was less then distance to new cluster then nearest cluster should be found in the tree.
213  # print("Update: ", item);
214  if item.distance < distance:
215  (item.closest, item.distance) = self.__closest_cluster(item, distance)
216 
217  # TODO: investigation of root cause is required.
218  # Itself and merged cluster should be always in list of neighbors in line with specified radius.
219  # But merged cluster may not be in list due to error calculation, therefore it should be added
220  # manually.
221  if item.closest is None:
222  item.closest = merged_cluster
223  item.distance = distance
224 
225  # Otherwise new cluster is nearest.
226  else:
227  item.closest = merged_cluster
228  item.distance = distance
229 
230  cluster_relocation_requests.append(item)
231  elif item.distance > distance:
232  item.closest = merged_cluster
233  item.distance = distance
234 
235  cluster_relocation_requests.append(item)
236 
237  # New cluster and updated clusters should relocated in queue
238  self.__insert_cluster(merged_cluster)
239  for item in cluster_relocation_requests:
240  self.__relocate_cluster(item)
241 
242  # Change cluster representation
243  self.__clusters = [cure_cluster_unit.indexes for cure_cluster_unit in self.__queue]
244  self.__representors = [cure_cluster_unit.rep for cure_cluster_unit in self.__queue]
245  self.__means = [cure_cluster_unit.mean for cure_cluster_unit in self.__queue]
246 
247 
248  def get_clusters(self):
249  """!
250  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
251 
252  @return (list) List of allocated clusters.
253 
254  @see process()
255  @see get_representors()
256  @see get_means()
257 
258  """
259 
260  return self.__clusters
261 
262 
263  def get_representors(self):
264  """!
265  @brief Returns list of point-representors of each cluster.
266  @details Cluster index should be used for navigation between lists of point-representors.
267 
268  @return (list) List of point-representors of each cluster.
269 
270  @see get_clusters()
271  @see get_means()
272 
273  """
274 
275  return self.__representors
276 
277 
278  def get_means(self):
279  """!
280  @brief Returns list of mean values of each cluster.
281  @details Cluster index should be used for navigation between mean values.
282 
283  @return (list) List of mean values of each cluster.
284 
285  @see get_clusters()
286  @see get_representors()
287 
288  """
289 
290  return self.__means
291 
292 
294  """!
295  @brief Returns clustering result representation type that indicate how clusters are encoded.
296 
297  @return (type_encoding) Clustering result representation.
298 
299  @see get_clusters()
300 
301  """
302 
303  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
304 
305 
306  def __prepare_data_points(self, sample):
307  """!
308  @brief Prepare data points for clustering.
309  @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__.
310 
311  @return (list) Returns sample in list format.
312 
313  """
314  if isinstance(sample, numpy.ndarray):
315  return sample.tolist()
316 
317  return sample
318 
319 
320  def __validate_arguments(self):
321  """!
322  @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception
323  is thrown.
324 
325  """
326 
327  if len(self.__pointer_data) == 0:
328  raise ValueError("Empty input data. Data should contain at least one point.")
329 
330  if self.__number_cluster <= 0:
331  raise ValueError("Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.__number_cluster)
332 
333  if self.__compression < 0:
334  raise ValueError("Incorrect compression level '%f'. Compression should not be negative." % self.__compression)
335 
336  if self.__number_represent_points <= 0:
337  raise ValueError("Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.__number_cluster)
338 
339 
340  def __insert_cluster(self, cluster):
341  """!
342  @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
343 
344  @param[in] cluster (cure_cluster): Cluster that should be inserted.
345 
346  """
347 
348  for index in range(len(self.__queue)):
349  if cluster.distance < self.__queue[index].distance:
350  self.__queue.insert(index, cluster)
351  return
352 
353  self.__queue.append(cluster)
354 
355 
356  def __relocate_cluster(self, cluster):
357  """!
358  @brief Relocate cluster in list in line with distance order.
359 
360  @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
361 
362  """
363 
364  self.__queue.remove(cluster)
365  self.__insert_cluster(cluster)
366 
367 
368  def __closest_cluster(self, cluster, distance):
369  """!
370  @brief Find closest cluster to the specified cluster in line with distance.
371 
372  @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
373  @param[in] distance (double): Closest distance to the previous cluster.
374 
375  @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
376 
377  """
378 
379  nearest_cluster = None
380  nearest_distance = float('inf')
381 
382  real_euclidean_distance = distance ** 0.5
383 
384  for point in cluster.rep:
385  # Nearest nodes should be returned (at least it will return itself).
386  nearest_nodes = self.__tree.find_nearest_dist_nodes(point, real_euclidean_distance)
387  for (candidate_distance, kdtree_node) in nearest_nodes:
388  if (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster):
389  nearest_distance = candidate_distance
390  nearest_cluster = kdtree_node.payload
391 
392  return (nearest_cluster, nearest_distance)
393 
394 
395  def __insert_represented_points(self, cluster):
396  """!
397  @brief Insert representation points to the k-d tree.
398 
399  @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
400 
401  """
402 
403  for point in cluster.rep:
404  self.__tree.insert(point, cluster)
405 
406 
407  def __delete_represented_points(self, cluster):
408  """!
409  @brief Remove representation points of clusters from the k-d tree
410 
411  @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
412 
413  """
414 
415  for point in cluster.rep:
416  self.__tree.remove(point, payload=cluster)
417 
418 
419  def __merge_clusters(self, cluster1, cluster2):
420  """!
421  @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
422 
423  @param[in] cluster1 (cure_cluster): Cluster that should be merged.
424  @param[in] cluster2 (cure_cluster): Cluster that should be merged.
425 
426  @return (cure_cluster) New merged CURE cluster.
427 
428  """
429 
430  merged_cluster = cure_cluster(None, None)
431 
432  merged_cluster.points = cluster1.points + cluster2.points
433  merged_cluster.indexes = cluster1.indexes + cluster2.indexes
434 
435  # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
436  dimension = len(cluster1.mean)
437  merged_cluster.mean = [0] * dimension
438  if merged_cluster.points[1:] == merged_cluster.points[:-1]:
439  merged_cluster.mean = merged_cluster.points[0]
440  else:
441  for index in range(dimension):
442  merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
443 
444  temporary = list()
445 
446  for index in range(self.__number_represent_points):
447  maximal_distance = 0
448  maximal_point = None
449 
450  for point in merged_cluster.points:
451  minimal_distance = 0
452  if index == 0:
453  minimal_distance = euclidean_distance_square(point, merged_cluster.mean)
454  #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
455  else:
456  minimal_distance = min([euclidean_distance_square(point, p) for p in temporary])
457  #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
458 
459  if minimal_distance >= maximal_distance:
460  maximal_distance = minimal_distance
461  maximal_point = point
462 
463  if maximal_point not in temporary:
464  temporary.append(maximal_point)
465 
466  for point in temporary:
467  representative_point = [0] * dimension
468  for index in range(dimension):
469  representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index])
470 
471  merged_cluster.rep.append(representative_point)
472 
473  return merged_cluster
474 
475 
476  def __create_queue(self):
477  """!
478  @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
479 
480  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
481 
482  @return (list) Create queue of sorted clusters by distance between them.
483 
484  """
485 
486  self.__queue = [cure_cluster(self.__pointer_data[index_point], index_point) for index_point in range(len(self.__pointer_data))]
487 
488  # set closest clusters
489  for i in range(0, len(self.__queue)):
490  minimal_distance = float('inf')
491  closest_index_cluster = -1
492 
493  for k in range(0, len(self.__queue)):
494  if i != k:
495  dist = self.__cluster_distance(self.__queue[i], self.__queue[k])
496  if dist < minimal_distance:
497  minimal_distance = dist
498  closest_index_cluster = k
499 
500  self.__queue[i].closest = self.__queue[closest_index_cluster]
501  self.__queue[i].distance = minimal_distance
502 
503  # sort clusters
504  self.__queue.sort(key = lambda x: x.distance, reverse = False)
505 
506 
507  def __create_kdtree(self):
508  """!
509  @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
510 
511  @return (kdtree) k-d tree that consist of representative points of CURE clusters.
512 
513  """
514 
515  self.__tree = kdtree()
516  for current_cluster in self.__queue:
517  for representative_point in current_cluster.rep:
518  self.__tree.insert(representative_point, current_cluster)
519 
520 
521  def __cluster_distance(self, cluster1, cluster2):
522  """!
523  @brief Calculate minimal distance between clusters using representative points.
524 
525  @param[in] cluster1 (cure_cluster): The first cluster.
526  @param[in] cluster2 (cure_cluster): The second cluster.
527 
528  @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
529 
530  """
531 
532  distance = float('inf')
533  for i in range(0, len(cluster1.rep)):
534  for k in range(0, len(cluster2.rep)):
535  dist = euclidean_distance_square(cluster1.rep[i], cluster2.rep[k]); # Fast mode
536  #dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]) # Slow mode
537  if dist < distance:
538  distance = dist
539 
540  return distance
def __insert_represented_points(self, cluster)
Insert representation points to the k-d tree.
Definition: cure.py:395
Class represents clustering algorithm CURE with KD-tree optimization.
Definition: cure.py:89
Represents data cluster in CURE term.
Definition: cure.py:41
rep
List of points that represents clusters.
Definition: cure.py:67
def __validate_arguments(self)
Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception ...
Definition: cure.py:320
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: cure.py:158
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def get_means(self)
Returns list of mean values of each cluster.
Definition: cure.py:278
Module for representing clustering results.
Definition: encoder.py:1
indexes
Point indexes in dataset.
Definition: cure.py:61
def __init__(self, point, index)
Constructor of CURE cluster.
Definition: cure.py:48
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
Definition: kdtree.py:157
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: cure.py:173
def __delete_represented_points(self, cluster)
Remove representation points of clusters from the k-d tree.
Definition: cure.py:407
points
List of points that make up cluster.
Definition: cure.py:58
closest
Pointer to the closest cluster.
Definition: cure.py:76
def get_representors(self)
Returns list of point-representors of each cluster.
Definition: cure.py:263
distance
Distance to the closest cluster.
Definition: cure.py:79
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: cure.py:248
def __create_kdtree(self)
Create k-d tree in line with created clusters.
Definition: cure.py:507
def process(self)
Performs cluster analysis in line with rules of CURE algorithm.
Definition: cure.py:141
def __insert_cluster(self, cluster)
Insert cluster to the list (sorted queue) in line with sequence order (distance). ...
Definition: cure.py:340
def __create_queue(self)
Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbo...
Definition: cure.py:476
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: cure.py:293
mean
Mean of points that make up cluster.
Definition: cure.py:64
def __cluster_distance(self, cluster1, cluster2)
Calculate minimal distance between clusters using representative points.
Definition: cure.py:521
def __closest_cluster(self, cluster, distance)
Find closest cluster to the specified cluster in line with distance.
Definition: cure.py:368
def __relocate_cluster(self, cluster)
Relocate cluster in list in line with distance order.
Definition: cure.py:356
def __merge_clusters(self, cluster1, cluster2)
Merges two clusters and returns new merged cluster.
Definition: cure.py:419
def __repr__(self)
Displays distance to closest cluster and points that are contained by current cluster.
Definition: cure.py:81
def __init__(self, data, number_cluster, number_represent_points=5, compression=0.5, ccore=True)
Constructor of clustering algorithm CURE.
Definition: cure.py:112
def __prepare_data_points(self, sample)
Prepare data points for clustering.
Definition: cure.py:306
Data Structure: KD-Tree.
Definition: kdtree.py:1