pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
cftree.py
1 """!
2 
3 @brief Data Structure: CF-Tree
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 import numpy
13 
14 from copy import copy
15 
16 from pyclustering.utils import euclidean_distance_square
17 from pyclustering.utils import manhattan_distance
18 from pyclustering.utils import linear_sum, square_sum
19 
20 from enum import IntEnum
21 
22 
23 class measurement_type(IntEnum):
24  """!
25  @brief Enumeration of measurement types for CF-Tree.
26 
27  @see cftree
28 
29  """
30 
31 
32  CENTROID_EUCLIDEAN_DISTANCE = 0
33 
34 
35  CENTROID_MANHATTAN_DISTANCE = 1
36 
37 
38  AVERAGE_INTER_CLUSTER_DISTANCE = 2
39 
40 
41  AVERAGE_INTRA_CLUSTER_DISTANCE = 3
42 
43 
44  VARIANCE_INCREASE_DISTANCE = 4
45 
46 
47 class cfnode_type(IntEnum):
48  """!
49  @brief Enumeration of CF-Node types that are used by CF-Tree.
50 
51  @see cfnode
52  @see cftree
53 
54  """
55 
56 
57  CFNODE_DUMMY = 0
58 
59 
60  CFNODE_LEAF = 1
61 
62 
63  CFNODE_NONLEAF = 2
64 
65 
66 class cfentry:
67  """!
68  @brief Clustering feature representation.
69 
70  @see cfnode
71  @see cftree
72 
73  """
74 
75  @property
76  def number_points(self):
77  """!
78  @brief Returns number of points that are encoded.
79 
80  @return (uint) Number of encoded points.
81 
82  """
83  return self.__number_points
84 
85  @property
86  def linear_sum(self):
87  """!
88  @brief Returns linear sum.
89 
90  @return (list) Linear sum.
91 
92  """
93 
94  return self.__linear_sum
95 
96  @property
97  def square_sum(self):
98  """!
99  @brief Returns square sum.
100 
101  @return (double) Square sum.
102 
103  """
104  return self.__square_sum
105 
106 
107  def __init__(self, number_points, linear_sum, square_sum):
108  """!
109  @brief CF-entry constructor.
110 
111  @param[in] number_points (uint): Number of objects that is represented by the entry.
112  @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
113  @param[in] square_sum (double): Square sum of values that represent objects.
114 
115  """
116 
117  self.__number_points = number_points
118  self.__linear_sum = numpy.array(linear_sum)
119  self.__square_sum = square_sum
120 
121  self.__centroid = None
122  self.__radius = None
123  self.__diameter = None
124 
125 
126  def __copy__(self):
127  """!
128  @returns (cfentry) Makes copy of the CF-entry instance.
129 
130  """
131  return cfentry(self.__number_points, self.__linear_sum, self.__square_sum)
132 
133 
134  def __repr__(self):
135  """!
136  @return (string) Returns CF-entry representation.
137 
138  """
139  return "CF-Entry (N: '%s', LS: '%s', D: '%s')" % \
140  (self.number_points, self.linear_sum, str(self.get_diameter()))
141 
142 
143  def __str__(self):
144  """!
145  @brief Default cfentry string representation.
146 
147  """
148  return self.__repr__()
149 
150 
151  def __add__(self, entry):
152  """!
153  @brief Overloaded operator add. Performs addition of two clustering features.
154 
155  @param[in] entry (cfentry): Entry that is added to the current.
156 
157  @return (cfentry) Result of addition of two clustering features.
158 
159  """
160 
161  number_points = self.number_points + entry.number_points
162  result_linear_sum = numpy.add(self.linear_sum, entry.linear_sum)
163  result_square_sum = self.square_sum + entry.square_sum
164 
165  return cfentry(number_points, result_linear_sum, result_square_sum)
166 
167 
168  def __eq__(self, entry):
169  """!
170  @brief Overloaded operator eq.
171  @details Performs comparison of two clustering features.
172 
173  @param[in] entry (cfentry): Entry that is used for comparison with current.
174 
175  @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
176 
177  """
178 
179  tolerance = 0.00001
180 
181  result = (self.__number_points == entry.number_points)
182  result &= ((self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum))
183 
184  for index_dimension in range(0, len(self.linear_sum)):
185  result &= ((self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]))
186 
187  return result
188 
189 
190  def get_distance(self, entry, type_measurement):
191  """!
192  @brief Calculates distance between two clusters in line with measurement type.
193 
194  @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
195  Square root should be taken from the result for obtaining real euclidian distance between
196  entries.
197 
198  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
199  @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
200 
201  @return (double) Distance between two clusters.
202 
203  """
204 
205  if type_measurement is measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
206  return euclidean_distance_square(entry.get_centroid(), self.get_centroid())
207 
208  elif type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE:
209  return manhattan_distance(entry.get_centroid(), self.get_centroid())
210 
211  elif type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
212  return self.__get_average_inter_cluster_distance(entry)
213 
214  elif type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
215  return self.__get_average_intra_cluster_distance(entry)
216 
217  elif type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE:
218  return self.__get_variance_increase_distance(entry)
219 
220  else:
221  raise ValueError("Unsupported type of measurement '%s' is specified." % type_measurement)
222 
223 
224  def get_centroid(self):
225  """!
226  @brief Calculates centroid of cluster that is represented by the entry.
227  @details It's calculated once when it's requested after the last changes.
228 
229  @return (array_like) Centroid of cluster that is represented by the entry.
230 
231  """
232 
233  if self.__centroid is not None:
234  return self.__centroid
235 
236  self.__centroid = numpy.divide(self.linear_sum, self.number_points)
237  return self.__centroid
238 
239 
240  def get_radius(self):
241  """!
242  @brief Calculates radius of cluster that is represented by the entry.
243  @details It's calculated once when it's requested after the last changes.
244 
245  @return (double) Radius of cluster that is represented by the entry.
246 
247  """
248 
249  if self.__radius is not None:
250  return self.__radius
251 
252  N = self.number_points
253  centroid = self.get_centroid()
254 
255  radius_part_1 = self.square_sum
256  radius_part_2 = 2.0 * numpy.dot(self.linear_sum, centroid)
257  radius_part_3 = N * numpy.dot(centroid, centroid)
258 
259  self.__radius = ((1.0 / N) * (radius_part_1 - radius_part_2 + radius_part_3)) ** 0.5
260  return self.__radius
261 
262 
263  def get_diameter(self):
264  """!
265  @brief Calculates diameter of cluster that is represented by the entry.
266  @details It's calculated once when it's requested after the last changes.
267 
268  @return (double) Diameter of cluster that is represented by the entry.
269 
270  """
271 
272  if self.__diameter is not None:
273  return self.__diameter
274 
275  diameter_part = self.square_sum * self.number_points - 2.0 * numpy.dot(self.linear_sum, self.linear_sum) + self.square_sum * self.number_points
276 
277  if diameter_part < 0.000000001:
278  self.__diameter = 0.0
279  else:
280  self.__diameter = (diameter_part / (self.number_points * (self.number_points - 1))) ** 0.5
281 
282  return self.__diameter
283 
284 
285  def __get_average_inter_cluster_distance(self, entry):
286  """!
287  @brief Calculates average inter cluster distance between current and specified clusters.
288 
289  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
290 
291  @return (double) Average inter cluster distance.
292 
293  """
294 
295  linear_part_distance = numpy.dot(self.linear_sum, entry.linear_sum)
296 
297  return ((entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points)) ** 0.5
298 
299 
300  def __get_average_intra_cluster_distance(self, entry):
301  """!
302  @brief Calculates average intra cluster distance between current and specified clusters.
303 
304  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
305 
306  @return (double) Average intra cluster distance.
307 
308  """
309 
310  linear_part_first = numpy.add(self.linear_sum, entry.linear_sum)
311  linear_part_second = linear_part_first
312 
313  linear_part_distance = numpy.dot(linear_part_first, linear_part_second)
314 
315  general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance
316 
317  return (general_part_distance / ((self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0))) ** 0.5
318 
319 
320  def __get_variance_increase_distance(self, entry):
321  """!
322  @brief Calculates variance increase distance between current and specified clusters.
323 
324  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
325 
326  @return (double) Variance increase distance.
327 
328  """
329 
330  linear_part_12 = numpy.add(self.linear_sum, entry.linear_sum)
331  variance_part_first = (self.square_sum + entry.square_sum) - \
332  2.0 * numpy.dot(linear_part_12, linear_part_12) / (self.number_points + entry.number_points) + \
333  (self.number_points + entry.number_points) * numpy.dot(linear_part_12, linear_part_12) / (self.number_points + entry.number_points)**2.0
334 
335  linear_part_11 = numpy.dot(self.linear_sum, self.linear_sum)
336  variance_part_second = -(self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points))
337 
338  linear_part_22 = numpy.dot(entry.linear_sum, entry.linear_sum)
339  variance_part_third = -(entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22)
340 
341  return variance_part_first + variance_part_second + variance_part_third
342 
343 
344 class cfnode:
345  """!
346  @brief Representation of node of CF-Tree.
347 
348  """
349 
350  def __init__(self, feature, parent):
351  """!
352  @brief Constructor of abstract CF node.
353 
354  @param[in] feature (cfentry): Clustering feature of the created node.
355  @param[in] parent (cfnode): Parent of the created node.
356 
357  """
358 
359 
360  self.feature = copy(feature)
361 
362 
363  self.parent = parent
364 
365 
366  self.type = cfnode_type.CFNODE_DUMMY
367 
368 
369  def __repr__(self):
370  """!
371  @return (string) Default representation of CF node.
372 
373  """
374 
375  return 'CF node %s, parent %s, feature %s' % (hex(id(self)), self.parent, self.feature)
376 
377 
378  def __str__(self):
379  """!
380  @return (string) String representation of CF node.
381 
382  """
383  return self.__repr__()
384 
385 
386  def get_distance(self, node, type_measurement):
387  """!
388  @brief Calculates distance between nodes in line with specified type measurement.
389 
390  @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
391  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
392 
393  @return (double) Distance between two nodes.
394 
395  """
396 
397  return self.feature.get_distance(node.feature, type_measurement)
398 
399 
401  """!
402  @brief Representation of clustering feature non-leaf node.
403 
404  """
405 
406  @property
407  def successors(self):
408  """!
409  @return (list) List of successors of the node.
410 
411  """
412  return self.__successors
413 
414 
415  def __init__(self, feature, parent, successors):
416  """!
417  @brief Create CF Non-leaf node.
418 
419  @param[in] feature (cfentry): Clustering feature of the created node.
420  @param[in] parent (non_leaf_node): Parent of the created node.
421  @param[in] successors (list): List of successors of the node.
422 
423  """
424 
425  super().__init__(feature, parent)
426 
427 
428  self.type = cfnode_type.CFNODE_NONLEAF
429 
430  self.__successors = successors
431 
432 
433  def __repr__(self):
434  """!
435  @return (string) Representation of non-leaf node representation.
436 
437  """
438  return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % (hex(id(self)), self.parent, self.feature, len(self.successors))
439 
440 
441  def __str__(self):
442  """!
443  @return (string) String non-leaf representation.
444 
445  """
446  return self.__repr__()
447 
448 
449  def insert_successor(self, successor):
450  """!
451  @brief Insert successor to the node.
452 
453  @param[in] successor (cfnode): Successor for adding.
454 
455  """
456 
457  self.feature += successor.feature
458  self.successors.append(successor)
459 
460  successor.parent = self
461 
462 
463  def merge(self, node):
464  """!
465  @brief Merge non-leaf node to the current.
466 
467  @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
468 
469  """
470 
471  self.feature += node.feature
472 
473  for child in node.successors:
474  child.parent = self
475  self.successors.append(child)
476 
477 
478  def get_farthest_successors(self, type_measurement):
479  """!
480  @brief Find pair of farthest successors of the node in line with measurement type.
481 
482  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
483 
484  @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
485 
486  """
487 
488  farthest_node1 = None
489  farthest_node2 = None
490  farthest_distance = 0.0
491 
492  for i in range(0, len(self.successors)):
493  candidate1 = self.successors[i]
494 
495  for j in range(i + 1, len(self.successors)):
496  candidate2 = self.successors[j]
497  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
498 
499  if candidate_distance > farthest_distance:
500  farthest_distance = candidate_distance
501  farthest_node1 = candidate1
502  farthest_node2 = candidate2
503 
504  return [farthest_node1, farthest_node2]
505 
506 
507  def get_nearest_successors(self, type_measurement):
508  """!
509  @brief Find pair of nearest successors of the node in line with measurement type.
510 
511  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
512 
513  @return (list) Pair of nearest successors represented by list.
514 
515  """
516 
517  nearest_node1 = None
518  nearest_node2 = None
519  nearest_distance = float("Inf")
520 
521  for i in range(0, len(self.successors)):
522  candidate1 = self.successors[i]
523 
524  for j in range(i + 1, len(self.successors)):
525  candidate2 = self.successors[j]
526  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
527 
528  if candidate_distance < nearest_distance:
529  nearest_distance = candidate_distance
530  nearest_node1 = candidate1
531  nearest_node2 = candidate2
532 
533  return [nearest_node1, nearest_node2]
534 
535 
537  """!
538  @brief Represents clustering feature leaf node.
539 
540  """
541 
542  @property
543  def entries(self):
544  """!
545  @return (list) List of leaf nodes.
546 
547  """
548  return self.__entries
549 
550 
551  def __init__(self, feature, parent, entries):
552  """!
553  @brief Create CF Leaf node.
554 
555  @param[in] feature (cfentry): Clustering feature of the created node.
556  @param[in] parent (non_leaf_node): Parent of the created node.
557  @param[in] entries (list): List of entries of the node.
558 
559  """
560 
561  super().__init__(feature, parent)
562 
563 
564  self.type = cfnode_type.CFNODE_LEAF
565 
566  self.__entries = entries # list of clustering features
567 
568 
569  def __repr__(self):
570  """!
571  @return (string) Default leaf node represenation.
572 
573  """
574  text_entries = "\n"
575  for entry in self.entries:
576  text_entries += "\t" + str(entry) + "\n"
577 
578  return "Leaf-node: '%s', parent: '%s', feature: '%s', entries: '%d'" % \
579  (str(hex(id(self))), self.parent, self.feature, len(self.entries))
580 
581 
582  def __str__(self):
583  """!
584  @return (string) String leaf node representation.
585 
586  """
587  return self.__repr__()
588 
589 
590  def insert_entry(self, entry):
591  """!
592  @brief Insert new clustering feature to the leaf node.
593 
594  @param[in] entry (cfentry): Clustering feature.
595 
596  """
597 
598  self.feature += entry
599  self.entries.append(entry)
600 
601 
602  def merge(self, node):
603  """!
604  @brief Merge leaf node to the current.
605 
606  @param[in] node (leaf_node): Leaf node that should be merged with current.
607 
608  """
609 
610  self.feature += node.feature
611 
612  # Move entries from merged node
613  for entry in node.entries:
614  self.entries.append(entry)
615 
616 
617  def get_farthest_entries(self, type_measurement):
618  """!
619  @brief Find pair of farthest entries of the node.
620 
621  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
622 
623  @return (list) Pair of farthest entries of the node that are represented by list.
624 
625  """
626 
627  farthest_entity1 = None
628  farthest_entity2 = None
629  farthest_distance = 0
630 
631  for i in range(0, len(self.entries)):
632  candidate1 = self.entries[i]
633 
634  for j in range(i + 1, len(self.entries)):
635  candidate2 = self.entries[j]
636  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
637 
638  if candidate_distance > farthest_distance:
639  farthest_distance = candidate_distance
640  farthest_entity1 = candidate1
641  farthest_entity2 = candidate2
642 
643  return [farthest_entity1, farthest_entity2]
644 
645 
646  def get_nearest_index_entry(self, entry, type_measurement):
647  """!
648  @brief Find nearest index of nearest entry of node for the specified entry.
649 
650  @param[in] entry (cfentry): Entry that is used for calculation distance.
651  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
652 
653  @return (uint) Index of nearest entry of node for the specified entry.
654 
655  """
656 
657  minimum_distance = float('Inf')
658  nearest_index = -1
659 
660  for candidate_index in range(0, len(self.entries)):
661  candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement)
662  if candidate_distance < minimum_distance:
663  minimum_distance = candidate_distance
664  nearest_index = candidate_index
665 
666  return nearest_index
667 
668 
669  def get_nearest_entry(self, entry, type_measurement):
670  """!
671  @brief Find nearest entry of node for the specified entry.
672 
673  @param[in] entry (cfentry): Entry that is used for calculation distance.
674  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
675 
676  @return (cfentry) Nearest entry of node for the specified entry.
677 
678  """
679 
680  min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement)
681  return min(self.entries, key=min_key)
682 
683 
684 class cftree:
685  """!
686  @brief CF-Tree representation.
687  @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
688 
689  """
690 
691  @property
692  def root(self):
693  """!
694  @return (cfnode) Root of the tree.
695 
696  """
697  return self.__root
698 
699 
700  @property
701  def leafes(self):
702  """!
703  @return (list) List of all leaf nodes in the tree.
704 
705  """
706  return self.__leafes
707 
708 
709  @property
710  def amount_nodes(self):
711  """!
712  @return (unit) Number of nodes (leaf and non-leaf) in the tree.
713 
714  """
715  return self.__amount_nodes
716 
717 
718  @property
719  def amount_entries(self):
720  """!
721  @return (uint) Number of entries in the tree.
722 
723  """
724  return self.__amount_entries
725 
726 
727  @property
728  def height(self):
729  """!
730  @return (uint) Height of the tree.
731 
732  """
733  return self.__height
734 
735 
736  @property
737  def branch_factor(self):
738  """!
739  @return (uint) Branching factor of the tree.
740  @details Branching factor defines maximum number of successors in each non-leaf node.
741 
742  """
743  return self.__branch_factor
744 
745 
746  @property
747  def threshold(self):
748  """!
749  @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
750 
751  """
752  return self.__threshold
753 
754 
755  @property
756  def max_entries(self):
757  """!
758  @return (uint) Maximum number of entries in each leaf node.
759 
760  """
761  return self.__max_entries
762 
763 
764  @property
765  def type_measurement(self):
766  """!
767  @return (measurement_type) Type that is used for measuring.
768 
769  """
770  return self.__type_measurement
771 
772 
773  def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
774  """!
775  @brief Create CF-tree.
776 
777  @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
778  @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
779  @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
780  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
781 
782  """
783 
784  self.__root = None
785 
786  self.__branch_factor = branch_factor # maximum number of children
787  if self.__branch_factor < 2:
788  self.__branch_factor = 2
789 
790  self.__threshold = threshold # maximum diameter of sub-clusters stored at the leaf nodes
791  self.__max_entries = max_entries
792 
793  self.__leafes = []
794 
795  self.__type_measurement = type_measurement
796 
797  # statistics
798  self.__amount_nodes = 0 # root, despite it can be None.
799  self.__amount_entries = 0
800  self.__height = 0 # tree size with root.
801 
802 
803  def get_level_nodes(self, level):
804  """!
805  @brief Traverses CF-tree to obtain nodes at the specified level.
806 
807  @param[in] level (uint): CF-tree level from that nodes should be returned.
808 
809  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
810 
811  """
812 
813  level_nodes = []
814  if level < self.__height:
815  level_nodes = self.__recursive_get_level_nodes(level, self.__root)
816 
817  return level_nodes
818 
819 
820  def __recursive_get_level_nodes(self, level, node):
821  """!
822  @brief Traverses CF-tree to obtain nodes at the specified level recursively.
823 
824  @param[in] level (uint): Current CF-tree level.
825  @param[in] node (cfnode): CF-node from that traversing is performed.
826 
827  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
828 
829  """
830 
831  level_nodes = []
832  if level == 0:
833  level_nodes.append(node)
834 
835  else:
836  for sucessor in node.successors:
837  level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor)
838 
839  return level_nodes
840 
841 
842  def insert_point(self, point):
843  """!
844  @brief Insert point that is represented by list of coordinates.
845 
846  @param[in] point (list): Point represented by list of coordinates that should be inserted to CF tree.
847 
848  """
849 
850  entry = cfentry(len([point]), linear_sum([point]), square_sum([point]))
851  self.insert(entry)
852 
853 
854  def insert(self, entry):
855  """!
856  @brief Insert clustering feature to the tree.
857 
858  @param[in] entry (cfentry): Clustering feature that should be inserted.
859 
860  """
861 
862  if self.__root is None:
863  node = leaf_node(entry, None, [entry])
864 
865  self.__root = node
866  self.__leafes.append(node)
867 
868  # Update statistics
869  self.__amount_entries += 1
870  self.__amount_nodes += 1
871  self.__height += 1 # root has successor now
872  else:
873  child_node_updation = self.__recursive_insert(entry, self.__root)
874  if child_node_updation is True:
875  # Splitting has been finished, check for possibility to merge (at least we have already two children).
876  if self.__merge_nearest_successors(self.__root) is True:
877  self.__amount_nodes -= 1
878 
879 
880  def find_nearest_leaf(self, entry, search_node = None):
881  """!
882  @brief Search nearest leaf to the specified clustering feature.
883 
884  @param[in] entry (cfentry): Clustering feature.
885  @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
886 
887  @return (leaf_node) Nearest node to the specified clustering feature.
888 
889  """
890 
891  if search_node is None:
892  search_node = self.__root
893 
894  nearest_node = search_node
895 
896  if search_node.type == cfnode_type.CFNODE_NONLEAF:
897  min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement)
898  nearest_child_node = min(search_node.successors, key = min_key)
899 
900  nearest_node = self.find_nearest_leaf(entry, nearest_child_node)
901 
902  return nearest_node
903 
904 
905  def __recursive_insert(self, entry, search_node):
906  """!
907  @brief Recursive insert of the entry to the tree.
908  @details It performs all required procedures during insertion such as splitting, merging.
909 
910  @param[in] entry (cfentry): Clustering feature.
911  @param[in] search_node (cfnode): Node from that insertion should be started.
912 
913  @return (bool) True if number of nodes at the below level is changed, otherwise False.
914 
915  """
916 
917  # None-leaf node
918  if search_node.type == cfnode_type.CFNODE_NONLEAF:
919  return self.__insert_for_noneleaf_node(entry, search_node)
920 
921  # Leaf is reached
922  else:
923  return self.__insert_for_leaf_node(entry, search_node)
924 
925 
926  def __insert_for_leaf_node(self, entry, search_node):
927  """!
928  @brief Recursive insert entry from leaf node to the tree.
929 
930  @param[in] entry (cfentry): Clustering feature.
931  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
932 
933  @return (bool) True if number of nodes at the below level is changed, otherwise False.
934 
935  """
936 
937  node_amount_updation = False
938 
939  # Try to absorb by the entity
940  index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement)
941  nearest_entry = search_node.entries[index_nearest_entry] # get nearest entry
942  merged_entry = nearest_entry + entry
943 
944  # Otherwise try to add new entry
945  if merged_entry.get_diameter() > self.__threshold:
946  # If it's not exceeded append entity and update feature of the leaf node.
947  search_node.insert_entry(entry)
948 
949  # Otherwise current node should be splitted
950  if len(search_node.entries) > self.__max_entries:
951  self.__split_procedure(search_node)
952  node_amount_updation = True
953 
954  # Update statistics
955  self.__amount_entries += 1
956 
957  else:
958  search_node.entries[index_nearest_entry] = merged_entry
959  search_node.feature += entry
960 
961  return node_amount_updation
962 
963 
964  def __insert_for_noneleaf_node(self, entry, search_node):
965  """!
966  @brief Recursive insert entry from none-leaf node to the tree.
967 
968  @param[in] entry (cfentry): Clustering feature.
969  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
970 
971  @return (bool) True if number of nodes at the below level is changed, otherwise False.
972 
973  """
974 
975  node_amount_updation = False
976 
977  min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement)
978  nearest_child_node = min(search_node.successors, key=min_key)
979 
980  child_node_updation = self.__recursive_insert(entry, nearest_child_node)
981 
982  # Update clustering feature of none-leaf node.
983  search_node.feature += entry
984 
985  # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
986  if (len(search_node.successors) > self.__branch_factor):
987 
988  # Check if it's aleady root then new root should be created (height is increased in this case).
989  if search_node is self.__root:
990  self.__root = non_leaf_node(search_node.feature, None, [search_node])
991  search_node.parent = self.__root
992 
993  # Update statistics
994  self.__amount_nodes += 1
995  self.__height += 1
996 
997  [new_node1, new_node2] = self.__split_nonleaf_node(search_node)
998 
999  # Update parent list of successors
1000  parent = search_node.parent
1001  parent.successors.remove(search_node)
1002  parent.successors.append(new_node1)
1003  parent.successors.append(new_node2)
1004 
1005  # Update statistics
1006  self.__amount_nodes += 1
1007  node_amount_updation = True
1008 
1009  elif child_node_updation is True:
1010  # Splitting has been finished, check for possibility to merge (at least we have already two children).
1011  if self.__merge_nearest_successors(search_node) is True:
1012  self.__amount_nodes -= 1
1013 
1014  return node_amount_updation
1015 
1016 
1017  def __merge_nearest_successors(self, node):
1018  """!
1019  @brief Find nearest sucessors and merge them.
1020 
1021  @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1022 
1023  @return (bool): True if merging has been successfully performed, otherwise False.
1024 
1025  """
1026 
1027  merging_result = False
1028 
1029  if node.successors[0].type == cfnode_type.CFNODE_NONLEAF:
1030  [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement)
1031 
1032  if len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor:
1033  node.successors.remove(nearest_child_node2)
1034  if nearest_child_node2.type == cfnode_type.CFNODE_LEAF:
1035  self.__leafes.remove(nearest_child_node2)
1036 
1037  nearest_child_node1.merge(nearest_child_node2)
1038 
1039  merging_result = True
1040 
1041  return merging_result
1042 
1043 
1044  def __split_procedure(self, split_node):
1045  """!
1046  @brief Starts node splitting procedure in the CF-tree from the specify node.
1047 
1048  @param[in] split_node (cfnode): CF-tree node that should be splitted.
1049 
1050  """
1051  if split_node is self.__root:
1052  self.__root = non_leaf_node(split_node.feature, None, [ split_node ])
1053  split_node.parent = self.__root
1054 
1055  # Update statistics
1056  self.__amount_nodes += 1
1057  self.__height += 1
1058 
1059  [new_node1, new_node2] = self.__split_leaf_node(split_node)
1060 
1061  self.__leafes.remove(split_node)
1062  self.__leafes.append(new_node1)
1063  self.__leafes.append(new_node2)
1064 
1065  # Update parent list of successors
1066  parent = split_node.parent
1067  parent.successors.remove(split_node)
1068  parent.successors.append(new_node1)
1069  parent.successors.append(new_node2)
1070 
1071  # Update statistics
1072  self.__amount_nodes += 1
1073 
1074 
1075  def __split_nonleaf_node(self, node):
1076  """!
1077  @brief Performs splitting of the specified non-leaf node.
1078 
1079  @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1080 
1081  @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1082 
1083  """
1084 
1085  [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement)
1086 
1087  # create new non-leaf nodes
1088  new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [farthest_node1])
1089  new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [farthest_node2])
1090 
1091  farthest_node1.parent = new_node1
1092  farthest_node2.parent = new_node2
1093 
1094  # re-insert other successors
1095  for successor in node.successors:
1096  if (successor is not farthest_node1) and (successor is not farthest_node2):
1097  distance1 = new_node1.get_distance(successor, self.__type_measurement)
1098  distance2 = new_node2.get_distance(successor, self.__type_measurement)
1099 
1100  if distance1 < distance2:
1101  new_node1.insert_successor(successor)
1102  else:
1103  new_node2.insert_successor(successor)
1104 
1105  return [new_node1, new_node2]
1106 
1107 
1108  def __split_leaf_node(self, node):
1109  """!
1110  @brief Performs splitting of the specified leaf node.
1111 
1112  @param[in] node (leaf_node): Leaf node that should be splitted.
1113 
1114  @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1115 
1116  @warning Splitted node is transformed to non_leaf.
1117 
1118  """
1119 
1120  # search farthest pair of entries
1121  [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement)
1122 
1123  # create new nodes
1124  new_node1 = leaf_node(farthest_entity1, node.parent, [farthest_entity1])
1125  new_node2 = leaf_node(farthest_entity2, node.parent, [farthest_entity2])
1126 
1127  # re-insert other entries
1128  for entity in node.entries:
1129  if (entity is not farthest_entity1) and (entity is not farthest_entity2):
1130  distance1 = new_node1.feature.get_distance(entity, self.__type_measurement)
1131  distance2 = new_node2.feature.get_distance(entity, self.__type_measurement)
1132 
1133  if distance1 < distance2:
1134  new_node1.insert_entry(entity)
1135  else:
1136  new_node2.insert_entry(entity)
1137 
1138  return [new_node1, new_node2]
pyclustering.container.cftree.non_leaf_node.__repr__
def __repr__(self)
Definition: cftree.py:433
pyclustering.container.cftree.non_leaf_node.successors
def successors(self)
Definition: cftree.py:407
pyclustering.container.cftree.cftree.__height
__height
Definition: cftree.py:800
pyclustering.container.cftree.cftree.insert_point
def insert_point(self, point)
Insert point that is represented by list of coordinates.
Definition: cftree.py:842
pyclustering.container.cftree.cftree.height
def height(self)
Definition: cftree.py:728
pyclustering.container.cftree.cfentry.number_points
def number_points(self)
Returns number of points that are encoded.
Definition: cftree.py:76
pyclustering.container.cftree.cfentry.__diameter
__diameter
Definition: cftree.py:123
pyclustering.container.cftree.cfnode.__str__
def __str__(self)
Definition: cftree.py:378
pyclustering.container.cftree.cfentry.__repr__
def __repr__(self)
Definition: cftree.py:134
pyclustering.container.cftree.non_leaf_node.__successors
__successors
Definition: cftree.py:430
pyclustering.container.cftree.cfentry.get_diameter
def get_diameter(self)
Calculates diameter of cluster that is represented by the entry.
Definition: cftree.py:263
pyclustering.container.cftree.cftree.leafes
def leafes(self)
Definition: cftree.py:701
pyclustering.container.cftree.non_leaf_node.__str__
def __str__(self)
Definition: cftree.py:441
pyclustering.container.cftree.cfentry.__radius
__radius
Definition: cftree.py:122
pyclustering.container.cftree.non_leaf_node.__init__
def __init__(self, feature, parent, successors)
Create CF Non-leaf node.
Definition: cftree.py:415
pyclustering.container.cftree.cftree.__recursive_get_level_nodes
def __recursive_get_level_nodes(self, level, node)
Traverses CF-tree to obtain nodes at the specified level recursively.
Definition: cftree.py:820
pyclustering.container.cftree.cftree.__init__
def __init__(self, branch_factor, max_entries, threshold, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE)
Create CF-tree.
Definition: cftree.py:773
pyclustering.container.cftree.leaf_node.__entries
__entries
Definition: cftree.py:566
pyclustering.container.cftree.leaf_node.insert_entry
def insert_entry(self, entry)
Insert new clustering feature to the leaf node.
Definition: cftree.py:590
pyclustering.container.cftree.cfentry.__get_average_intra_cluster_distance
def __get_average_intra_cluster_distance(self, entry)
Calculates average intra cluster distance between current and specified clusters.
Definition: cftree.py:300
pyclustering.container.cftree.leaf_node.get_farthest_entries
def get_farthest_entries(self, type_measurement)
Find pair of farthest entries of the node.
Definition: cftree.py:617
pyclustering.container.cftree.cfnode.parent
parent
Pointer to the parent node (None for root).
Definition: cftree.py:363
pyclustering.container.cftree.cftree.__split_nonleaf_node
def __split_nonleaf_node(self, node)
Performs splitting of the specified non-leaf node.
Definition: cftree.py:1075
pyclustering.container.cftree.cfnode.get_distance
def get_distance(self, node, type_measurement)
Calculates distance between nodes in line with specified type measurement.
Definition: cftree.py:386
pyclustering.container.cftree.cfentry.get_radius
def get_radius(self)
Calculates radius of cluster that is represented by the entry.
Definition: cftree.py:240
pyclustering.container.cftree.leaf_node.get_nearest_index_entry
def get_nearest_index_entry(self, entry, type_measurement)
Find nearest index of nearest entry of node for the specified entry.
Definition: cftree.py:646
pyclustering.container.cftree.cfentry.__copy__
def __copy__(self)
Definition: cftree.py:126
pyclustering.container.cftree.cfentry.square_sum
def square_sum(self)
Returns square sum.
Definition: cftree.py:97
pyclustering.container.cftree.cfentry.__eq__
def __eq__(self, entry)
Overloaded operator eq.
Definition: cftree.py:168
pyclustering.container.cftree.cftree.__split_procedure
def __split_procedure(self, split_node)
Starts node splitting procedure in the CF-tree from the specify node.
Definition: cftree.py:1044
pyclustering.container.cftree.cfnode.__init__
def __init__(self, feature, parent)
Constructor of abstract CF node.
Definition: cftree.py:350
pyclustering.container.cftree.cfnode.type
type
Type node (leaf or non-leaf).
Definition: cftree.py:366
pyclustering.container.cftree.cftree.insert
def insert(self, entry)
Insert clustering feature to the tree.
Definition: cftree.py:854
pyclustering.container.cftree.cftree
CF-Tree representation.
Definition: cftree.py:684
pyclustering.container.cftree.cftree.__max_entries
__max_entries
Definition: cftree.py:791
pyclustering.container.cftree.cfentry
Clustering feature representation.
Definition: cftree.py:66
pyclustering.container.cftree.non_leaf_node
Representation of clustering feature non-leaf node.
Definition: cftree.py:400
pyclustering.container.cftree.cftree.find_nearest_leaf
def find_nearest_leaf(self, entry, search_node=None)
Search nearest leaf to the specified clustering feature.
Definition: cftree.py:880
pyclustering.container.cftree.cftree.__threshold
__threshold
Definition: cftree.py:790
pyclustering.container.cftree.cftree.get_level_nodes
def get_level_nodes(self, level)
Traverses CF-tree to obtain nodes at the specified level.
Definition: cftree.py:803
pyclustering.container.cftree.cftree.root
def root(self)
Definition: cftree.py:692
pyclustering.container.cftree.leaf_node.__str__
def __str__(self)
Definition: cftree.py:582
pyclustering.container.cftree.cfentry.get_centroid
def get_centroid(self)
Calculates centroid of cluster that is represented by the entry.
Definition: cftree.py:224
pyclustering.container.cftree.cfentry.__str__
def __str__(self)
Default cfentry string representation.
Definition: cftree.py:143
pyclustering.container.cftree.cfnode.__repr__
def __repr__(self)
Definition: cftree.py:369
pyclustering.container.cftree.cfentry.__centroid
__centroid
Definition: cftree.py:121
pyclustering.container.cftree.cftree.__amount_entries
__amount_entries
Definition: cftree.py:799
pyclustering.container.cftree.cftree.__leafes
__leafes
Definition: cftree.py:793
pyclustering.container.cftree.cfentry.__linear_sum
__linear_sum
Definition: cftree.py:118
pyclustering.container.cftree.cftree.__insert_for_leaf_node
def __insert_for_leaf_node(self, entry, search_node)
Recursive insert entry from leaf node to the tree.
Definition: cftree.py:926
pyclustering.container.cftree.cftree.__insert_for_noneleaf_node
def __insert_for_noneleaf_node(self, entry, search_node)
Recursive insert entry from none-leaf node to the tree.
Definition: cftree.py:964
pyclustering.container.cftree.cftree.branch_factor
def branch_factor(self)
Definition: cftree.py:737
pyclustering.container.cftree.cftree.__branch_factor
__branch_factor
Definition: cftree.py:786
pyclustering.container.cftree.cftree.__merge_nearest_successors
def __merge_nearest_successors(self, node)
Find nearest sucessors and merge them.
Definition: cftree.py:1017
pyclustering.container.cftree.leaf_node.get_nearest_entry
def get_nearest_entry(self, entry, type_measurement)
Find nearest entry of node for the specified entry.
Definition: cftree.py:669
pyclustering.container.cftree.cftree.__amount_nodes
__amount_nodes
Definition: cftree.py:798
pyclustering.container.cftree.cfentry.__get_variance_increase_distance
def __get_variance_increase_distance(self, entry)
Calculates variance increase distance between current and specified clusters.
Definition: cftree.py:320
pyclustering.container.cftree.cfentry.get_distance
def get_distance(self, entry, type_measurement)
Calculates distance between two clusters in line with measurement type.
Definition: cftree.py:190
pyclustering.container.cftree.cfnode.feature
feature
Clustering feature of the node.
Definition: cftree.py:360
pyclustering.container.cftree.non_leaf_node.merge
def merge(self, node)
Merge non-leaf node to the current.
Definition: cftree.py:463
pyclustering.container.cftree.non_leaf_node.get_nearest_successors
def get_nearest_successors(self, type_measurement)
Find pair of nearest successors of the node in line with measurement type.
Definition: cftree.py:507
pyclustering.container.cftree.measurement_type
Enumeration of measurement types for CF-Tree.
Definition: cftree.py:23
pyclustering.container.cftree.non_leaf_node.get_farthest_successors
def get_farthest_successors(self, type_measurement)
Find pair of farthest successors of the node in line with measurement type.
Definition: cftree.py:478
pyclustering.container.cftree.cfentry.__number_points
__number_points
Definition: cftree.py:117
pyclustering.container.cftree.leaf_node
Represents clustering feature leaf node.
Definition: cftree.py:536
pyclustering.container.cftree.cfentry.linear_sum
def linear_sum(self)
Returns linear sum.
Definition: cftree.py:86
pyclustering.container.cftree.cftree.type_measurement
def type_measurement(self)
Definition: cftree.py:765
pyclustering.container.cftree.leaf_node.merge
def merge(self, node)
Merge leaf node to the current.
Definition: cftree.py:602
pyclustering.container.cftree.cftree.amount_entries
def amount_entries(self)
Definition: cftree.py:719
pyclustering.container.cftree.leaf_node.entries
def entries(self)
Definition: cftree.py:543
pyclustering.container.cftree.cftree.__root
__root
Definition: cftree.py:784
pyclustering.utils
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
pyclustering.container.cftree.cftree.max_entries
def max_entries(self)
Definition: cftree.py:756
pyclustering.container.cftree.cfentry.__square_sum
__square_sum
Definition: cftree.py:119
pyclustering.container.cftree.cftree.threshold
def threshold(self)
Definition: cftree.py:747
pyclustering.container.cftree.non_leaf_node.insert_successor
def insert_successor(self, successor)
Insert successor to the node.
Definition: cftree.py:449
pyclustering.container.cftree.cfnode
Representation of node of CF-Tree.
Definition: cftree.py:344
pyclustering.container.cftree.cfentry.__init__
def __init__(self, number_points, linear_sum, square_sum)
CF-entry constructor.
Definition: cftree.py:107
pyclustering.container.cftree.cftree.__type_measurement
__type_measurement
Definition: cftree.py:795
pyclustering.container.cftree.leaf_node.__init__
def __init__(self, feature, parent, entries)
Create CF Leaf node.
Definition: cftree.py:551
pyclustering.container.cftree.cfnode_type
Enumeration of CF-Node types that are used by CF-Tree.
Definition: cftree.py:47
pyclustering.container.cftree.cftree.amount_nodes
def amount_nodes(self)
Definition: cftree.py:710
pyclustering.container.cftree.leaf_node.__repr__
def __repr__(self)
Definition: cftree.py:569
pyclustering.container.cftree.cfentry.__add__
def __add__(self, entry)
Overloaded operator add.
Definition: cftree.py:151
pyclustering.container.cftree.cftree.__recursive_insert
def __recursive_insert(self, entry, search_node)
Recursive insert of the entry to the tree.
Definition: cftree.py:905
pyclustering.container.cftree.cfentry.__get_average_inter_cluster_distance
def __get_average_inter_cluster_distance(self, entry)
Calculates average inter cluster distance between current and specified clusters.
Definition: cftree.py:285
pyclustering.container.cftree.cftree.__split_leaf_node
def __split_leaf_node(self, node)
Performs splitting of the specified leaf node.
Definition: cftree.py:1108