cftree.py
1 """!
2 
3 @brief Data Structure: CF-Tree
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 import numpy
28 
29 from copy import copy
30 
31 from pyclustering.utils import euclidean_distance_square
32 from pyclustering.utils import manhattan_distance
33 from pyclustering.utils import linear_sum, square_sum
34 
35 from enum import IntEnum
36 
37 
38 class measurement_type(IntEnum):
39  """!
40  @brief Enumeration of measurement types for CF-Tree.
41 
42  @see cftree
43 
44  """
45 
46 
47  CENTROID_EUCLIDEAN_DISTANCE = 0
48 
49 
50  CENTROID_MANHATTAN_DISTANCE = 1
51 
52 
53  AVERAGE_INTER_CLUSTER_DISTANCE = 2
54 
55 
56  AVERAGE_INTRA_CLUSTER_DISTANCE = 3
57 
58 
59  VARIANCE_INCREASE_DISTANCE = 4
60 
61 
62 class cfnode_type(IntEnum):
63  """!
64  @brief Enumeration of CF-Node types that are used by CF-Tree.
65 
66  @see cfnode
67  @see cftree
68 
69  """
70 
71 
72  CFNODE_DUMMY = 0
73 
74 
75  CFNODE_LEAF = 1
76 
77 
78  CFNODE_NONLEAF = 2
79 
80 
81 class cfentry:
82  """!
83  @brief Clustering feature representation.
84 
85  @see cfnode
86  @see cftree
87 
88  """
89 
90  @property
91  def number_points(self):
92  """!
93  @brief Returns number of points that are encoded.
94 
95  @return (uint) Number of encoded points.
96 
97  """
98  return self.__number_points
99 
100  @property
101  def linear_sum(self):
102  """!
103  @brief Returns linear sum.
104 
105  @return (list) Linear sum.
106 
107  """
108 
109  return self.__linear_sum
110 
111  @property
112  def square_sum(self):
113  """!
114  @brief Returns square sum.
115 
116  @return (double) Square sum.
117 
118  """
119  return self.__square_sum
120 
121 
122  def __init__(self, number_points, linear_sum, square_sum):
123  """!
124  @brief CF-entry constructor.
125 
126  @param[in] number_points (uint): Number of objects that is represented by the entry.
127  @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
128  @param[in] square_sum (double): Square sum of values that represent objects.
129 
130  """
131 
132  self.__number_points = number_points
133  self.__linear_sum = numpy.array(linear_sum)
134  self.__square_sum = square_sum
135 
136  self.__centroid = None
137  self.__radius = None
138  self.__diameter = None
139 
140 
141  def __copy__(self):
142  """!
143  @returns (cfentry) Makes copy of the CF-entry instance.
144 
145  """
146  return cfentry(self.__number_points, self.__linear_sum, self.__square_sum)
147 
148 
149  def __repr__(self):
150  """!
151  @return (string) Returns CF-entry representation.
152 
153  """
154  return "CF-Entry (N: '%s', LS: '%s', D: '%s')" % \
155  (self.number_points, self.linear_sum, str(self.get_diameter()))
156 
157 
158  def __str__(self):
159  """!
160  @brief Default cfentry string representation.
161 
162  """
163  return self.__repr__()
164 
165 
166  def __add__(self, entry):
167  """!
168  @brief Overloaded operator add. Performs addition of two clustering features.
169 
170  @param[in] entry (cfentry): Entry that is added to the current.
171 
172  @return (cfentry) Result of addition of two clustering features.
173 
174  """
175 
176  number_points = self.number_points + entry.number_points
177  result_linear_sum = numpy.add(self.linear_sum, entry.linear_sum)
178  result_square_sum = self.square_sum + entry.square_sum
179 
180  return cfentry(number_points, result_linear_sum, result_square_sum)
181 
182 
183  def __eq__(self, entry):
184  """!
185  @brief Overloaded operator eq.
186  @details Performs comparison of two clustering features.
187 
188  @param[in] entry (cfentry): Entry that is used for comparison with current.
189 
190  @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
191 
192  """
193 
194  tolerance = 0.00001
195 
196  result = (self.__number_points == entry.number_points)
197  result &= ((self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum))
198 
199  for index_dimension in range(0, len(self.linear_sum)):
200  result &= ((self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]))
201 
202  return result
203 
204 
205  def get_distance(self, entry, type_measurement):
206  """!
207  @brief Calculates distance between two clusters in line with measurement type.
208 
209  @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
210  Square root should be taken from the result for obtaining real euclidian distance between
211  entries.
212 
213  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
214  @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
215 
216  @return (double) Distance between two clusters.
217 
218  """
219 
220  if type_measurement is measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
221  return euclidean_distance_square(entry.get_centroid(), self.get_centroid())
222 
223  elif type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE:
224  return manhattan_distance(entry.get_centroid(), self.get_centroid())
225 
226  elif type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
227  return self.__get_average_inter_cluster_distance(entry)
228 
229  elif type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
230  return self.__get_average_intra_cluster_distance(entry)
231 
232  elif type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE:
233  return self.__get_variance_increase_distance(entry)
234 
235  else:
236  raise ValueError("Unsupported type of measurement '%s' is specified." % type_measurement)
237 
238 
239  def get_centroid(self):
240  """!
241  @brief Calculates centroid of cluster that is represented by the entry.
242  @details It's calculated once when it's requested after the last changes.
243 
244  @return (array_like) Centroid of cluster that is represented by the entry.
245 
246  """
247 
248  if self.__centroid is not None:
249  return self.__centroid
250 
251  self.__centroid = numpy.divide(self.linear_sum, self.number_points)
252  return self.__centroid
253 
254 
255  def get_radius(self):
256  """!
257  @brief Calculates radius of cluster that is represented by the entry.
258  @details It's calculated once when it's requested after the last changes.
259 
260  @return (double) Radius of cluster that is represented by the entry.
261 
262  """
263 
264  if self.__radius is not None:
265  return self.__radius
266 
267  N = self.number_points
268  centroid = self.get_centroid()
269 
270  radius_part_1 = self.square_sum
271  radius_part_2 = 2.0 * numpy.dot(self.linear_sum, centroid)
272  radius_part_3 = N * numpy.dot(centroid, centroid)
273 
274  self.__radius = ((1.0 / N) * (radius_part_1 - radius_part_2 + radius_part_3)) ** 0.5
275  return self.__radius
276 
277 
278  def get_diameter(self):
279  """!
280  @brief Calculates diameter of cluster that is represented by the entry.
281  @details It's calculated once when it's requested after the last changes.
282 
283  @return (double) Diameter of cluster that is represented by the entry.
284 
285  """
286 
287  if self.__diameter is not None:
288  return self.__diameter
289 
290  diameter_part = self.square_sum * self.number_points - 2.0 * numpy.dot(self.linear_sum, self.linear_sum) + self.square_sum * self.number_points
291 
292  if diameter_part < 0.000000001:
293  self.__diameter = 0.0
294  else:
295  self.__diameter = (diameter_part / (self.number_points * (self.number_points - 1))) ** 0.5
296 
297  return self.__diameter
298 
299 
300  def __get_average_inter_cluster_distance(self, entry):
301  """!
302  @brief Calculates average inter cluster distance between current and specified clusters.
303 
304  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
305 
306  @return (double) Average inter cluster distance.
307 
308  """
309 
310  linear_part_distance = numpy.dot(self.linear_sum, entry.linear_sum)
311 
312  return ((entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points)) ** 0.5
313 
314 
315  def __get_average_intra_cluster_distance(self, entry):
316  """!
317  @brief Calculates average intra cluster distance between current and specified clusters.
318 
319  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
320 
321  @return (double) Average intra cluster distance.
322 
323  """
324 
325  linear_part_first = numpy.add(self.linear_sum, entry.linear_sum)
326  linear_part_second = linear_part_first
327 
328  linear_part_distance = numpy.dot(linear_part_first, linear_part_second)
329 
330  general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance
331 
332  return (general_part_distance / ((self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0))) ** 0.5
333 
334 
335  def __get_variance_increase_distance(self, entry):
336  """!
337  @brief Calculates variance increase distance between current and specified clusters.
338 
339  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
340 
341  @return (double) Variance increase distance.
342 
343  """
344 
345  linear_part_12 = numpy.add(self.linear_sum, entry.linear_sum)
346  variance_part_first = (self.square_sum + entry.square_sum) - \
347  2.0 * numpy.dot(linear_part_12, linear_part_12) / (self.number_points + entry.number_points) + \
348  (self.number_points + entry.number_points) * numpy.dot(linear_part_12, linear_part_12) / (self.number_points + entry.number_points)**2.0
349 
350  linear_part_11 = numpy.dot(self.linear_sum, self.linear_sum)
351  variance_part_second = -(self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points))
352 
353  linear_part_22 = numpy.dot(entry.linear_sum, entry.linear_sum)
354  variance_part_third = -(entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22)
355 
356  return variance_part_first + variance_part_second + variance_part_third
357 
358 
359 class cfnode:
360  """!
361  @brief Representation of node of CF-Tree.
362 
363  """
364 
365  def __init__(self, feature, parent):
366  """!
367  @brief Constructor of abstract CF node.
368 
369  @param[in] feature (cfentry): Clustering feature of the created node.
370  @param[in] parent (cfnode): Parent of the created node.
371 
372  """
373 
374 
375  self.feature = copy(feature)
376 
377 
378  self.parent = parent
379 
380 
381  self.type = cfnode_type.CFNODE_DUMMY
382 
383 
384  def __repr__(self):
385  """!
386  @return (string) Default representation of CF node.
387 
388  """
389 
390  return 'CF node %s, parent %s, feature %s' % (hex(id(self)), self.parent, self.feature)
391 
392 
393  def __str__(self):
394  """!
395  @return (string) String representation of CF node.
396 
397  """
398  return self.__repr__()
399 
400 
401  def get_distance(self, node, type_measurement):
402  """!
403  @brief Calculates distance between nodes in line with specified type measurement.
404 
405  @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
406  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
407 
408  @return (double) Distance between two nodes.
409 
410  """
411 
412  return self.feature.get_distance(node.feature, type_measurement)
413 
414 
416  """!
417  @brief Representation of clustering feature non-leaf node.
418 
419  """
420 
421  @property
422  def successors(self):
423  """!
424  @return (list) List of successors of the node.
425 
426  """
427  return self.__successors
428 
429 
430  def __init__(self, feature, parent, successors):
431  """!
432  @brief Create CF Non-leaf node.
433 
434  @param[in] feature (cfentry): Clustering feature of the created node.
435  @param[in] parent (non_leaf_node): Parent of the created node.
436  @param[in] successors (list): List of successors of the node.
437 
438  """
439 
440  super().__init__(feature, parent)
441 
442 
443  self.type = cfnode_type.CFNODE_NONLEAF
444 
445  self.__successors = successors
446 
447 
448  def __repr__(self):
449  """!
450  @return (string) Representation of non-leaf node representation.
451 
452  """
453  return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % (hex(id(self)), self.parent, self.feature, len(self.successors))
454 
455 
456  def __str__(self):
457  """!
458  @return (string) String non-leaf representation.
459 
460  """
461  return self.__repr__()
462 
463 
464  def insert_successor(self, successor):
465  """!
466  @brief Insert successor to the node.
467 
468  @param[in] successor (cfnode): Successor for adding.
469 
470  """
471 
472  self.feature += successor.feature
473  self.successors.append(successor)
474 
475  successor.parent = self
476 
477 
478  def merge(self, node):
479  """!
480  @brief Merge non-leaf node to the current.
481 
482  @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
483 
484  """
485 
486  self.feature += node.feature
487 
488  for child in node.successors:
489  child.parent = self
490  self.successors.append(child)
491 
492 
493  def get_farthest_successors(self, type_measurement):
494  """!
495  @brief Find pair of farthest successors of the node in line with measurement type.
496 
497  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
498 
499  @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
500 
501  """
502 
503  farthest_node1 = None
504  farthest_node2 = None
505  farthest_distance = 0.0
506 
507  for i in range(0, len(self.successors)):
508  candidate1 = self.successors[i]
509 
510  for j in range(i + 1, len(self.successors)):
511  candidate2 = self.successors[j]
512  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
513 
514  if candidate_distance > farthest_distance:
515  farthest_distance = candidate_distance
516  farthest_node1 = candidate1
517  farthest_node2 = candidate2
518 
519  return [farthest_node1, farthest_node2]
520 
521 
522  def get_nearest_successors(self, type_measurement):
523  """!
524  @brief Find pair of nearest successors of the node in line with measurement type.
525 
526  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
527 
528  @return (list) Pair of nearest successors represented by list.
529 
530  """
531 
532  nearest_node1 = None
533  nearest_node2 = None
534  nearest_distance = float("Inf")
535 
536  for i in range(0, len(self.successors)):
537  candidate1 = self.successors[i]
538 
539  for j in range(i + 1, len(self.successors)):
540  candidate2 = self.successors[j]
541  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
542 
543  if candidate_distance < nearest_distance:
544  nearest_distance = candidate_distance
545  nearest_node1 = candidate1
546  nearest_node2 = candidate2
547 
548  return [nearest_node1, nearest_node2]
549 
550 
552  """!
553  @brief Represents clustering feature leaf node.
554 
555  """
556 
557  @property
558  def entries(self):
559  """!
560  @return (list) List of leaf nodes.
561 
562  """
563  return self.__entries
564 
565 
566  def __init__(self, feature, parent, entries):
567  """!
568  @brief Create CF Leaf node.
569 
570  @param[in] feature (cfentry): Clustering feature of the created node.
571  @param[in] parent (non_leaf_node): Parent of the created node.
572  @param[in] entries (list): List of entries of the node.
573 
574  """
575 
576  super().__init__(feature, parent)
577 
578 
579  self.type = cfnode_type.CFNODE_LEAF
580 
581  self.__entries = entries # list of clustering features
582 
583 
584  def __repr__(self):
585  """!
586  @return (string) Default leaf node represenation.
587 
588  """
589  text_entries = "\n"
590  for entry in self.entries:
591  text_entries += "\t" + str(entry) + "\n"
592 
593  return "Leaf-node: '%s', parent: '%s', feature: '%s', entries: '%d'" % \
594  (str(hex(id(self))), self.parent, self.feature, len(self.entries))
595 
596 
597  def __str__(self):
598  """!
599  @return (string) String leaf node representation.
600 
601  """
602  return self.__repr__()
603 
604 
605  def insert_entry(self, entry):
606  """!
607  @brief Insert new clustering feature to the leaf node.
608 
609  @param[in] entry (cfentry): Clustering feature.
610 
611  """
612 
613  self.feature += entry
614  self.entries.append(entry)
615 
616 
617  def merge(self, node):
618  """!
619  @brief Merge leaf node to the current.
620 
621  @param[in] node (leaf_node): Leaf node that should be merged with current.
622 
623  """
624 
625  self.feature += node.feature
626 
627  # Move entries from merged node
628  for entry in node.entries:
629  self.entries.append(entry)
630 
631 
632  def get_farthest_entries(self, type_measurement):
633  """!
634  @brief Find pair of farthest entries of the node.
635 
636  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
637 
638  @return (list) Pair of farthest entries of the node that are represented by list.
639 
640  """
641 
642  farthest_entity1 = None
643  farthest_entity2 = None
644  farthest_distance = 0
645 
646  for i in range(0, len(self.entries)):
647  candidate1 = self.entries[i]
648 
649  for j in range(i + 1, len(self.entries)):
650  candidate2 = self.entries[j]
651  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
652 
653  if candidate_distance > farthest_distance:
654  farthest_distance = candidate_distance
655  farthest_entity1 = candidate1
656  farthest_entity2 = candidate2
657 
658  return [farthest_entity1, farthest_entity2]
659 
660 
661  def get_nearest_index_entry(self, entry, type_measurement):
662  """!
663  @brief Find nearest index of nearest entry of node for the specified entry.
664 
665  @param[in] entry (cfentry): Entry that is used for calculation distance.
666  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
667 
668  @return (uint) Index of nearest entry of node for the specified entry.
669 
670  """
671 
672  minimum_distance = float('Inf')
673  nearest_index = -1
674 
675  for candidate_index in range(0, len(self.entries)):
676  candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement)
677  if candidate_distance < minimum_distance:
678  minimum_distance = candidate_distance
679  nearest_index = candidate_index
680 
681  return nearest_index
682 
683 
684  def get_nearest_entry(self, entry, type_measurement):
685  """!
686  @brief Find nearest entry of node for the specified entry.
687 
688  @param[in] entry (cfentry): Entry that is used for calculation distance.
689  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
690 
691  @return (cfentry) Nearest entry of node for the specified entry.
692 
693  """
694 
695  min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement)
696  return min(self.entries, key=min_key)
697 
698 
699 class cftree:
700  """!
701  @brief CF-Tree representation.
702  @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
703 
704  """
705 
706  @property
707  def root(self):
708  """!
709  @return (cfnode) Root of the tree.
710 
711  """
712  return self.__root
713 
714 
715  @property
716  def leafes(self):
717  """!
718  @return (list) List of all leaf nodes in the tree.
719 
720  """
721  return self.__leafes
722 
723 
724  @property
725  def amount_nodes(self):
726  """!
727  @return (unit) Number of nodes (leaf and non-leaf) in the tree.
728 
729  """
730  return self.__amount_nodes
731 
732 
733  @property
734  def amount_entries(self):
735  """!
736  @return (uint) Number of entries in the tree.
737 
738  """
739  return self.__amount_entries
740 
741 
742  @property
743  def height(self):
744  """!
745  @return (uint) Height of the tree.
746 
747  """
748  return self.__height
749 
750 
751  @property
752  def branch_factor(self):
753  """!
754  @return (uint) Branching factor of the tree.
755  @details Branching factor defines maximum number of successors in each non-leaf node.
756 
757  """
758  return self.__branch_factor
759 
760 
761  @property
762  def threshold(self):
763  """!
764  @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
765 
766  """
767  return self.__threshold
768 
769 
770  @property
771  def max_entries(self):
772  """!
773  @return (uint) Maximum number of entries in each leaf node.
774 
775  """
776  return self.__max_entries
777 
778 
779  @property
780  def type_measurement(self):
781  """!
782  @return (measurement_type) Type that is used for measuring.
783 
784  """
785  return self.__type_measurement
786 
787 
788  def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
789  """!
790  @brief Create CF-tree.
791 
792  @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
793  @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
794  @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
795  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
796 
797  """
798 
799  self.__root = None
800 
801  self.__branch_factor = branch_factor # maximum number of children
802  if self.__branch_factor < 2:
803  self.__branch_factor = 2
804 
805  self.__threshold = threshold # maximum diameter of sub-clusters stored at the leaf nodes
806  self.__max_entries = max_entries
807 
808  self.__leafes = []
809 
810  self.__type_measurement = type_measurement
811 
812  # statistics
813  self.__amount_nodes = 0 # root, despite it can be None.
814  self.__amount_entries = 0
815  self.__height = 0 # tree size with root.
816 
817 
818  def get_level_nodes(self, level):
819  """!
820  @brief Traverses CF-tree to obtain nodes at the specified level.
821 
822  @param[in] level (uint): CF-tree level from that nodes should be returned.
823 
824  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
825 
826  """
827 
828  level_nodes = []
829  if level < self.__height:
830  level_nodes = self.__recursive_get_level_nodes(level, self.__root)
831 
832  return level_nodes
833 
834 
835  def __recursive_get_level_nodes(self, level, node):
836  """!
837  @brief Traverses CF-tree to obtain nodes at the specified level recursively.
838 
839  @param[in] level (uint): Current CF-tree level.
840  @param[in] node (cfnode): CF-node from that traversing is performed.
841 
842  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
843 
844  """
845 
846  level_nodes = []
847  if level == 0:
848  level_nodes.append(node)
849 
850  else:
851  for sucessor in node.successors:
852  level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor)
853 
854  return level_nodes
855 
856 
857  def insert_point(self, point):
858  """!
859  @brief Insert point that is represented by list of coordinates.
860 
861  @param[in] point (list): Point represented by list of coordinates that should be inserted to CF tree.
862 
863  """
864 
865  entry = cfentry(len([point]), linear_sum([point]), square_sum([point]))
866  self.insert(entry)
867 
868 
869  def insert(self, entry):
870  """!
871  @brief Insert clustering feature to the tree.
872 
873  @param[in] entry (cfentry): Clustering feature that should be inserted.
874 
875  """
876 
877  if self.__root is None:
878  node = leaf_node(entry, None, [entry])
879 
880  self.__root = node
881  self.__leafes.append(node)
882 
883  # Update statistics
884  self.__amount_entries += 1
885  self.__amount_nodes += 1
886  self.__height += 1 # root has successor now
887  else:
888  child_node_updation = self.__recursive_insert(entry, self.__root)
889  if child_node_updation is True:
890  # Splitting has been finished, check for possibility to merge (at least we have already two children).
891  if self.__merge_nearest_successors(self.__root) is True:
892  self.__amount_nodes -= 1
893 
894 
895  def find_nearest_leaf(self, entry, search_node = None):
896  """!
897  @brief Search nearest leaf to the specified clustering feature.
898 
899  @param[in] entry (cfentry): Clustering feature.
900  @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
901 
902  @return (leaf_node) Nearest node to the specified clustering feature.
903 
904  """
905 
906  if search_node is None:
907  search_node = self.__root
908 
909  nearest_node = search_node
910 
911  if search_node.type == cfnode_type.CFNODE_NONLEAF:
912  min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement)
913  nearest_child_node = min(search_node.successors, key = min_key)
914 
915  nearest_node = self.find_nearest_leaf(entry, nearest_child_node)
916 
917  return nearest_node
918 
919 
920  def __recursive_insert(self, entry, search_node):
921  """!
922  @brief Recursive insert of the entry to the tree.
923  @details It performs all required procedures during insertion such as splitting, merging.
924 
925  @param[in] entry (cfentry): Clustering feature.
926  @param[in] search_node (cfnode): Node from that insertion should be started.
927 
928  @return (bool) True if number of nodes at the below level is changed, otherwise False.
929 
930  """
931 
932  # None-leaf node
933  if search_node.type == cfnode_type.CFNODE_NONLEAF:
934  return self.__insert_for_noneleaf_node(entry, search_node)
935 
936  # Leaf is reached
937  else:
938  return self.__insert_for_leaf_node(entry, search_node)
939 
940 
941  def __insert_for_leaf_node(self, entry, search_node):
942  """!
943  @brief Recursive insert entry from leaf node to the tree.
944 
945  @param[in] entry (cfentry): Clustering feature.
946  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
947 
948  @return (bool) True if number of nodes at the below level is changed, otherwise False.
949 
950  """
951 
952  node_amount_updation = False
953 
954  # Try to absorb by the entity
955  index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement)
956  nearest_entry = search_node.entries[index_nearest_entry] # get nearest entry
957  merged_entry = nearest_entry + entry
958 
959  # Otherwise try to add new entry
960  if merged_entry.get_diameter() > self.__threshold:
961  # If it's not exceeded append entity and update feature of the leaf node.
962  search_node.insert_entry(entry)
963 
964  # Otherwise current node should be splitted
965  if len(search_node.entries) > self.__max_entries:
966  self.__split_procedure(search_node)
967  node_amount_updation = True
968 
969  # Update statistics
970  self.__amount_entries += 1
971 
972  else:
973  search_node.entries[index_nearest_entry] = merged_entry
974  search_node.feature += entry
975 
976  return node_amount_updation
977 
978 
979  def __insert_for_noneleaf_node(self, entry, search_node):
980  """!
981  @brief Recursive insert entry from none-leaf node to the tree.
982 
983  @param[in] entry (cfentry): Clustering feature.
984  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
985 
986  @return (bool) True if number of nodes at the below level is changed, otherwise False.
987 
988  """
989 
990  node_amount_updation = False
991 
992  min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement)
993  nearest_child_node = min(search_node.successors, key=min_key)
994 
995  child_node_updation = self.__recursive_insert(entry, nearest_child_node)
996 
997  # Update clustering feature of none-leaf node.
998  search_node.feature += entry
999 
1000  # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1001  if (len(search_node.successors) > self.__branch_factor):
1002 
1003  # Check if it's aleady root then new root should be created (height is increased in this case).
1004  if search_node is self.__root:
1005  self.__root = non_leaf_node(search_node.feature, None, [search_node])
1006  search_node.parent = self.__root
1007 
1008  # Update statistics
1009  self.__amount_nodes += 1
1010  self.__height += 1
1011 
1012  [new_node1, new_node2] = self.__split_nonleaf_node(search_node)
1013 
1014  # Update parent list of successors
1015  parent = search_node.parent
1016  parent.successors.remove(search_node)
1017  parent.successors.append(new_node1)
1018  parent.successors.append(new_node2)
1019 
1020  # Update statistics
1021  self.__amount_nodes += 1
1022  node_amount_updation = True
1023 
1024  elif child_node_updation is True:
1025  # Splitting has been finished, check for possibility to merge (at least we have already two children).
1026  if self.__merge_nearest_successors(search_node) is True:
1027  self.__amount_nodes -= 1
1028 
1029  return node_amount_updation
1030 
1031 
1032  def __merge_nearest_successors(self, node):
1033  """!
1034  @brief Find nearest sucessors and merge them.
1035 
1036  @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1037 
1038  @return (bool): True if merging has been successfully performed, otherwise False.
1039 
1040  """
1041 
1042  merging_result = False
1043 
1044  if node.successors[0].type == cfnode_type.CFNODE_NONLEAF:
1045  [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement)
1046 
1047  if len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor:
1048  node.successors.remove(nearest_child_node2)
1049  if nearest_child_node2.type == cfnode_type.CFNODE_LEAF:
1050  self.__leafes.remove(nearest_child_node2)
1051 
1052  nearest_child_node1.merge(nearest_child_node2)
1053 
1054  merging_result = True
1055 
1056  return merging_result
1057 
1058 
1059  def __split_procedure(self, split_node):
1060  """!
1061  @brief Starts node splitting procedure in the CF-tree from the specify node.
1062 
1063  @param[in] split_node (cfnode): CF-tree node that should be splitted.
1064 
1065  """
1066  if split_node is self.__root:
1067  self.__root = non_leaf_node(split_node.feature, None, [ split_node ])
1068  split_node.parent = self.__root
1069 
1070  # Update statistics
1071  self.__amount_nodes += 1
1072  self.__height += 1
1073 
1074  [new_node1, new_node2] = self.__split_leaf_node(split_node)
1075 
1076  self.__leafes.remove(split_node)
1077  self.__leafes.append(new_node1)
1078  self.__leafes.append(new_node2)
1079 
1080  # Update parent list of successors
1081  parent = split_node.parent
1082  parent.successors.remove(split_node)
1083  parent.successors.append(new_node1)
1084  parent.successors.append(new_node2)
1085 
1086  # Update statistics
1087  self.__amount_nodes += 1
1088 
1089 
1090  def __split_nonleaf_node(self, node):
1091  """!
1092  @brief Performs splitting of the specified non-leaf node.
1093 
1094  @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1095 
1096  @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1097 
1098  """
1099 
1100  [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement)
1101 
1102  # create new non-leaf nodes
1103  new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [farthest_node1])
1104  new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [farthest_node2])
1105 
1106  farthest_node1.parent = new_node1
1107  farthest_node2.parent = new_node2
1108 
1109  # re-insert other successors
1110  for successor in node.successors:
1111  if (successor is not farthest_node1) and (successor is not farthest_node2):
1112  distance1 = new_node1.get_distance(successor, self.__type_measurement)
1113  distance2 = new_node2.get_distance(successor, self.__type_measurement)
1114 
1115  if distance1 < distance2:
1116  new_node1.insert_successor(successor)
1117  else:
1118  new_node2.insert_successor(successor)
1119 
1120  return [new_node1, new_node2]
1121 
1122 
1123  def __split_leaf_node(self, node):
1124  """!
1125  @brief Performs splitting of the specified leaf node.
1126 
1127  @param[in] node (leaf_node): Leaf node that should be splitted.
1128 
1129  @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1130 
1131  @warning Splitted node is transformed to non_leaf.
1132 
1133  """
1134 
1135  # search farthest pair of entries
1136  [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement)
1137 
1138  # create new nodes
1139  new_node1 = leaf_node(farthest_entity1, node.parent, [farthest_entity1])
1140  new_node2 = leaf_node(farthest_entity2, node.parent, [farthest_entity2])
1141 
1142  # re-insert other entries
1143  for entity in node.entries:
1144  if (entity is not farthest_entity1) and (entity is not farthest_entity2):
1145  distance1 = new_node1.feature.get_distance(entity, self.__type_measurement)
1146  distance2 = new_node2.feature.get_distance(entity, self.__type_measurement)
1147 
1148  if distance1 < distance2:
1149  new_node1.insert_entry(entity)
1150  else:
1151  new_node2.insert_entry(entity)
1152 
1153  return [new_node1, new_node2]
def linear_sum(self)
Returns linear sum.
Definition: cftree.py:101
Represents clustering feature leaf node.
Definition: cftree.py:551
def get_distance(self, entry, type_measurement)
Calculates distance between two clusters in line with measurement type.
Definition: cftree.py:205
def insert_entry(self, entry)
Insert new clustering feature to the leaf node.
Definition: cftree.py:605
def get_farthest_successors(self, type_measurement)
Find pair of farthest successors of the node in line with measurement type.
Definition: cftree.py:493
def __init__(self, feature, parent, entries)
Create CF Leaf node.
Definition: cftree.py:566
def __init__(self, feature, parent)
Constructor of abstract CF node.
Definition: cftree.py:365
def __str__(self)
Default cfentry string representation.
Definition: cftree.py:158
def __add__(self, entry)
Overloaded operator add.
Definition: cftree.py:166
def __insert_for_noneleaf_node(self, entry, search_node)
Recursive insert entry from none-leaf node to the tree.
Definition: cftree.py:979
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __split_procedure(self, split_node)
Starts node splitting procedure in the CF-tree from the specify node.
Definition: cftree.py:1059
def __recursive_get_level_nodes(self, level, node)
Traverses CF-tree to obtain nodes at the specified level recursively.
Definition: cftree.py:835
def __get_average_inter_cluster_distance(self, entry)
Calculates average inter cluster distance between current and specified clusters. ...
Definition: cftree.py:300
def __split_nonleaf_node(self, node)
Performs splitting of the specified non-leaf node.
Definition: cftree.py:1090
def merge(self, node)
Merge non-leaf node to the current.
Definition: cftree.py:478
def insert_successor(self, successor)
Insert successor to the node.
Definition: cftree.py:464
feature
Clustering feature of the node.
Definition: cftree.py:375
Representation of clustering feature non-leaf node.
Definition: cftree.py:415
def __get_variance_increase_distance(self, entry)
Calculates variance increase distance between current and specified clusters.
Definition: cftree.py:335
def get_nearest_index_entry(self, entry, type_measurement)
Find nearest index of nearest entry of node for the specified entry.
Definition: cftree.py:661
def merge(self, node)
Merge leaf node to the current.
Definition: cftree.py:617
def __init__(self, feature, parent, successors)
Create CF Non-leaf node.
Definition: cftree.py:430
def get_diameter(self)
Calculates diameter of cluster that is represented by the entry.
Definition: cftree.py:278
def __insert_for_leaf_node(self, entry, search_node)
Recursive insert entry from leaf node to the tree.
Definition: cftree.py:941
def insert(self, entry)
Insert clustering feature to the tree.
Definition: cftree.py:869
def find_nearest_leaf(self, entry, search_node=None)
Search nearest leaf to the specified clustering feature.
Definition: cftree.py:895
def __merge_nearest_successors(self, node)
Find nearest sucessors and merge them.
Definition: cftree.py:1032
CF-Tree representation.
Definition: cftree.py:699
def __eq__(self, entry)
Overloaded operator eq.
Definition: cftree.py:183
def get_radius(self)
Calculates radius of cluster that is represented by the entry.
Definition: cftree.py:255
def get_farthest_entries(self, type_measurement)
Find pair of farthest entries of the node.
Definition: cftree.py:632
def get_nearest_entry(self, entry, type_measurement)
Find nearest entry of node for the specified entry.
Definition: cftree.py:684
def square_sum(self)
Returns square sum.
Definition: cftree.py:112
parent
Pointer to the parent node (None for root).
Definition: cftree.py:378
def __init__(self, number_points, linear_sum, square_sum)
CF-entry constructor.
Definition: cftree.py:122
def get_distance(self, node, type_measurement)
Calculates distance between nodes in line with specified type measurement.
Definition: cftree.py:401
Enumeration of measurement types for CF-Tree.
Definition: cftree.py:38
type
Type node (leaf or non-leaf).
Definition: cftree.py:381
Clustering feature representation.
Definition: cftree.py:81
Representation of node of CF-Tree.
Definition: cftree.py:359
def number_points(self)
Returns number of points that are encoded.
Definition: cftree.py:91
def __init__(self, branch_factor, max_entries, threshold, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE)
Create CF-tree.
Definition: cftree.py:788
Enumeration of CF-Node types that are used by CF-Tree.
Definition: cftree.py:62
def __recursive_insert(self, entry, search_node)
Recursive insert of the entry to the tree.
Definition: cftree.py:920
def get_centroid(self)
Calculates centroid of cluster that is represented by the entry.
Definition: cftree.py:239
def __split_leaf_node(self, node)
Performs splitting of the specified leaf node.
Definition: cftree.py:1123
def insert_point(self, point)
Insert point that is represented by list of coordinates.
Definition: cftree.py:857
def get_level_nodes(self, level)
Traverses CF-tree to obtain nodes at the specified level.
Definition: cftree.py:818
def get_nearest_successors(self, type_measurement)
Find pair of nearest successors of the node in line with measurement type.
Definition: cftree.py:522
def __get_average_intra_cluster_distance(self, entry)
Calculates average intra cluster distance between current and specified clusters. ...
Definition: cftree.py:315