3 @brief Data Structure: CF-Tree
4 @details Implementation based on paper @cite article::birch::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
20 from enum
import IntEnum
25 @brief Enumeration of measurement types for CF-Tree.
32 CENTROID_EUCLIDEAN_DISTANCE = 0
35 CENTROID_MANHATTAN_DISTANCE = 1
38 AVERAGE_INTER_CLUSTER_DISTANCE = 2
41 AVERAGE_INTRA_CLUSTER_DISTANCE = 3
44 VARIANCE_INCREASE_DISTANCE = 4
49 @brief Enumeration of CF-Node types that are used by CF-Tree.
68 @brief Clustering feature representation.
78 @brief Returns number of points that are encoded.
80 @return (uint) Number of encoded points.
88 @brief Returns linear sum.
90 @return (list) Linear sum.
99 @brief Returns square sum.
101 @return (double) Square sum.
107 def __init__(self, number_points, linear_sum, square_sum):
109 @brief CF-entry constructor.
111 @param[in] number_points (uint): Number of objects that is represented by the entry.
112 @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
113 @param[in] square_sum (double): Square sum of values that represent objects.
128 @returns (cfentry) Makes copy of the CF-entry instance.
136 @return (string) Returns CF-entry representation.
139 return "CF-Entry (N: '%s', LS: '%s', D: '%s')" % \
145 @brief Default cfentry string representation.
153 @brief Overloaded operator add. Performs addition of two clustering features.
155 @param[in] entry (cfentry): Entry that is added to the current.
157 @return (cfentry) Result of addition of two clustering features.
162 result_linear_sum = numpy.add(self.
linear_sum, entry.linear_sum)
163 result_square_sum = self.
square_sum + entry.square_sum
165 return cfentry(number_points, result_linear_sum, result_square_sum)
170 @brief Overloaded operator eq.
171 @details Performs comparison of two clustering features.
173 @param[in] entry (cfentry): Entry that is used for comparison with current.
175 @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
182 result &= ((self.
square_sum + tolerance > entry.square_sum)
and (self.
square_sum - tolerance < entry.square_sum))
184 for index_dimension
in range(0, len(self.
linear_sum)):
185 result &= ((self.
linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension])
and (self.
linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]))
192 @brief Calculates distance between two clusters in line with measurement type.
194 @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
195 Square root should be taken from the result for obtaining real euclidian distance between
198 @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
199 @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
201 @return (double) Distance between two clusters.
205 if type_measurement
is measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
206 return euclidean_distance_square(entry.get_centroid(), self.
get_centroid())
208 elif type_measurement
is measurement_type.CENTROID_MANHATTAN_DISTANCE:
209 return manhattan_distance(entry.get_centroid(), self.
get_centroid())
211 elif type_measurement
is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
214 elif type_measurement
is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
217 elif type_measurement
is measurement_type.VARIANCE_INCREASE_DISTANCE:
221 raise ValueError(
"Unsupported type of measurement '%s' is specified." % type_measurement)
226 @brief Calculates centroid of cluster that is represented by the entry.
227 @details It's calculated once when it's requested after the last changes.
229 @return (array_like) Centroid of cluster that is represented by the entry.
242 @brief Calculates radius of cluster that is represented by the entry.
243 @details It's calculated once when it's requested after the last changes.
245 @return (double) Radius of cluster that is represented by the entry.
256 radius_part_2 = 2.0 * numpy.dot(self.
linear_sum, centroid)
257 radius_part_3 = N * numpy.dot(centroid, centroid)
259 self.
__radius = ((1.0 / N) * (radius_part_1 - radius_part_2 + radius_part_3)) ** 0.5
265 @brief Calculates diameter of cluster that is represented by the entry.
266 @details It's calculated once when it's requested after the last changes.
268 @return (double) Diameter of cluster that is represented by the entry.
277 if diameter_part < 0.000000001:
285 def __get_average_inter_cluster_distance(self, entry):
287 @brief Calculates average inter cluster distance between current and specified clusters.
289 @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
291 @return (double) Average inter cluster distance.
295 linear_part_distance = numpy.dot(self.
linear_sum, entry.linear_sum)
300 def __get_average_intra_cluster_distance(self, entry):
302 @brief Calculates average intra cluster distance between current and specified clusters.
304 @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
306 @return (double) Average intra cluster distance.
310 linear_part_first = numpy.add(self.
linear_sum, entry.linear_sum)
311 linear_part_second = linear_part_first
313 linear_part_distance = numpy.dot(linear_part_first, linear_part_second)
315 general_part_distance = 2.0 * (self.
number_points + entry.number_points) * (self.
square_sum + entry.square_sum) - 2.0 * linear_part_distance
317 return (general_part_distance / ((self.
number_points + entry.number_points) * (self.
number_points + entry.number_points - 1.0))) ** 0.5
320 def __get_variance_increase_distance(self, entry):
322 @brief Calculates variance increase distance between current and specified clusters.
324 @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
326 @return (double) Variance increase distance.
330 linear_part_12 = numpy.add(self.
linear_sum, entry.linear_sum)
331 variance_part_first = (self.
square_sum + entry.square_sum) - \
332 2.0 * numpy.dot(linear_part_12, linear_part_12) / (self.
number_points + entry.number_points) + \
333 (self.
number_points + entry.number_points) * numpy.dot(linear_part_12, linear_part_12) / (self.
number_points + entry.number_points)**2.0
338 linear_part_22 = numpy.dot(entry.linear_sum, entry.linear_sum)
339 variance_part_third = -(entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22)
341 return variance_part_first + variance_part_second + variance_part_third
346 @brief Representation of node of CF-Tree.
352 @brief Constructor of abstract CF node.
354 @param[in] feature (cfentry): Clustering feature of the created node.
355 @param[in] parent (cfnode): Parent of the created node.
366 self.
type = cfnode_type.CFNODE_DUMMY
371 @return (string) Default representation of CF node.
375 return 'CF node %s, parent %s, feature %s' % (hex(id(self)), self.
parent, self.
feature)
380 @return (string) String representation of CF node.
388 @brief Calculates distance between nodes in line with specified type measurement.
390 @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
391 @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
393 @return (double) Distance between two nodes.
402 @brief Representation of clustering feature non-leaf node.
409 @return (list) List of successors of the node.
417 @brief Create CF Non-leaf node.
419 @param[in] feature (cfentry): Clustering feature of the created node.
420 @param[in] parent (non_leaf_node): Parent of the created node.
421 @param[in] successors (list): List of successors of the node.
428 self.
type = cfnode_type.CFNODE_NONLEAF
435 @return (string) Representation of non-leaf node representation.
438 return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % (hex(id(self)), self.
parent, self.
feature, len(self.
successors))
443 @return (string) String non-leaf representation.
451 @brief Insert successor to the node.
453 @param[in] successor (cfnode): Successor for adding.
457 self.
feature += successor.feature
460 successor.parent = self
465 @brief Merge non-leaf node to the current.
467 @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
473 for child
in node.successors:
480 @brief Find pair of farthest successors of the node in line with measurement type.
482 @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
484 @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
488 farthest_node1 =
None
489 farthest_node2 =
None
490 farthest_distance = 0.0
497 candidate_distance = candidate1.get_distance(candidate2, type_measurement)
499 if candidate_distance > farthest_distance:
500 farthest_distance = candidate_distance
501 farthest_node1 = candidate1
502 farthest_node2 = candidate2
504 return [farthest_node1, farthest_node2]
509 @brief Find pair of nearest successors of the node in line with measurement type.
511 @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
513 @return (list) Pair of nearest successors represented by list.
519 nearest_distance = float(
"Inf")
526 candidate_distance = candidate1.get_distance(candidate2, type_measurement)
528 if candidate_distance < nearest_distance:
529 nearest_distance = candidate_distance
530 nearest_node1 = candidate1
531 nearest_node2 = candidate2
533 return [nearest_node1, nearest_node2]
538 @brief Represents clustering feature leaf node.
545 @return (list) List of leaf nodes.
553 @brief Create CF Leaf node.
555 @param[in] feature (cfentry): Clustering feature of the created node.
556 @param[in] parent (non_leaf_node): Parent of the created node.
557 @param[in] entries (list): List of entries of the node.
564 self.
type = cfnode_type.CFNODE_LEAF
571 @return (string) Default leaf node represenation.
576 text_entries +=
"\t" + str(entry) +
"\n"
578 return "Leaf-node: '%s', parent: '%s', feature: '%s', entries: '%d'" % \
584 @return (string) String leaf node representation.
592 @brief Insert new clustering feature to the leaf node.
594 @param[in] entry (cfentry): Clustering feature.
604 @brief Merge leaf node to the current.
606 @param[in] node (leaf_node): Leaf node that should be merged with current.
613 for entry
in node.entries:
619 @brief Find pair of farthest entries of the node.
621 @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
623 @return (list) Pair of farthest entries of the node that are represented by list.
627 farthest_entity1 =
None
628 farthest_entity2 =
None
629 farthest_distance = 0
631 for i
in range(0, len(self.
entries)):
634 for j
in range(i + 1, len(self.
entries)):
636 candidate_distance = candidate1.get_distance(candidate2, type_measurement)
638 if candidate_distance > farthest_distance:
639 farthest_distance = candidate_distance
640 farthest_entity1 = candidate1
641 farthest_entity2 = candidate2
643 return [farthest_entity1, farthest_entity2]
648 @brief Find nearest index of nearest entry of node for the specified entry.
650 @param[in] entry (cfentry): Entry that is used for calculation distance.
651 @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
653 @return (uint) Index of nearest entry of node for the specified entry.
657 minimum_distance = float(
'Inf')
660 for candidate_index
in range(0, len(self.
entries)):
662 if candidate_distance < minimum_distance:
663 minimum_distance = candidate_distance
664 nearest_index = candidate_index
671 @brief Find nearest entry of node for the specified entry.
673 @param[in] entry (cfentry): Entry that is used for calculation distance.
674 @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
676 @return (cfentry) Nearest entry of node for the specified entry.
680 min_key =
lambda cur_entity: cur_entity.get_distance(entry, type_measurement)
681 return min(self.
entries, key=min_key)
686 @brief CF-Tree representation.
687 @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
694 @return (cfnode) Root of the tree.
703 @return (list) List of all leaf nodes in the tree.
712 @return (unit) Number of nodes (leaf and non-leaf) in the tree.
721 @return (uint) Number of entries in the tree.
730 @return (uint) Height of the tree.
739 @return (uint) Branching factor of the tree.
740 @details Branching factor defines maximum number of successors in each non-leaf node.
749 @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
758 @return (uint) Maximum number of entries in each leaf node.
767 @return (measurement_type) Type that is used for measuring.
773 def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
775 @brief Create CF-tree.
777 @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
778 @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
779 @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
780 @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
805 @brief Traverses CF-tree to obtain nodes at the specified level.
807 @param[in] level (uint): CF-tree level from that nodes should be returned.
809 @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
820 def __recursive_get_level_nodes(self, level, node):
822 @brief Traverses CF-tree to obtain nodes at the specified level recursively.
824 @param[in] level (uint): Current CF-tree level.
825 @param[in] node (cfnode): CF-node from that traversing is performed.
827 @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
833 level_nodes.append(node)
836 for sucessor
in node.successors:
844 @brief Insert point that is represented by list of coordinates.
846 @param[in] point (list): Point represented by list of coordinates that should be inserted to CF tree.
850 entry =
cfentry(len([point]), linear_sum([point]), square_sum([point]))
856 @brief Insert clustering feature to the tree.
858 @param[in] entry (cfentry): Clustering feature that should be inserted.
874 if child_node_updation
is True:
882 @brief Search nearest leaf to the specified clustering feature.
884 @param[in] entry (cfentry): Clustering feature.
885 @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
887 @return (leaf_node) Nearest node to the specified clustering feature.
891 if search_node
is None:
894 nearest_node = search_node
896 if search_node.type == cfnode_type.CFNODE_NONLEAF:
897 min_key =
lambda child_node: child_node.feature.get_distance(entry, self.
__type_measurement)
898 nearest_child_node = min(search_node.successors, key = min_key)
905 def __recursive_insert(self, entry, search_node):
907 @brief Recursive insert of the entry to the tree.
908 @details It performs all required procedures during insertion such as splitting, merging.
910 @param[in] entry (cfentry): Clustering feature.
911 @param[in] search_node (cfnode): Node from that insertion should be started.
913 @return (bool) True if number of nodes at the below level is changed, otherwise False.
918 if search_node.type == cfnode_type.CFNODE_NONLEAF:
926 def __insert_for_leaf_node(self, entry, search_node):
928 @brief Recursive insert entry from leaf node to the tree.
930 @param[in] entry (cfentry): Clustering feature.
931 @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
933 @return (bool) True if number of nodes at the below level is changed, otherwise False.
937 node_amount_updation =
False
940 index_nearest_entry = search_node.get_nearest_index_entry(entry, self.
__type_measurement)
941 nearest_entry = search_node.entries[index_nearest_entry]
942 merged_entry = nearest_entry + entry
947 search_node.insert_entry(entry)
952 node_amount_updation =
True
958 search_node.entries[index_nearest_entry] = merged_entry
959 search_node.feature += entry
961 return node_amount_updation
964 def __insert_for_noneleaf_node(self, entry, search_node):
966 @brief Recursive insert entry from none-leaf node to the tree.
968 @param[in] entry (cfentry): Clustering feature.
969 @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
971 @return (bool) True if number of nodes at the below level is changed, otherwise False.
975 node_amount_updation =
False
977 min_key =
lambda child_node: child_node.get_distance(search_node, self.
__type_measurement)
978 nearest_child_node = min(search_node.successors, key=min_key)
983 search_node.feature += entry
989 if search_node
is self.
__root:
991 search_node.parent = self.
__root
1000 parent = search_node.parent
1001 parent.successors.remove(search_node)
1002 parent.successors.append(new_node1)
1003 parent.successors.append(new_node2)
1007 node_amount_updation =
True
1009 elif child_node_updation
is True:
1014 return node_amount_updation
1017 def __merge_nearest_successors(self, node):
1019 @brief Find nearest sucessors and merge them.
1021 @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1023 @return (bool): True if merging has been successfully performed, otherwise False.
1027 merging_result =
False
1029 if node.successors[0].type == cfnode_type.CFNODE_NONLEAF:
1030 [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.
__type_measurement)
1032 if len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.
__branch_factor:
1033 node.successors.remove(nearest_child_node2)
1034 if nearest_child_node2.type == cfnode_type.CFNODE_LEAF:
1035 self.
__leafes.remove(nearest_child_node2)
1037 nearest_child_node1.merge(nearest_child_node2)
1039 merging_result =
True
1041 return merging_result
1044 def __split_procedure(self, split_node):
1046 @brief Starts node splitting procedure in the CF-tree from the specify node.
1048 @param[in] split_node (cfnode): CF-tree node that should be splitted.
1051 if split_node
is self.
__root:
1053 split_node.parent = self.
__root
1066 parent = split_node.parent
1067 parent.successors.remove(split_node)
1068 parent.successors.append(new_node1)
1069 parent.successors.append(new_node2)
1075 def __split_nonleaf_node(self, node):
1077 @brief Performs splitting of the specified non-leaf node.
1079 @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1081 @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1085 [farthest_node1, farthest_node2] = node.get_farthest_successors(self.
__type_measurement)
1088 new_node1 =
non_leaf_node(farthest_node1.feature, node.parent, [farthest_node1])
1089 new_node2 =
non_leaf_node(farthest_node2.feature, node.parent, [farthest_node2])
1091 farthest_node1.parent = new_node1
1092 farthest_node2.parent = new_node2
1095 for successor
in node.successors:
1096 if (successor
is not farthest_node1)
and (successor
is not farthest_node2):
1100 if distance1 < distance2:
1101 new_node1.insert_successor(successor)
1103 new_node2.insert_successor(successor)
1105 return [new_node1, new_node2]
1108 def __split_leaf_node(self, node):
1110 @brief Performs splitting of the specified leaf node.
1112 @param[in] node (leaf_node): Leaf node that should be splitted.
1114 @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1116 @warning Splitted node is transformed to non_leaf.
1121 [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.
__type_measurement)
1124 new_node1 =
leaf_node(farthest_entity1, node.parent, [farthest_entity1])
1125 new_node2 =
leaf_node(farthest_entity2, node.parent, [farthest_entity2])
1128 for entity
in node.entries:
1129 if (entity
is not farthest_entity1)
and (entity
is not farthest_entity2):
1133 if distance1 < distance2:
1134 new_node1.insert_entry(entity)
1136 new_node2.insert_entry(entity)
1138 return [new_node1, new_node2]