cftree.py
1 """!
2 
3 @brief Data Structure: CF-Tree
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 from copy import copy
28 
29 from pyclustering.cluster import cluster_visualizer
30 
31 from pyclustering.utils import euclidean_distance_square
32 from pyclustering.utils import manhattan_distance
33 from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication
34 from pyclustering.utils import linear_sum, square_sum
35 
36 from enum import IntEnum
37 
38 
39 class measurement_type(IntEnum):
40  """!
41  @brief Enumeration of measurement types for CF-Tree.
42 
43  @see cftree
44 
45  """
46 
47 
48  CENTROID_EUCLIDEAN_DISTANCE = 0
49 
50 
51  CENTROID_MANHATTAN_DISTANCE = 1
52 
53 
54  AVERAGE_INTER_CLUSTER_DISTANCE = 2
55 
56 
57  AVERAGE_INTRA_CLUSTER_DISTANCE = 3
58 
59 
60  VARIANCE_INCREASE_DISTANCE = 4
61 
62 
63 class cfnode_type(IntEnum):
64  """!
65  @brief Enumeration of CF-Node types that are used by CF-Tree.
66 
67  @see cfnode
68  @see cftree
69 
70  """
71 
72 
73  CFNODE_DUMMY = 0
74 
75 
76  CFNODE_LEAF = 1
77 
78 
79  CFNODE_NONLEAF = 2
80 
81 
82 class cfentry:
83  """!
84  @brief Clustering feature representation.
85 
86  @see cfnode
87  @see cftree
88 
89  """
90 
91  @property
92  def number_points(self):
93  """!
94  @brief Returns number of points that are encoded.
95 
96  @return (uint) Number of encoded points.
97 
98  """
99  return self.__number_points
100 
101  @property
102  def linear_sum(self):
103  """!
104  @brief Returns linear sum.
105 
106  @return (list) Linear sum.
107 
108  """
109 
110  return self.__linear_sum
111 
112  @property
113  def square_sum(self):
114  """!
115  @brief Returns square sum.
116 
117  @return (double) Square sum.
118 
119  """
120  return self.__square_sum
121 
122 
123  def __init__(self, number_points, linear_sum, square_sum):
124  """!
125  @brief CF-entry constructor.
126 
127  @param[in] number_points (uint): Number of objects that is represented by the entry.
128  @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
129  @param[in] square_sum (double): Square sum of values that represent objects.
130 
131  """
132 
133  self.__number_points = number_points
134  self.__linear_sum = linear_sum
135  self.__square_sum = square_sum
136 
137  self.__centroid = None
138  self.__radius = None
139  self.__diameter = None
140 
141 
142  def __copy__(self):
143  """!
144  @returns (cfentry) Makes copy of the cfentry instance.
145 
146  """
147  return cfentry(self.__number_points, self.__linear_sum, self.__square_sum)
148 
149 
150  def __repr__(self):
151  """!
152  @return (string) Default cfentry representation.
153 
154  """
155  return 'CF (%s, %s, %0.2f) [%s]' % (self.number_points, self.linear_sum, self.__square_sum, hex(id(self)))
156 
157 
158  def __str__(self):
159  """!
160  @brief Default cfentry string representation.
161 
162  """
163  return self.__repr__()
164 
165 
166  def __add__(self, entry):
167  """!
168  @brief Overloaded operator add. Performs addition of two clustering features.
169 
170  @param[in] entry (cfentry): Entry that is added to the current.
171 
172  @return (cfentry) Result of addition of two clustering features.
173 
174  """
175 
176  number_points = self.number_points + entry.number_points
177  result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum)
178  result_square_sum = self.square_sum + entry.square_sum
179 
180  return cfentry(number_points, result_linear_sum, result_square_sum)
181 
182 
183  def __sub__(self, entry):
184  """!
185  @brief Overloaded operator sub. Performs substraction of two clustering features.
186  @details Substraction can't be performed with clustering feature whose description is less then substractor.
187 
188  @param[in] entry (cfentry): Entry that is substracted from the current.
189 
190  @return (cfentry) Result of substraction of two clustering features.
191 
192  """
193 
194  number_points = self.number_points - entry.number_points
195  result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum)
196  result_square_sum = self.square_sum - entry.square_sum
197 
198  if (number_points < 0) or (result_square_sum < 0):
199  raise NameError("Substruction with negative result is not possible for clustering features.")
200 
201  return cfentry(number_points, result_linear_sum, result_square_sum)
202 
203 
204  def __eq__(self, entry):
205  """!
206  @brief Overloaded operator eq.
207  @details Performs comparison of two clustering features.
208 
209  @param[in] entry (cfentry): Entry that is used for comparison with current.
210 
211  @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
212 
213  """
214 
215  tolerance = 0.00001
216 
217  result = (self.__number_points == entry.number_points)
218  result &= ((self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum))
219 
220  for index_dimension in range(0, len(self.linear_sum)):
221  result &= ((self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]))
222 
223  return result
224 
225 
226  def get_distance(self, entry, type_measurement):
227  """!
228  @brief Calculates distance between two clusters in line with measurement type.
229 
230  @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
231  Square root should be taken from the result for obtaining real euclidian distance between
232  entries.
233 
234  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
235  @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
236 
237  @return (double) Distance between two clusters.
238 
239  """
240 
241  if type_measurement is measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
242  return euclidean_distance_square(entry.get_centroid(), self.get_centroid())
243 
244  elif type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE:
245  return manhattan_distance(entry.get_centroid(), self.get_centroid())
246 
247  elif type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
248  return self.__get_average_inter_cluster_distance(entry)
249 
250  elif type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
251  return self.__get_average_intra_cluster_distance(entry)
252 
253  elif type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE:
254  return self.__get_variance_increase_distance(entry)
255 
256  else:
257  raise ValueError("Unsupported type of measurement '%s' is specified." % type_measurement)
258 
259 
260  def get_centroid(self):
261  """!
262  @brief Calculates centroid of cluster that is represented by the entry.
263  @details It's calculated once when it's requested after the last changes.
264 
265  @return (list) Centroid of cluster that is represented by the entry.
266 
267  """
268 
269  if self.__centroid is not None:
270  return self.__centroid
271 
272  self.__centroid = [0] * len(self.linear_sum)
273  for index_dimension in range(0, len(self.linear_sum)):
274  self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points
275 
276  return self.__centroid
277 
278 
279  def get_radius(self):
280  """!
281  @brief Calculates radius of cluster that is represented by the entry.
282  @details It's calculated once when it's requested after the last changes.
283 
284  @return (double) Radius of cluster that is represented by the entry.
285 
286  """
287 
288  if self.__radius is not None:
289  return self.__radius
290 
291  centroid = self.get_centroid()
292 
293  radius_part_1 = self.square_sum
294 
295  if type(centroid) == list:
296  radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid))
297  radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid))
298  else:
299  radius_part_2 = 2.0 * self.linear_sum * centroid
300  radius_part_3 = self.number_points * centroid * centroid
301 
302  self.__radius = ((1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3)) ** 0.5
303  return self.__radius
304 
305 
306  def get_diameter(self):
307  """!
308  @brief Calculates diameter of cluster that is represented by the entry.
309  @details It's calculated once when it's requested after the last changes.
310 
311  @return (double) Diameter of cluster that is represented by the entry.
312 
313  """
314 
315  if self.__diameter is not None:
316  return self.__diameter
317 
318  if type(self.linear_sum) == list:
319  diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points
320  else:
321  diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points
322 
323  self.__diameter = (diameter_part / (self.number_points * (self.number_points - 1))) ** 0.5
324  return self.__diameter
325 
326 
327  def __get_average_inter_cluster_distance(self, entry):
328  """!
329  @brief Calculates average inter cluster distance between current and specified clusters.
330 
331  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
332 
333  @return (double) Average inter cluster distance.
334 
335  """
336 
337  linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum))
338 
339  return ((entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points)) ** 0.5
340 
341 
342  def __get_average_intra_cluster_distance(self, entry):
343  """!
344  @brief Calculates average intra cluster distance between current and specified clusters.
345 
346  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
347 
348  @return (double) Average intra cluster distance.
349 
350  """
351 
352  linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum)
353  linear_part_second = linear_part_first
354 
355  linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second))
356 
357  general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance
358 
359  return (general_part_distance / ((self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0))) ** 0.5
360 
361 
362  def __get_variance_increase_distance(self, entry):
363  """!
364  @brief Calculates variance increase distance between current and specified clusters.
365 
366  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
367 
368  @return (double) Variance increase distance.
369 
370  """
371 
372  linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum)
373  variance_part_first = (self.square_sum + entry.square_sum) - \
374  2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
375  (self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0
376 
377  linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum))
378  variance_part_second = -(self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points))
379 
380  linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum))
381  variance_part_third = -(entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22)
382 
383  return variance_part_first + variance_part_second + variance_part_third
384 
385 
386 class cfnode:
387  """!
388  @brief Representation of node of CF-Tree.
389 
390  """
391 
392  def __init__(self, feature, parent, payload):
393  """!
394  @brief Constructor of abstract CF node.
395 
396  @param[in] feature (cfentry): Clustering feature of the created node.
397  @param[in] parent (cfnode): Parent of the created node.
398  @param[in] payload (*): Data that is stored by the node.
399 
400  """
401 
402 
403  self.feature = copy(feature)
404 
405 
406  self.parent = parent
407 
408 
409  self.type = cfnode_type.CFNODE_DUMMY
410 
411 
412  self.payload = payload
413 
414 
415  def __repr__(self):
416  """!
417  @return (string) Default representation of CF node.
418 
419  """
420 
421  return 'CF node %s, parent %s, feature %s' % (hex(id(self)), self.parent, self.feature)
422 
423 
424  def __str__(self):
425  """!
426  @return (string) String representation of CF node.
427 
428  """
429  return self.__repr__()
430 
431 
432  def get_distance(self, node, type_measurement):
433  """!
434  @brief Calculates distance between nodes in line with specified type measurement.
435 
436  @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
437  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
438 
439  @return (double) Distance between two nodes.
440 
441  """
442 
443  return self.feature.get_distance(node.feature, type_measurement)
444 
445 
447  """!
448  @brief Representation of clustering feature non-leaf node.
449 
450  """
451 
452  @property
453  def successors(self):
454  """!
455  @return (list) List of successors of the node.
456 
457  """
458  return self.__successors
459 
460 
461  def __init__(self, feature, parent, successors, payload):
462  """!
463  @brief Create CF Non-leaf node.
464 
465  @param[in] feature (cfentry): Clustering feature of the created node.
466  @param[in] parent (non_leaf_node): Parent of the created node.
467  @param[in] successors (list): List of successors of the node.
468  @param[in] payload (*): Data that is stored by the node.
469 
470  """
471 
472  super().__init__(feature, parent, payload)
473 
474 
475  self.type = cfnode_type.CFNODE_NONLEAF
476 
477  self.__successors = successors
478 
479 
480  def __repr__(self):
481  """!
482  @return (string) Representation of non-leaf node representation.
483 
484  """
485  return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % (hex(id(self)), self.parent, self.feature, len(self.successors))
486 
487 
488  def __str__(self):
489  """!
490  @return (string) String non-leaf representation.
491 
492  """
493  return self.__repr__()
494 
495 
496  def insert_successor(self, successor):
497  """!
498  @brief Insert successor to the node.
499 
500  @param[in] successor (cfnode): Successor for adding.
501 
502  """
503 
504  self.feature += successor.feature
505  self.successors.append(successor)
506 
507  successor.parent = self
508 
509 
510  def remove_successor(self, successor):
511  """!
512  @brief Remove successor from the node.
513 
514  @param[in] successor (cfnode): Successor for removing.
515 
516  """
517 
518  self.feature -= successor.feature
519  self.successors.append(successor)
520 
521  successor.parent = self
522 
523 
524  def merge(self, node):
525  """!
526  @brief Merge non-leaf node to the current.
527 
528  @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
529 
530  """
531 
532  self.feature += node.feature
533 
534  for child in node.successors:
535  child.parent = self
536  self.successors.append(child)
537 
538 
539  def get_farthest_successors(self, type_measurement):
540  """!
541  @brief Find pair of farthest successors of the node in line with measurement type.
542 
543  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
544 
545  @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
546 
547  """
548 
549  farthest_node1 = None
550  farthest_node2 = None
551  farthest_distance = 0.0
552 
553  for i in range(0, len(self.successors)):
554  candidate1 = self.successors[i]
555 
556  for j in range(i + 1, len(self.successors)):
557  candidate2 = self.successors[j]
558  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
559 
560  if candidate_distance > farthest_distance:
561  farthest_distance = candidate_distance
562  farthest_node1 = candidate1
563  farthest_node2 = candidate2
564 
565  return [farthest_node1, farthest_node2]
566 
567 
568  def get_nearest_successors(self, type_measurement):
569  """!
570  @brief Find pair of nearest successors of the node in line with measurement type.
571 
572  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
573 
574  @return (list) Pair of nearest successors represented by list.
575 
576  """
577 
578  nearest_node1 = None
579  nearest_node2 = None
580  nearest_distance = float("Inf")
581 
582  for i in range(0, len(self.successors)):
583  candidate1 = self.successors[i]
584 
585  for j in range(i + 1, len(self.successors)):
586  candidate2 = self.successors[j]
587  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
588 
589  if candidate_distance < nearest_distance:
590  nearest_distance = candidate_distance
591  nearest_node1 = candidate1
592  nearest_node2 = candidate2
593 
594  return [nearest_node1, nearest_node2]
595 
596 
598  """!
599  @brief Represents clustering feature leaf node.
600 
601  """
602 
603  @property
604  def entries(self):
605  """!
606  @return (list) List of leaf nodes.
607 
608  """
609  return self.__entries
610 
611 
612  def __init__(self, feature, parent, entries, payload):
613  """!
614  @brief Create CF Leaf node.
615 
616  @param[in] feature (cfentry): Clustering feature of the created node.
617  @param[in] parent (non_leaf_node): Parent of the created node.
618  @param[in] entries (list): List of entries of the node.
619  @param[in] payload (*): Data that is stored by the node.
620 
621  """
622 
623  super().__init__(feature, parent, payload)
624 
625 
626  self.type = cfnode_type.CFNODE_LEAF
627 
628  self.__entries = entries # list of clustering features
629 
630 
631  def __repr__(self):
632  """!
633  @return (string) Default leaf node represenation.
634 
635  """
636  text_entries = "\n"
637  for entry in self.entries:
638  text_entries += "\t" + str(entry) + "\n"
639 
640  return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % (hex(id(self)), self.parent, self.feature, len(self.entries), text_entries)
641 
642 
643  def __str__(self):
644  """!
645  @return (string) String leaf node representation.
646 
647  """
648  return self.__repr__()
649 
650 
651  def insert_entry(self, entry):
652  """!
653  @brief Insert new clustering feature to the leaf node.
654 
655  @param[in] entry (cfentry): Clustering feature.
656 
657  """
658 
659  self.feature += entry
660  self.entries.append(entry)
661 
662 
663  def remove_entry(self, entry):
664  """!
665  @brief Remove clustering feature from the leaf node.
666 
667  @param[in] entry (cfentry): Clustering feature.
668 
669  """
670 
671  self.feature -= entry
672  self.entries.remove(entry)
673 
674 
675  def merge(self, node):
676  """!
677  @brief Merge leaf node to the current.
678 
679  @param[in] node (leaf_node): Leaf node that should be merged with current.
680 
681  """
682 
683  self.feature += node.feature
684 
685  # Move entries from merged node
686  for entry in node.entries:
687  self.entries.append(entry)
688 
689 
690  def get_farthest_entries(self, type_measurement):
691  """!
692  @brief Find pair of farthest entries of the node.
693 
694  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
695 
696  @return (list) Pair of farthest entries of the node that are represented by list.
697 
698  """
699 
700  farthest_entity1 = None
701  farthest_entity2 = None
702  farthest_distance = 0
703 
704  for i in range(0, len(self.entries)):
705  candidate1 = self.entries[i]
706 
707  for j in range(i + 1, len(self.entries)):
708  candidate2 = self.entries[j]
709  candidate_distance = candidate1.get_distance(candidate2, type_measurement)
710 
711  if candidate_distance > farthest_distance:
712  farthest_distance = candidate_distance
713  farthest_entity1 = candidate1
714  farthest_entity2 = candidate2
715 
716  return [farthest_entity1, farthest_entity2]
717 
718 
719  def get_nearest_index_entry(self, entry, type_measurement):
720  """!
721  @brief Find nearest index of nearest entry of node for the specified entry.
722 
723  @param[in] entry (cfentry): Entry that is used for calculation distance.
724  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
725 
726  @return (uint) Index of nearest entry of node for the specified entry.
727 
728  """
729 
730  minimum_distance = float('Inf')
731  nearest_index = 0
732 
733  for candidate_index in range(0, len(self.entries)):
734  candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement)
735  if candidate_distance < minimum_distance:
736  nearest_index = candidate_index
737 
738  return nearest_index
739 
740 
741  def get_nearest_entry(self, entry, type_measurement):
742  """!
743  @brief Find nearest entry of node for the specified entry.
744 
745  @param[in] entry (cfentry): Entry that is used for calculation distance.
746  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
747 
748  @return (cfentry) Nearest entry of node for the specified entry.
749 
750  """
751 
752  min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement)
753  return min(self.__entries, key=min_key)
754 
755 
756 class cftree:
757  """!
758  @brief CF-Tree representation.
759  @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
760 
761  """
762 
763  @property
764  def root(self):
765  """!
766  @return (cfnode) Root of the tree.
767 
768  """
769  return self.__root
770 
771 
772  @property
773  def leafes(self):
774  """!
775  @return (list) List of all non-leaf nodes in the tree.
776 
777  """
778  return self.__leafes
779 
780 
781  @property
782  def amount_nodes(self):
783  """!
784  @return (unit) Number of nodes (leaf and non-leaf) in the tree.
785 
786  """
787  return self.__amount_nodes
788 
789 
790  @property
791  def amount_entries(self):
792  """!
793  @return (uint) Number of entries in the tree.
794 
795  """
796  return self.__amount_entries
797 
798 
799  @property
800  def height(self):
801  """!
802  @return (uint) Height of the tree.
803 
804  """
805  return self.__height
806 
807 
808  @property
809  def branch_factor(self):
810  """!
811  @return (uint) Branching factor of the tree.
812  @details Branching factor defines maximum number of successors in each non-leaf node.
813 
814  """
815  return self.__branch_factor
816 
817 
818  @property
819  def threshold(self):
820  """!
821  @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
822 
823  """
824  return self.__threshold
825 
826 
827  @property
828  def max_entries(self):
829  """!
830  @return (uint) Maximum number of entries in each leaf node.
831 
832  """
833  return self.__max_entries
834 
835 
836  @property
837  def type_measurement(self):
838  """!
839  @return (measurement_type) Type that is used for measuring.
840 
841  """
842  return self.__type_measurement
843 
844 
845  def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
846  """!
847  @brief Create CF-tree.
848 
849  @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
850  @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
851  @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
852  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
853 
854  """
855 
856  self.__root = None
857 
858  self.__branch_factor = branch_factor # maximum number of children
859  if self.__branch_factor < 2:
860  self.__branch_factor = 2
861 
862  self.__threshold = threshold # maximum diameter of sub-clusters stored at the leaf nodes
863  self.__max_entries = max_entries
864 
865  self.__leafes = []
866 
867  self.__type_measurement = type_measurement
868 
869  # statistics
870  self.__amount_nodes = 0 # root, despite it can be None.
871  self.__amount_entries = 0
872  self.__height = 0 # tree size with root.
873 
874 
875  def get_level_nodes(self, level):
876  """!
877  @brief Traverses CF-tree to obtain nodes at the specified level.
878 
879  @param[in] level (uint): CF-tree level from that nodes should be returned.
880 
881  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
882 
883  """
884 
885  level_nodes = [];
886  if (level < self.__height):
887  level_nodes = self.__recursive_get_level_nodes(level, self.__root);
888 
889  return level_nodes;
890 
891 
892  def __recursive_get_level_nodes(self, level, node):
893  """!
894  @brief Traverses CF-tree to obtain nodes at the specified level recursively.
895 
896  @param[in] level (uint): Current CF-tree level.
897  @param[in] node (cfnode): CF-node from that traversing is performed.
898 
899  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
900 
901  """
902 
903  level_nodes = [];
904  if (level is 0):
905  level_nodes.append(node);
906 
907  else:
908  for sucessor in node.successors:
909  level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor);
910 
911  return level_nodes;
912 
913 
914  def insert_cluster(self, cluster):
915  """!
916  @brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
917  @details Clustering feature is created for that cluster and inserted to the tree.
918 
919  @param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
920 
921  """
922 
923  entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
924  self.insert(entry);
925 
926 
927  def insert(self, entry):
928  """!
929  @brief Insert clustering feature to the tree.
930 
931  @param[in] entry (cfentry): Clustering feature that should be inserted.
932 
933  """
934 
935  if (self.__root is None):
936  node = leaf_node(entry, None, [ entry ], None);
937 
938  self.__root = node;
939  self.__leafes.append(node);
940 
941  # Update statistics
942  self.__amount_entries += 1;
943  self.__amount_nodes += 1;
944  self.__height += 1; # root has successor now
945  else:
946  child_node_updation = self.__recursive_insert(entry, self.__root);
947  if (child_node_updation is True):
948  # Splitting has been finished, check for possibility to merge (at least we have already two children).
949  if (self.__merge_nearest_successors(self.__root) is True):
950  self.__amount_nodes -= 1;
951 
952 
953  def find_nearest_leaf(self, entry, search_node = None):
954  """!
955  @brief Search nearest leaf to the specified clustering feature.
956 
957  @param[in] entry (cfentry): Clustering feature.
958  @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
959 
960  @return (leaf_node) Nearest node to the specified clustering feature.
961 
962  """
963 
964  if (search_node is None):
965  search_node = self.__root;
966 
967  nearest_node = search_node;
968 
969  if (search_node.type == cfnode_type.CFNODE_NONLEAF):
970  min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
971  nearest_child_node = min(search_node.successors, key = min_key);
972 
973  nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
974 
975  return nearest_node;
976 
977 
978  def __recursive_insert(self, entry, search_node):
979  """!
980  @brief Recursive insert of the entry to the tree.
981  @details It performs all required procedures during insertion such as splitting, merging.
982 
983  @param[in] entry (cfentry): Clustering feature.
984  @param[in] search_node (cfnode): Node from that insertion should be started.
985 
986  @return (bool) True if number of nodes at the below level is changed, otherwise False.
987 
988  """
989 
990  # None-leaf node
991  if (search_node.type == cfnode_type.CFNODE_NONLEAF):
992  return self.__insert_for_noneleaf_node(entry, search_node);
993 
994  # Leaf is reached
995  else:
996  return self.__insert_for_leaf_node(entry, search_node);
997 
998 
999  def __insert_for_leaf_node(self, entry, search_node):
1000  """!
1001  @brief Recursive insert entry from leaf node to the tree.
1002 
1003  @param[in] entry (cfentry): Clustering feature.
1004  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1005 
1006  @return (bool) True if number of nodes at the below level is changed, otherwise False.
1007 
1008  """
1009 
1010  node_amount_updation = False;
1011 
1012  # Try to absorb by the entity
1013  index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
1014  merged_entry = search_node.entries[index_nearest_entry] + entry;
1015 
1016  # Otherwise try to add new entry
1017  if (merged_entry.get_diameter() > self.__threshold):
1018  # If it's not exceeded append entity and update feature of the leaf node.
1019  search_node.insert_entry(entry);
1020 
1021  # Otherwise current node should be splitted
1022  if (len(search_node.entries) > self.__max_entries):
1023  self.__split_procedure(search_node);
1024  node_amount_updation = True;
1025 
1026  # Update statistics
1027  self.__amount_entries += 1;
1028 
1029  else:
1030  search_node.entries[index_nearest_entry] = merged_entry;
1031  search_node.feature += entry;
1032 
1033  return node_amount_updation;
1034 
1035 
1036  def __insert_for_noneleaf_node(self, entry, search_node):
1037  """!
1038  @brief Recursive insert entry from none-leaf node to the tree.
1039 
1040  @param[in] entry (cfentry): Clustering feature.
1041  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1042 
1043  @return (bool) True if number of nodes at the below level is changed, otherwise False.
1044 
1045  """
1046 
1047  node_amount_updation = False;
1048 
1049  min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
1050  nearest_child_node = min(search_node.successors, key = min_key);
1051 
1052  child_node_updation = self.__recursive_insert(entry, nearest_child_node);
1053 
1054  # Update clustering feature of none-leaf node.
1055  search_node.feature += entry;
1056 
1057  # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1058  if (len(search_node.successors) > self.__branch_factor):
1059 
1060  # Check if it's aleady root then new root should be created (height is increased in this case).
1061  if (search_node is self.__root):
1062  self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
1063  search_node.parent = self.__root;
1064 
1065  # Update statistics
1066  self.__amount_nodes += 1;
1067  self.__height += 1;
1068 
1069  [new_node1, new_node2] = self.__split_nonleaf_node(search_node);
1070 
1071  # Update parent list of successors
1072  parent = search_node.parent;
1073  parent.successors.remove(search_node);
1074  parent.successors.append(new_node1);
1075  parent.successors.append(new_node2);
1076 
1077  # Update statistics
1078  self.__amount_nodes += 1;
1079  node_amount_updation = True;
1080 
1081  elif (child_node_updation is True):
1082  # Splitting has been finished, check for possibility to merge (at least we have already two children).
1083  if (self.__merge_nearest_successors(search_node) is True):
1084  self.__amount_nodes -= 1;
1085 
1086  return node_amount_updation;
1087 
1088 
1089  def __merge_nearest_successors(self, node):
1090  """!
1091  @brief Find nearest sucessors and merge them.
1092 
1093  @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1094 
1095  @return (bool): True if merging has been successfully performed, otherwise False.
1096 
1097  """
1098 
1099  merging_result = False;
1100 
1101  if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
1102  [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
1103 
1104  if (len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor):
1105  node.successors.remove(nearest_child_node2);
1106  if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
1107  self.__leafes.remove(nearest_child_node2);
1108 
1109  nearest_child_node1.merge(nearest_child_node2);
1110 
1111  merging_result = True;
1112 
1113  return merging_result;
1114 
1115 
1116  def __split_procedure(self, split_node):
1117  """!
1118  @brief Starts node splitting procedure in the CF-tree from the specify node.
1119 
1120  @param[in] split_node (cfnode): CF-tree node that should be splitted.
1121 
1122  """
1123  if (split_node is self.__root):
1124  self.__root = non_leaf_node(split_node.feature, None, [ split_node ], None);
1125  split_node.parent = self.__root;
1126 
1127  # Update statistics
1128  self.__amount_nodes += 1;
1129  self.__height += 1;
1130 
1131  [new_node1, new_node2] = self.__split_leaf_node(split_node);
1132 
1133  self.__leafes.remove(split_node);
1134  self.__leafes.append(new_node1);
1135  self.__leafes.append(new_node2);
1136 
1137  # Update parent list of successors
1138  parent = split_node.parent;
1139  parent.successors.remove(split_node);
1140  parent.successors.append(new_node1);
1141  parent.successors.append(new_node2);
1142 
1143  # Update statistics
1144  self.__amount_nodes += 1;
1145 
1146 
1147  def __split_nonleaf_node(self, node):
1148  """!
1149  @brief Performs splitting of the specified non-leaf node.
1150 
1151  @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1152 
1153  @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1154 
1155  """
1156 
1157  [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
1158 
1159  # create new non-leaf nodes
1160  new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
1161  new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
1162 
1163  farthest_node1.parent = new_node1;
1164  farthest_node2.parent = new_node2;
1165 
1166  # re-insert other successors
1167  for successor in node.successors:
1168  if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
1169  distance1 = new_node1.get_distance(successor, self.__type_measurement);
1170  distance2 = new_node2.get_distance(successor, self.__type_measurement);
1171 
1172  if (distance1 < distance2):
1173  new_node1.insert_successor(successor);
1174  else:
1175  new_node2.insert_successor(successor);
1176 
1177  return [new_node1, new_node2];
1178 
1179 
1180  def __split_leaf_node(self, node):
1181  """!
1182  @brief Performs splitting of the specified leaf node.
1183 
1184  @param[in] node (leaf_node): Leaf node that should be splitted.
1185 
1186  @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1187 
1188  @warning Splitted node is transformed to non_leaf.
1189 
1190  """
1191 
1192  # search farthest pair of entries
1193  [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
1194 
1195  # create new nodes
1196  new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
1197  new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
1198 
1199  # re-insert other entries
1200  for entity in node.entries:
1201  if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
1202  distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
1203  distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
1204 
1205  if (distance1 < distance2):
1206  new_node1.insert_entry(entity);
1207  else:
1208  new_node2.insert_entry(entity);
1209 
1210  return [new_node1, new_node2];
1211 
1212 
1213  def show_feature_destibution(self, data = None):
1214  """!
1215  @brief Shows feature distribution.
1216  @details Only features in 1D, 2D, 3D space can be visualized.
1217 
1218  @param[in] data (list): List of points that will be used for visualization, if it not specified than feature will be displayed only.
1219 
1220  """
1221  visualizer = cluster_visualizer();
1222 
1223  print("amount of nodes: ", self.__amount_nodes);
1224 
1225  if (data is not None):
1226  visualizer.append_cluster(data, marker = 'x');
1227 
1228  for level in range(0, self.height):
1229  level_nodes = self.get_level_nodes(level);
1230 
1231  centers = [ node.feature.get_centroid() for node in level_nodes ];
1232  visualizer.append_cluster(centers, None, markersize = (self.height - level + 1) * 5);
1233 
1234  visualizer.show();
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def __init__(self, feature, parent, entries, payload)
Create CF Leaf node.
Definition: cftree.py:612
def linear_sum(self)
Returns linear sum.
Definition: cftree.py:102
Represents clustering feature leaf node.
Definition: cftree.py:597
def get_distance(self, entry, type_measurement)
Calculates distance between two clusters in line with measurement type.
Definition: cftree.py:226
def remove_entry(self, entry)
Remove clustering feature from the leaf node.
Definition: cftree.py:663
def show_feature_destibution(self, data=None)
Shows feature distribution.
Definition: cftree.py:1213
def insert_entry(self, entry)
Insert new clustering feature to the leaf node.
Definition: cftree.py:651
def get_farthest_successors(self, type_measurement)
Find pair of farthest successors of the node in line with measurement type.
Definition: cftree.py:539
def __str__(self)
Default cfentry string representation.
Definition: cftree.py:158
def __add__(self, entry)
Overloaded operator add.
Definition: cftree.py:166
def __insert_for_noneleaf_node(self, entry, search_node)
Recursive insert entry from none-leaf node to the tree.
Definition: cftree.py:1036
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __split_procedure(self, split_node)
Starts node splitting procedure in the CF-tree from the specify node.
Definition: cftree.py:1116
def __recursive_get_level_nodes(self, level, node)
Traverses CF-tree to obtain nodes at the specified level recursively.
Definition: cftree.py:892
def __get_average_inter_cluster_distance(self, entry)
Calculates average inter cluster distance between current and specified clusters. ...
Definition: cftree.py:327
def __split_nonleaf_node(self, node)
Performs splitting of the specified non-leaf node.
Definition: cftree.py:1147
def merge(self, node)
Merge non-leaf node to the current.
Definition: cftree.py:524
def insert_successor(self, successor)
Insert successor to the node.
Definition: cftree.py:496
feature
Clustering feature of the node.
Definition: cftree.py:403
Representation of clustering feature non-leaf node.
Definition: cftree.py:446
def __get_variance_increase_distance(self, entry)
Calculates variance increase distance between current and specified clusters.
Definition: cftree.py:362
def get_nearest_index_entry(self, entry, type_measurement)
Find nearest index of nearest entry of node for the specified entry.
Definition: cftree.py:719
def merge(self, node)
Merge leaf node to the current.
Definition: cftree.py:675
def __init__(self, feature, parent, payload)
Constructor of abstract CF node.
Definition: cftree.py:392
def get_diameter(self)
Calculates diameter of cluster that is represented by the entry.
Definition: cftree.py:306
def __insert_for_leaf_node(self, entry, search_node)
Recursive insert entry from leaf node to the tree.
Definition: cftree.py:999
def insert(self, entry)
Insert clustering feature to the tree.
Definition: cftree.py:927
def find_nearest_leaf(self, entry, search_node=None)
Search nearest leaf to the specified clustering feature.
Definition: cftree.py:953
def __merge_nearest_successors(self, node)
Find nearest sucessors and merge them.
Definition: cftree.py:1089
CF-Tree representation.
Definition: cftree.py:756
def __eq__(self, entry)
Overloaded operator eq.
Definition: cftree.py:204
def insert_cluster(self, cluster)
Insert cluster that is represented as list of points where each point is represented by list of coord...
Definition: cftree.py:914
def get_radius(self)
Calculates radius of cluster that is represented by the entry.
Definition: cftree.py:279
def get_farthest_entries(self, type_measurement)
Find pair of farthest entries of the node.
Definition: cftree.py:690
def get_nearest_entry(self, entry, type_measurement)
Find nearest entry of node for the specified entry.
Definition: cftree.py:741
def square_sum(self)
Returns square sum.
Definition: cftree.py:113
parent
Pointer to the parent node (None for root).
Definition: cftree.py:406
def __init__(self, number_points, linear_sum, square_sum)
CF-entry constructor.
Definition: cftree.py:123
def get_distance(self, node, type_measurement)
Calculates distance between nodes in line with specified type measurement.
Definition: cftree.py:432
def __sub__(self, entry)
Overloaded operator sub.
Definition: cftree.py:183
Enumeration of measurement types for CF-Tree.
Definition: cftree.py:39
type
Type node (leaf or non-leaf).
Definition: cftree.py:409
Clustering feature representation.
Definition: cftree.py:82
def remove_successor(self, successor)
Remove successor from the node.
Definition: cftree.py:510
Representation of node of CF-Tree.
Definition: cftree.py:386
def number_points(self)
Returns number of points that are encoded.
Definition: cftree.py:92
def __init__(self, branch_factor, max_entries, threshold, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE)
Create CF-tree.
Definition: cftree.py:845
Enumeration of CF-Node types that are used by CF-Tree.
Definition: cftree.py:63
def __recursive_insert(self, entry, search_node)
Recursive insert of the entry to the tree.
Definition: cftree.py:978
payload
Payload of node where user data can be stored.
Definition: cftree.py:412
def get_centroid(self)
Calculates centroid of cluster that is represented by the entry.
Definition: cftree.py:260
def __split_leaf_node(self, node)
Performs splitting of the specified leaf node.
Definition: cftree.py:1180
def get_level_nodes(self, level)
Traverses CF-tree to obtain nodes at the specified level.
Definition: cftree.py:875
def get_nearest_successors(self, type_measurement)
Find pair of nearest successors of the node in line with measurement type.
Definition: cftree.py:568
def __get_average_intra_cluster_distance(self, entry)
Calculates average intra cluster distance between current and specified clusters. ...
Definition: cftree.py:342
def __init__(self, feature, parent, successors, payload)
Create CF Non-leaf node.
Definition: cftree.py:461