cftree.py
1 """!
2 
3 @brief Data Structure: CF-Tree
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 from copy import copy
28 
29 from pyclustering.cluster import cluster_visualizer
30 
31 from pyclustering.utils import euclidean_distance_square
32 from pyclustering.utils import manhattan_distance
33 from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication
34 from pyclustering.utils import linear_sum, square_sum
35 
36 from enum import IntEnum
37 
38 
39 class measurement_type(IntEnum):
40  """!
41  @brief Enumeration of measurement types for CF-Tree.
42 
43  @see cftree
44 
45  """
46 
47 
48  CENTROID_EUCLIDEAN_DISTANCE = 0;
49 
50 
51  CENTROID_MANHATTAN_DISTANCE = 1;
52 
53 
54  AVERAGE_INTER_CLUSTER_DISTANCE = 2;
55 
56 
57  AVERAGE_INTRA_CLUSTER_DISTANCE = 3;
58 
59 
60  VARIANCE_INCREASE_DISTANCE = 4;
61 
62 
63 class cfnode_type(IntEnum):
64  """!
65  @brief Enumeration of CF-Node types that are used by CF-Tree.
66 
67  @see cfnode
68  @see cftree
69 
70  """
71 
72 
73  CFNODE_DUMMY = 0;
74 
75 
76  CFNODE_LEAF = 1;
77 
78 
79  CFNODE_NONLEAF = 2;
80 
81 
82 class cfentry:
83  """!
84  @brief Clustering feature representation.
85 
86  @see cfnode
87  @see cftree
88 
89  """
90 
91  @property
92  def number_points(self):
93  """!
94  @brief Returns number of points that are encoded.
95 
96  @return (uint) Number of encoded points.
97 
98  """
99  return self.__number_points;
100 
101  @property
102  def linear_sum(self):
103  """!
104  @brief Returns linear sum.
105 
106  @return (list) Linear sum.
107 
108  """
109 
110  return self.__linear_sum;
111 
112  @property
113  def square_sum(self):
114  """!
115  @brief Returns square sum.
116 
117  @return (double) Square sum.
118 
119  """
120  return self.__square_sum;
121 
122 
123  def __init__(self, number_points, linear_sum, square_sum):
124  """!
125  @brief CF-entry constructor.
126 
127  @param[in] number_points (uint): Number of objects that is represented by the entry.
128  @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
129  @param[in] square_sum (double): Square sum of values that represent objects.
130 
131  """
132 
133  self.__number_points = number_points;
134  self.__linear_sum = linear_sum;
135  self.__square_sum = square_sum;
136 
137  self.__centroid = None;
138  self.__radius = None;
139  self.__diameter = None;
140 
141 
142  def __copy__(self):
143  """!
144  @returns (cfentry) Makes copy of the cfentry instance.
145 
146  """
147  return cfentry(self.__number_points, self.__linear_sum, self.__square_sum);
148 
149 
150  def __repr__(self):
151  """!
152  @return (string) Default cfentry representation.
153 
154  """
155  return 'CF (%s, %s, %0.2f) [%s]' % ( self.number_points, self.linear_sum, self.__square_sum, hex(id(self)) );
156 
157 
158  def __str__(self):
159  """!
160  @brief Default cfentry string representation.
161 
162  """
163  return self.__repr__();
164 
165 
166  def __add__(self, entry):
167  """!
168  @brief Overloaded operator add. Performs addition of two clustering features.
169 
170  @param[in] entry (cfentry): Entry that is added to the current.
171 
172  @return (cfentry) Result of addition of two clustering features.
173 
174  """
175 
176  number_points = self.number_points + entry.number_points;
177  result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum);
178  result_square_sum = self.square_sum + entry.square_sum;
179 
180  return cfentry(number_points, result_linear_sum, result_square_sum);
181 
182 
183  def __sub__(self, entry):
184  """!
185  @brief Overloaded operator sub. Performs substraction of two clustering features.
186  @details Substraction can't be performed with clustering feature whose description is less then substractor.
187 
188  @param[in] entry (cfentry): Entry that is substracted from the current.
189 
190  @return (cfentry) Result of substraction of two clustering features.
191 
192  """
193 
194  number_points = self.number_points - entry.number_points;
195  result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum);
196  result_square_sum = self.square_sum - entry.square_sum;
197 
198  if ( (number_points < 0) or (result_square_sum < 0) ):
199  raise NameError("Substruction with negative result is not possible for clustering features.");
200 
201  return cfentry(number_points, result_linear_sum, result_square_sum);
202 
203 
204  def __eq__(self, entry):
205  """!
206  @brief Overloaded operator eq.
207  @details Performs comparison of two clustering features.
208 
209  @param[in] entry (cfentry): Entry that is used for comparison with current.
210 
211  @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
212 
213  """
214 
215  tolerance = 0.00001;
216 
217  result = (self.__number_points == entry.number_points);
218  result &= ( (self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum) );
219 
220  for index_dimension in range(0, len(self.linear_sum)):
221  result &= ( (self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]) );
222 
223  return result;
224 
225 
226  def get_distance(self, entry, type_measurement):
227  """!
228  @brief Calculates distance between two clusters in line with measurement type.
229 
230  @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
231  Square root should be taken from the result for obtaining real euclidian distance between
232  entries.
233 
234  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
235  @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
236 
237  @return (double) Distance between two clusters.
238 
239  """
240 
241  if (type_measurement is measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
242  return euclidean_distance_square(entry.get_centroid(), self.get_centroid());
243 
244  elif (type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE):
245  return manhattan_distance(entry.get_centroid(), self.get_centroid());
246 
247  elif (type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
248  return self.__get_average_inter_cluster_distance(entry);
249 
250  elif (type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
251  return self.__get_average_intra_cluster_distance(entry);
252 
253  elif (type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE):
254  return self.__get_variance_increase_distance(entry);
255 
256  else:
257  assert 0;
258 
259 
260  def get_centroid(self):
261  """!
262  @brief Calculates centroid of cluster that is represented by the entry.
263  @details It's calculated once when it's requested after the last changes.
264 
265  @return (list) Centroid of cluster that is represented by the entry.
266 
267  """
268 
269  if (self.__centroid is not None):
270  return self.__centroid;
271 
272  self.__centroid = [0] * len(self.linear_sum);
273  for index_dimension in range(0, len(self.linear_sum)):
274  self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points;
275 
276  return self.__centroid;
277 
278 
279  def get_radius(self):
280  """!
281  @brief Calculates radius of cluster that is represented by the entry.
282  @details It's calculated once when it's requested after the last changes.
283 
284  @return (double) Radius of cluster that is represented by the entry.
285 
286  """
287 
288  if (self.__radius is not None):
289  return self.__radius;
290 
291  centroid = self.get_centroid();
292 
293  radius_part_1 = self.square_sum;
294 
295  radius_part_2 = 0.0;
296  radius_part_3 = 0.0;
297 
298  if (type(centroid) == list):
299  radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid));
300  radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid));
301  else:
302  radius_part_2 = 2.0 * self.linear_sum * centroid;
303  radius_part_3 = self.number_points * centroid * centroid;
304 
305  self.__radius = ( (1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3) ) ** 0.5;
306  return self.__radius;
307 
308 
309  def get_diameter(self):
310  """!
311  @brief Calculates diameter of cluster that is represented by the entry.
312  @details It's calculated once when it's requested after the last changes.
313 
314  @return (double) Diameter of cluster that is represented by the entry.
315 
316  """
317 
318  if (self.__diameter is not None):
319  return self.__diameter;
320 
321  diameter_part = 0.0;
322  if (type(self.linear_sum) == list):
323  diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points;
324  else:
325  diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points;
326 
327  self.__diameter = ( diameter_part / (self.number_points * (self.number_points - 1)) ) ** 0.5;
328  return self.__diameter;
329 
330 
331  def __get_average_inter_cluster_distance(self, entry):
332  """!
333  @brief Calculates average inter cluster distance between current and specified clusters.
334 
335  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
336 
337  @return (double) Average inter cluster distance.
338 
339  """
340 
341  linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum));
342 
343  return ( (entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points) ) ** 0.5;
344 
345 
346  def __get_average_intra_cluster_distance(self, entry):
347  """!
348  @brief Calculates average intra cluster distance between current and specified clusters.
349 
350  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
351 
352  @return (double) Average intra cluster distance.
353 
354  """
355 
356  linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum);
357  linear_part_second = linear_part_first;
358 
359  linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second));
360 
361  general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance;
362 
363  return (general_part_distance / ( (self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0) )) ** 0.5;
364 
365 
366  def __get_variance_increase_distance(self, entry):
367  """!
368  @brief Calculates variance increase distance between current and specified clusters.
369 
370  @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
371 
372  @return (double) Variance increase distance.
373 
374  """
375 
376  linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum);
377  variance_part_first = (self.square_sum + entry.square_sum) - \
378  2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
379  (self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0;
380 
381 
382  linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum));
383  variance_part_second = -( self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points) );
384 
385  linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum));
386  variance_part_third = -( entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22 );
387 
388  return (variance_part_first + variance_part_second + variance_part_third);
389 
390 
391 class cfnode:
392  """!
393  @brief Representation of node of CF-Tree.
394 
395  """
396 
397  def __init__(self, feature, parent, payload):
398  """!
399  @brief Constructor of abstract CF node.
400 
401  @param[in] feature (cfentry): Clustering feature of the created node.
402  @param[in] parent (cfnode): Parent of the created node.
403  @param[in] payload (*): Data that is stored by the node.
404 
405  """
406 
407 
408  self.feature = copy(feature);
409 
410 
411  self.parent = parent;
412 
413 
414  self.type = cfnode_type.CFNODE_DUMMY;
415 
416 
417  self.payload = payload;
418 
419 
420  def __repr__(self):
421  """!
422  @return (string) Default representation of CF node.
423 
424  """
425 
426  return 'CF node %s, parent %s, feature %s' % ( hex(id(self)), self.parent, self.feature );
427 
428 
429  def __str__(self):
430  """!
431  @return (string) String representation of CF node.
432 
433  """
434  return self.__repr__();
435 
436 
437  def get_distance(self, node, type_measurement):
438  """!
439  @brief Calculates distance between nodes in line with specified type measurement.
440 
441  @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
442  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
443 
444  @return (double) Distance between two nodes.
445 
446  """
447 
448  return self.feature.get_distance(node.feature, type_measurement);
449 
450 
452  """!
453  @brief Representation of clustering feature non-leaf node.
454 
455  """
456 
457  @property
458  def successors(self):
459  """!
460  @return (list) List of successors of the node.
461 
462  """
463  return self.__successors;
464 
465 
466  def __init__(self, feature, parent, successors, payload):
467  """!
468  @brief Create CF Non-leaf node.
469 
470  @param[in] feature (cfentry): Clustering feature of the created node.
471  @param[in] parent (non_leaf_node): Parent of the created node.
472  @param[in] successors (list): List of successors of the node.
473  @param[in] payload (*): Data that is stored by the node.
474 
475  """
476 
477  super().__init__(feature, parent, payload);
478 
479 
480  self.type = cfnode_type.CFNODE_NONLEAF;
481 
482  self.__successors = successors;
483 
484 
485  def __repr__(self):
486  """!
487  @return (string) Representation of non-leaf node representation.
488 
489  """
490  return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % ( hex(id(self)), self.parent, self.feature, len(self.successors) );
491 
492 
493  def __str__(self):
494  """!
495  @return (string) String non-leaf representation.
496 
497  """
498  return self.__repr__();
499 
500 
501  def insert_successor(self, successor):
502  """!
503  @brief Insert successor to the node.
504 
505  @param[in] successor (cfnode): Successor for adding.
506 
507  """
508 
509  self.feature += successor.feature;
510  self.successors.append(successor);
511 
512  successor.parent = self;
513 
514 
515  def remove_successor(self, successor):
516  """!
517  @brief Remove successor from the node.
518 
519  @param[in] successor (cfnode): Successor for removing.
520 
521  """
522 
523  self.feature -= successor.feature;
524  self.successors.append(successor);
525 
526  successor.parent = self;
527 
528 
529  def merge(self, node):
530  """!
531  @brief Merge non-leaf node to the current.
532 
533  @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
534 
535  """
536 
537  self.feature += node.feature;
538 
539  for child in node.successors:
540  child.parent = self;
541  self.successors.append(child);
542 
543 
544  def get_farthest_successors(self, type_measurement):
545  """!
546  @brief Find pair of farthest successors of the node in line with measurement type.
547 
548  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
549 
550  @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
551 
552  """
553 
554  farthest_node1 = None;
555  farthest_node2 = None;
556  farthest_distance = 0;
557 
558  for i in range(0, len(self.successors)):
559  candidate1 = self.successors[i];
560 
561  for j in range(i + 1, len(self.successors)):
562  candidate2 = self.successors[j];
563  candidate_distance = candidate1.get_distance(candidate2, type_measurement);
564 
565  if (candidate_distance > farthest_distance):
566  farthest_distance = candidate_distance;
567  farthest_node1 = candidate1;
568  farthest_node2 = candidate2;
569 
570  return [farthest_node1, farthest_node2];
571 
572 
573  def get_nearest_successors(self, type_measurement):
574  """!
575  @brief Find pair of nearest successors of the node in line with measurement type.
576 
577  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
578 
579  @return (list) Pair of nearest successors represented by list.
580 
581  """
582 
583  nearest_node1 = None;
584  nearest_node2 = None;
585  nearest_distance = float("Inf");
586 
587  for i in range(0, len(self.successors)):
588  candidate1 = self.successors[i];
589 
590  for j in range(i + 1, len(self.successors)):
591  candidate2 = self.successors[j];
592  candidate_distance = candidate1.get_distance(candidate2, type_measurement);
593 
594  if (candidate_distance < nearest_distance):
595  nearest_distance = candidate_distance;
596  nearest_node1 = candidate1;
597  nearest_node2 = candidate2;
598 
599  return [nearest_node1, nearest_node2];
600 
601 
603  """!
604  @brief Represents clustering feature leaf node.
605 
606  """
607 
608  @property
609  def entries(self):
610  """!
611  @return (list) List of leaf nodes.
612 
613  """
614  return self.__entries;
615 
616 
617  def __init__(self, feature, parent, entries, payload):
618  """!
619  @brief Create CF Leaf node.
620 
621  @param[in] feature (cfentry): Clustering feature of the created node.
622  @param[in] parent (non_leaf_node): Parent of the created node.
623  @param[in] entries (list): List of entries of the node.
624  @param[in] payload (*): Data that is stored by the node.
625 
626  """
627 
628  super().__init__(feature, parent, payload);
629 
630 
631  self.type = cfnode_type.CFNODE_LEAF;
632 
633  self.__entries = entries; # list of clustering features
634 
635 
636  def __repr__(self):
637  """!
638  @return (string) Default leaf node represenation.
639 
640  """
641  text_entries = "\n";
642  for entry in self.entries:
643  text_entries += "\t" + str(entry) + "\n";
644 
645  return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % ( hex(id(self)), self.parent, self.feature, len(self.entries), text_entries );
646 
647 
648  def __str__(self):
649  """!
650  @return (string) String leaf node representation.
651 
652  """
653  return self.__repr__();
654 
655 
656  def insert_entry(self, entry):
657  """!
658  @brief Insert new clustering feature to the leaf node.
659 
660  @param[in] entry (cfentry): Clustering feature.
661 
662  """
663 
664  self.feature += entry;
665  self.entries.append(entry);
666 
667 
668  def remove_entry(self, entry):
669  """!
670  @brief Remove clustering feature from the leaf node.
671 
672  @param[in] entry (cfentry): Clustering feature.
673 
674  """
675 
676  self.feature -= entry;
677  self.entries.remove(entry);
678 
679 
680  def merge(self, node):
681  """!
682  @brief Merge leaf node to the current.
683 
684  @param[in] node (leaf_node): Leaf node that should be merged with current.
685 
686  """
687 
688  self.feature += node.feature;
689 
690  # Move entries from merged node
691  for entry in node.entries:
692  self.entries.append(entry);
693 
694 
695  def get_farthest_entries(self, type_measurement):
696  """!
697  @brief Find pair of farthest entries of the node.
698 
699  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
700 
701  @return (list) Pair of farthest entries of the node that are represented by list.
702 
703  """
704 
705  farthest_entity1 = None;
706  farthest_entity2 = None;
707  farthest_distance = 0;
708 
709  for i in range(0, len(self.entries)):
710  candidate1 = self.entries[i];
711 
712  for j in range(i + 1, len(self.entries)):
713  candidate2 = self.entries[j];
714  candidate_distance = candidate1.get_distance(candidate2, type_measurement);
715 
716  if (candidate_distance > farthest_distance):
717  farthest_distance = candidate_distance;
718  farthest_entity1 = candidate1;
719  farthest_entity2 = candidate2;
720 
721  return [farthest_entity1, farthest_entity2];
722 
723 
724  def get_nearest_index_entry(self, entry, type_measurement):
725  """!
726  @brief Find nearest index of nearest entry of node for the specified entry.
727 
728  @param[in] entry (cfentry): Entry that is used for calculation distance.
729  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
730 
731  @return (uint) Index of nearest entry of node for the specified entry.
732 
733  """
734 
735  minimum_distance = float('Inf');
736  nearest_index = 0;
737 
738  for candidate_index in range(0, len(self.entries)):
739  candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement);
740  if (candidate_distance < minimum_distance):
741  nearest_index = candidate_index;
742 
743  return nearest_index;
744 
745 
746  def get_nearest_entry(self, entry, type_measurement):
747  """!
748  @brief Find nearest entry of node for the specified entry.
749 
750  @param[in] entry (cfentry): Entry that is used for calculation distance.
751  @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
752 
753  @return (cfentry) Nearest entry of node for the specified entry.
754 
755  """
756 
757  min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement);
758  return min(self.__entries, key = min_key);
759 
760 
761 class cftree:
762  """!
763  @brief CF-Tree representation.
764  @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
765 
766  """
767 
768  @property
769  def root(self):
770  """!
771  @return (cfnode) Root of the tree.
772 
773  """
774  return self.__root;
775 
776 
777  @property
778  def leafes(self):
779  """!
780  @return (list) List of all non-leaf nodes in the tree.
781 
782  """
783  return self.__leafes;
784 
785 
786  @property
787  def amount_nodes(self):
788  """!
789  @return (unit) Number of nodes (leaf and non-leaf) in the tree.
790 
791  """
792  return self.__amount_nodes;
793 
794 
795  @property
796  def amount_entries(self):
797  """!
798  @return (uint) Number of entries in the tree.
799 
800  """
801  return self.__amount_entries;
802 
803 
804  @property
805  def height(self):
806  """!
807  @return (uint) Height of the tree.
808 
809  """
810  return self.__height;
811 
812 
813  @property
814  def branch_factor(self):
815  """!
816  @return (uint) Branching factor of the tree.
817  @details Branching factor defines maximum number of successors in each non-leaf node.
818 
819  """
820  return self.__branch_factor;
821 
822 
823  @property
824  def threshold(self):
825  """!
826  @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
827 
828  """
829  return self.__threshold;
830 
831 
832  @property
833  def max_entries(self):
834  """!
835  @return (uint) Maximum number of entries in each leaf node.
836 
837  """
838  return self.__max_entries;
839 
840 
841  @property
842  def type_measurement(self):
843  """!
844  @return (measurement_type) Type that is used for measuring.
845 
846  """
847  return self.__type_measurement;
848 
849 
850  def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE):
851  """!
852  @brief Create CF-tree.
853 
854  @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
855  @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
856  @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
857  @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
858 
859  """
860 
861  self.__root = None;
862 
863  self.__branch_factor = branch_factor; # maximum number of children
864  if (self.__branch_factor < 2):
865  self.__branch_factor = 2;
866 
867 
868  self.__threshold = threshold; # maximum diameter of sub-clusters stored at the leaf nodes
869  self.__max_entries = max_entries;
870 
871  self.__leafes = [];
872 
873  self.__type_measurement = type_measurement;
874 
875  # statistics
876  self.__amount_nodes = 0; # root, despite it can be None.
877  self.__amount_entries = 0;
878  self.__height = 0; # tree size with root.
879 
880 
881  def get_level_nodes(self, level):
882  """!
883  @brief Traverses CF-tree to obtain nodes at the specified level.
884 
885  @param[in] level (uint): CF-tree level from that nodes should be returned.
886 
887  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
888 
889  """
890 
891  level_nodes = [];
892  if (level < self.__height):
893  level_nodes = self.__recursive_get_level_nodes(level, self.__root);
894 
895  return level_nodes;
896 
897 
898  def __recursive_get_level_nodes(self, level, node):
899  """!
900  @brief Traverses CF-tree to obtain nodes at the specified level recursively.
901 
902  @param[in] level (uint): Current CF-tree level.
903  @param[in] node (cfnode): CF-node from that traversing is performed.
904 
905  @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
906 
907  """
908 
909  level_nodes = [];
910  if (level is 0):
911  level_nodes.append(node);
912 
913  else:
914  for sucessor in node.successors:
915  level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor);
916 
917  return level_nodes;
918 
919 
920  def insert_cluster(self, cluster):
921  """!
922  @brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
923  @details Clustering feature is created for that cluster and inserted to the tree.
924 
925  @param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
926 
927  """
928 
929  entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
930  self.insert(entry);
931 
932 
933  def insert(self, entry):
934  """!
935  @brief Insert clustering feature to the tree.
936 
937  @param[in] entry (cfentry): Clustering feature that should be inserted.
938 
939  """
940 
941  if (self.__root is None):
942  node = leaf_node(entry, None, [ entry ], None);
943 
944  self.__root = node;
945  self.__leafes.append(node);
946 
947  # Update statistics
948  self.__amount_entries += 1;
949  self.__amount_nodes += 1;
950  self.__height += 1; # root has successor now
951  else:
952  child_node_updation = self.__recursive_insert(entry, self.__root);
953  if (child_node_updation is True):
954  # Splitting has been finished, check for possibility to merge (at least we have already two children).
955  if (self.__merge_nearest_successors(self.__root) is True):
956  self.__amount_nodes -= 1;
957 
958 
959  def find_nearest_leaf(self, entry, search_node = None):
960  """!
961  @brief Search nearest leaf to the specified clustering feature.
962 
963  @param[in] entry (cfentry): Clustering feature.
964  @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
965 
966  @return (leaf_node) Nearest node to the specified clustering feature.
967 
968  """
969 
970  if (search_node is None):
971  search_node = self.__root;
972 
973  nearest_node = search_node;
974 
975  if (search_node.type == cfnode_type.CFNODE_NONLEAF):
976  min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
977  nearest_child_node = min(search_node.successors, key = min_key);
978 
979  nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
980 
981  return nearest_node;
982 
983 
984  def __recursive_insert(self, entry, search_node):
985  """!
986  @brief Recursive insert of the entry to the tree.
987  @details It performs all required procedures during insertion such as splitting, merging.
988 
989  @param[in] entry (cfentry): Clustering feature.
990  @param[in] search_node (cfnode): Node from that insertion should be started.
991 
992  @return (bool) True if number of nodes at the below level is changed, otherwise False.
993 
994  """
995 
996  # None-leaf node
997  if (search_node.type == cfnode_type.CFNODE_NONLEAF):
998  return self.__insert_for_noneleaf_node(entry, search_node);
999 
1000  # Leaf is reached
1001  else:
1002  return self.__insert_for_leaf_node(entry, search_node);
1003 
1004 
1005  def __insert_for_leaf_node(self, entry, search_node):
1006  """!
1007  @brief Recursive insert entry from leaf node to the tree.
1008 
1009  @param[in] entry (cfentry): Clustering feature.
1010  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1011 
1012  @return (bool) True if number of nodes at the below level is changed, otherwise False.
1013 
1014  """
1015 
1016  node_amount_updation = False;
1017 
1018  # Try to absorb by the entity
1019  index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
1020  merged_entry = search_node.entries[index_nearest_entry] + entry;
1021 
1022  # Otherwise try to add new entry
1023  if (merged_entry.get_diameter() > self.__threshold):
1024  # If it's not exceeded append entity and update feature of the leaf node.
1025  search_node.insert_entry(entry);
1026 
1027  # Otherwise current node should be splitted
1028  if (len(search_node.entries) > self.__max_entries):
1029  self.__split_procedure(search_node);
1030  node_amount_updation = True;
1031 
1032  # Update statistics
1033  self.__amount_entries += 1;
1034 
1035  else:
1036  search_node.entries[index_nearest_entry] = merged_entry;
1037  search_node.feature += entry;
1038 
1039  return node_amount_updation;
1040 
1041 
1042  def __insert_for_noneleaf_node(self, entry, search_node):
1043  """!
1044  @brief Recursive insert entry from none-leaf node to the tree.
1045 
1046  @param[in] entry (cfentry): Clustering feature.
1047  @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1048 
1049  @return (bool) True if number of nodes at the below level is changed, otherwise False.
1050 
1051  """
1052 
1053  node_amount_updation = False;
1054 
1055  min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
1056  nearest_child_node = min(search_node.successors, key = min_key);
1057 
1058  child_node_updation = self.__recursive_insert(entry, nearest_child_node);
1059 
1060  # Update clustering feature of none-leaf node.
1061  search_node.feature += entry;
1062 
1063  # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1064  if (len(search_node.successors) > self.__branch_factor):
1065 
1066  # Check if it's aleady root then new root should be created (height is increased in this case).
1067  if (search_node is self.__root):
1068  self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
1069  search_node.parent = self.__root;
1070 
1071  # Update statistics
1072  self.__amount_nodes += 1;
1073  self.__height += 1;
1074 
1075  [new_node1, new_node2] = self.__split_nonleaf_node(search_node);
1076 
1077  # Update parent list of successors
1078  parent = search_node.parent;
1079  parent.successors.remove(search_node);
1080  parent.successors.append(new_node1);
1081  parent.successors.append(new_node2);
1082 
1083  # Update statistics
1084  self.__amount_nodes += 1;
1085  node_amount_updation = True;
1086 
1087  elif (child_node_updation is True):
1088  # Splitting has been finished, check for possibility to merge (at least we have already two children).
1089  if (self.__merge_nearest_successors(search_node) is True):
1090  self.__amount_nodes -= 1;
1091 
1092  return node_amount_updation;
1093 
1094 
1095  def __merge_nearest_successors(self, node):
1096  """!
1097  @brief Find nearest sucessors and merge them.
1098 
1099  @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1100 
1101  @return (bool): True if merging has been successfully performed, otherwise False.
1102 
1103  """
1104 
1105  merging_result = False;
1106 
1107  if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
1108  [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
1109 
1110  if (len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor):
1111  node.successors.remove(nearest_child_node2);
1112  if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
1113  self.__leafes.remove(nearest_child_node2);
1114 
1115  nearest_child_node1.merge(nearest_child_node2);
1116 
1117  merging_result = True;
1118 
1119  return merging_result;
1120 
1121 
1122  def __split_procedure(self, split_node):
1123  """!
1124  @brief Starts node splitting procedure in the CF-tree from the specify node.
1125 
1126  @param[in] split_node (cfnode): CF-tree node that should be splitted.
1127 
1128  """
1129  if (split_node is self.__root):
1130  self.__root = non_leaf_node(split_node.feature, None, [ split_node ], None);
1131  split_node.parent = self.__root;
1132 
1133  # Update statistics
1134  self.__amount_nodes += 1;
1135  self.__height += 1;
1136 
1137  [new_node1, new_node2] = self.__split_leaf_node(split_node);
1138 
1139  self.__leafes.remove(split_node);
1140  self.__leafes.append(new_node1);
1141  self.__leafes.append(new_node2);
1142 
1143  # Update parent list of successors
1144  parent = split_node.parent;
1145  parent.successors.remove(split_node);
1146  parent.successors.append(new_node1);
1147  parent.successors.append(new_node2);
1148 
1149  # Update statistics
1150  self.__amount_nodes += 1;
1151 
1152 
1153  def __split_nonleaf_node(self, node):
1154  """!
1155  @brief Performs splitting of the specified non-leaf node.
1156 
1157  @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1158 
1159  @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1160 
1161  """
1162 
1163  [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
1164 
1165  # create new non-leaf nodes
1166  new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
1167  new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
1168 
1169  farthest_node1.parent = new_node1;
1170  farthest_node2.parent = new_node2;
1171 
1172  # re-insert other successors
1173  for successor in node.successors:
1174  if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
1175  distance1 = new_node1.get_distance(successor, self.__type_measurement);
1176  distance2 = new_node2.get_distance(successor, self.__type_measurement);
1177 
1178  if (distance1 < distance2):
1179  new_node1.insert_successor(successor);
1180  else:
1181  new_node2.insert_successor(successor);
1182 
1183  return [new_node1, new_node2];
1184 
1185 
1186  def __split_leaf_node(self, node):
1187  """!
1188  @brief Performs splitting of the specified leaf node.
1189 
1190  @param[in] node (leaf_node): Leaf node that should be splitted.
1191 
1192  @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1193 
1194  @warning Splitted node is transformed to non_leaf.
1195 
1196  """
1197 
1198  # search farthest pair of entries
1199  [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
1200 
1201  # create new nodes
1202  new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
1203  new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
1204 
1205  # re-insert other entries
1206  for entity in node.entries:
1207  if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
1208  distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
1209  distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
1210 
1211  if (distance1 < distance2):
1212  new_node1.insert_entry(entity);
1213  else:
1214  new_node2.insert_entry(entity);
1215 
1216  return [new_node1, new_node2];
1217 
1218 
1219  def show_feature_destibution(self, data = None):
1220  """!
1221  @brief Shows feature distribution.
1222  @details Only features in 1D, 2D, 3D space can be visualized.
1223 
1224  @param[in] data (list): List of points that will be used for visualization, if it not specified than feature will be displayed only.
1225 
1226  """
1227  visualizer = cluster_visualizer();
1228 
1229  print("amount of nodes: ", self.__amount_nodes);
1230 
1231  if (data is not None):
1232  visualizer.append_cluster(data, marker = 'x');
1233 
1234  for level in range(0, self.height):
1235  level_nodes = self.get_level_nodes(level);
1236 
1237  centers = [ node.feature.get_centroid() for node in level_nodes ];
1238  visualizer.append_cluster(centers, None, markersize = (self.height - level + 1) * 5);
1239 
1240  visualizer.show();
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def __init__(self, feature, parent, entries, payload)
Create CF Leaf node.
Definition: cftree.py:617
def linear_sum(self)
Returns linear sum.
Definition: cftree.py:102
Represents clustering feature leaf node.
Definition: cftree.py:602
def get_distance(self, entry, type_measurement)
Calculates distance between two clusters in line with measurement type.
Definition: cftree.py:226
def remove_entry(self, entry)
Remove clustering feature from the leaf node.
Definition: cftree.py:668
def show_feature_destibution(self, data=None)
Shows feature distribution.
Definition: cftree.py:1219
def insert_entry(self, entry)
Insert new clustering feature to the leaf node.
Definition: cftree.py:656
def get_farthest_successors(self, type_measurement)
Find pair of farthest successors of the node in line with measurement type.
Definition: cftree.py:544
def __str__(self)
Default cfentry string representation.
Definition: cftree.py:158
def __add__(self, entry)
Overloaded operator add.
Definition: cftree.py:166
def __insert_for_noneleaf_node(self, entry, search_node)
Recursive insert entry from none-leaf node to the tree.
Definition: cftree.py:1042
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __split_procedure(self, split_node)
Starts node splitting procedure in the CF-tree from the specify node.
Definition: cftree.py:1122
def __recursive_get_level_nodes(self, level, node)
Traverses CF-tree to obtain nodes at the specified level recursively.
Definition: cftree.py:898
def __get_average_inter_cluster_distance(self, entry)
Calculates average inter cluster distance between current and specified clusters. ...
Definition: cftree.py:331
def __split_nonleaf_node(self, node)
Performs splitting of the specified non-leaf node.
Definition: cftree.py:1153
def merge(self, node)
Merge non-leaf node to the current.
Definition: cftree.py:529
def insert_successor(self, successor)
Insert successor to the node.
Definition: cftree.py:501
feature
Clustering feature of the node.
Definition: cftree.py:408
Representation of clustering feature non-leaf node.
Definition: cftree.py:451
def __get_variance_increase_distance(self, entry)
Calculates variance increase distance between current and specified clusters.
Definition: cftree.py:366
def get_nearest_index_entry(self, entry, type_measurement)
Find nearest index of nearest entry of node for the specified entry.
Definition: cftree.py:724
def merge(self, node)
Merge leaf node to the current.
Definition: cftree.py:680
def __init__(self, feature, parent, payload)
Constructor of abstract CF node.
Definition: cftree.py:397
def get_diameter(self)
Calculates diameter of cluster that is represented by the entry.
Definition: cftree.py:309
def __insert_for_leaf_node(self, entry, search_node)
Recursive insert entry from leaf node to the tree.
Definition: cftree.py:1005
def insert(self, entry)
Insert clustering feature to the tree.
Definition: cftree.py:933
def find_nearest_leaf(self, entry, search_node=None)
Search nearest leaf to the specified clustering feature.
Definition: cftree.py:959
def __merge_nearest_successors(self, node)
Find nearest sucessors and merge them.
Definition: cftree.py:1095
CF-Tree representation.
Definition: cftree.py:761
def __eq__(self, entry)
Overloaded operator eq.
Definition: cftree.py:204
def insert_cluster(self, cluster)
Insert cluster that is represented as list of points where each point is represented by list of coord...
Definition: cftree.py:920
def get_radius(self)
Calculates radius of cluster that is represented by the entry.
Definition: cftree.py:279
def get_farthest_entries(self, type_measurement)
Find pair of farthest entries of the node.
Definition: cftree.py:695
def get_nearest_entry(self, entry, type_measurement)
Find nearest entry of node for the specified entry.
Definition: cftree.py:746
def square_sum(self)
Returns square sum.
Definition: cftree.py:113
parent
Pointer to the parent node (None for root).
Definition: cftree.py:411
def __init__(self, number_points, linear_sum, square_sum)
CF-entry constructor.
Definition: cftree.py:123
def get_distance(self, node, type_measurement)
Calculates distance between nodes in line with specified type measurement.
Definition: cftree.py:437
def __sub__(self, entry)
Overloaded operator sub.
Definition: cftree.py:183
Enumeration of measurement types for CF-Tree.
Definition: cftree.py:39
type
Type node (leaf or non-leaf).
Definition: cftree.py:414
Clustering feature representation.
Definition: cftree.py:82
def remove_successor(self, successor)
Remove successor from the node.
Definition: cftree.py:515
Representation of node of CF-Tree.
Definition: cftree.py:391
def number_points(self)
Returns number of points that are encoded.
Definition: cftree.py:92
def __init__(self, branch_factor, max_entries, threshold, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE)
Create CF-tree.
Definition: cftree.py:850
Enumeration of CF-Node types that are used by CF-Tree.
Definition: cftree.py:63
def __recursive_insert(self, entry, search_node)
Recursive insert of the entry to the tree.
Definition: cftree.py:984
payload
Payload of node where user data can be stored.
Definition: cftree.py:417
def get_centroid(self)
Calculates centroid of cluster that is represented by the entry.
Definition: cftree.py:260
def __split_leaf_node(self, node)
Performs splitting of the specified leaf node.
Definition: cftree.py:1186
def get_level_nodes(self, level)
Traverses CF-tree to obtain nodes at the specified level.
Definition: cftree.py:881
def get_nearest_successors(self, type_measurement)
Find pair of nearest successors of the node in line with measurement type.
Definition: cftree.py:573
def __get_average_intra_cluster_distance(self, entry)
Calculates average intra cluster distance between current and specified clusters. ...
Definition: cftree.py:346
def __init__(self, feature, parent, successors, payload)
Create CF Non-leaf node.
Definition: cftree.py:466