3 @brief BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) cluster analysis algorithm.
4 @details Implementation based on paper @cite article::birch::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
22 @brief Class represents the clustering algorithm BIRCH (Balanced Iterative Reducing and Clustering using
25 @details BIRCH is suitable for large databases. The algorithm incrementally and dynamically clusters
26 incoming multi-dimensional metric data points using the concepts of Clustering Feature and CF tree.
27 A Clustering Feature is a triple summarizing the information that is maintained about a cluster.
28 The Clustering Feature vector is defined as a triple:
29 \f[CF=\left ( N, \overrightarrow{LS}, SS \right )\f]
31 Example how to extract clusters from 'OldFaithful' sample using BIRCH algorithm:
33 from pyclustering.cluster.birch import birch
34 from pyclustering.cluster import cluster_visualizer
35 from pyclustering.utils import read_sample
36 from pyclustering.samples.definitions import FAMOUS_SAMPLES
38 # Sample for cluster analysis (represented by list)
39 sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
41 # Create BIRCH algorithm
42 birch_instance = birch(sample, 2, diameter=3.0)
45 birch_instance.process()
47 # Obtain results of clustering
48 clusters = birch_instance.get_clusters()
50 # Visualize allocated clusters
51 visualizer = cluster_visualizer()
52 visualizer.append_clusters(clusters, sample)
56 Here is the clustering result produced by BIRCH algorithm:
57 @image html birch_clustering_old_faithful.png "Fig. 1. BIRCH clustering - sample 'OldFaithful'."
59 Methods 'get_cf_entries' and 'get_cf_clusters' can be used to obtain information how does an input data is
60 encoded. Here is an example how the encoding information can be extracted and visualized:
62 from pyclustering.cluster.birch import birch
63 from pyclustering.cluster import cluster_visualizer
64 from pyclustering.utils import read_sample
65 from pyclustering.samples.definitions import FCPS_SAMPLES
67 # Sample 'Lsun' for cluster analysis (represented by list of points)
68 sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
70 # Create BIRCH algorithm
71 birch_instance = birch(sample, 3, diameter=0.5)
74 birch_instance.process()
76 # Obtain results of clustering
77 clusters = birch_instance.get_clusters()
79 # Obtain information how does the 'Lsun' sample is encoded in the CF-tree.
80 cf_entries = birch_instance.get_cf_entries()
81 cf_clusters = birch_instance.get_cf_cluster()
83 cf_centroids = [entry.get_centroid() for entry in cf_entries]
85 # Visualize allocated clusters
86 visualizer = cluster_visualizer(2, 2, titles=["Encoded data by CF-entries", "Data clusters"])
87 visualizer.append_clusters(cf_clusters, cf_centroids, canvas=0)
88 visualizer.append_clusters(clusters, sample, canvas=1)
92 Here is the clustering result produced by BIRCH algorithm:
93 @image html birch_cf_encoding_lsun.png "Fig. 2. CF-tree encoding and BIRCH clustering of 'Lsun' sample."
97 def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5,
98 type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
100 diameter_multiplier=1.5,
103 @brief Constructor of clustering algorithm BIRCH.
105 @param[in] data (list): An input data represented as a list of points (objects) where each point is be represented by list of coordinates.
106 @param[in] number_clusters (uint): Amount of clusters that should be allocated.
107 @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
108 @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
109 @param[in] diameter (double): CF-entry diameter that used for CF-Tree construction, it might be increase if 'entry_size_limit' is exceeded.
110 @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
111 @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded
112 during creation then the 'diameter' is increased and CF-Tree is rebuilt.
113 @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when 'entry_size_limit' is exceeded.
114 @param[in] ccore (bool): If True than C++ part of the library is used for processing.
129 self.
__tree =
cftree(branching_factor, max_node_entries, diameter, type_measurement)
137 @brief Performs cluster analysis in line with rules of BIRCH algorithm.
139 @return (birch) Returns itself (BIRCH instance).
148 cf_data = [feature.get_centroid()
for feature
in self.
__features]
154 set_encoding(type_encoding.CLUSTER_INDEX_LABELING).
get_clusters()
158 index_cf_entry = numpy.argmin(numpy.sum(numpy.square(
159 numpy.subtract(cf_data, self.
__pointer_data[index_point])), axis=1))
160 index_cluster = cf_labels[index_cf_entry]
161 self.
__clusters[index_cluster].append(index_point)
168 @brief Returns list of allocated clusters, each cluster is represented by a list of indexes where each index
169 corresponds to a point in an input dataset.
171 @return (list) List of allocated clusters.
182 @brief Returns CF-entries that encodes an input dataset.
184 @return (list) CF-entries that encodes an input dataset.
194 @brief Returns list of allocated CF-entry clusters where each cluster is represented by indexes (each index
195 corresponds to CF-entry).
197 @return (list) List of allocated CF-entry clusters.
207 @brief Returns clustering result representation type that indicate how clusters are encoded.
209 @return (type_encoding) Clustering result representation.
215 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
218 def __verify_arguments(self):
220 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
224 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
227 raise ValueError(
"Amount of cluster (current value: '%d') for allocation should be greater than 0." %
231 raise ValueError(
"Limit entry size (current value: '%d') should be greater than 0." %
235 def __extract_features(self):
237 @brief Extracts features from CF-tree cluster.
243 if len(self.
__tree.leafes) == 1:
245 for entry
in self.
__tree.leafes[0].entries:
250 for leaf_node
in self.
__tree.leafes:
254 def __insert_data(self):
256 @brief Inserts input data to the tree.
258 @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
264 self.
__tree.insert_point(point)
270 def __rebuild_tree(self, index_point):
272 @brief Rebuilt tree in case of maxumum number of entries is exceeded.
274 @param[in] index_point (uint): Index of point that is used as end point of re-building.
276 @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
280 rebuild_result =
False
285 while rebuild_result
is False:
287 if increased_diameter == 0.0:
288 increased_diameter = 1.0
293 for index_point
in range(0, index_point + 1):
295 tree.insert_point(point)
302 rebuild_result =
True