3 @brief Cluster analysis algorithm: BIRCH 4 @details Implementation based on paper @cite article::birch::1. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 37 @brief Class represents clustering algorithm BIRCH. 41 # sample for cluster analysis (represented by list) 42 sample = read_sample(path_to_sample); 44 # create object of birch that uses CCORE for processing 45 birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True); 48 birch_instance.process(); 50 # obtain results of clustering 51 clusters = birch_instance.get_clusters(); 56 def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, ccore = True):
58 @brief Constructor of clustering algorithm BIRCH. 60 @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple. 61 @param[in] number_clusters (uint): Number of clusters that should be allocated. 62 @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree. 63 @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree. 64 @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded. 65 @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics. 66 @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt. 67 @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded. 68 @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem. 70 @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation. 74 birch_instance1 = birch(sample1, 2); # two clusters should be allocated 75 birch_instance2 = birch(sample2, 5); # five clusters should be allocated 77 # three clusters should be allocated, but also each leaf node can have maximum 5 78 # entries and each entry can have maximum 5 descriptors with initial diameter 0.05. 79 birch_instance3 = birch(sample3, 3, 5, 5, 0.05); 93 self.
__tree =
cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
101 @brief Performs cluster analysis in line with rules of BIRCH algorithm. 103 @remark Results of clustering can be obtained using corresponding gets methods. 113 current_number_clusters = len(self.
__features);
121 current_number_clusters = len(self.
__features);
129 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 131 @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned. 133 @return (list) List of allocated clusters. 145 @brief Returns clustering result representation type that indicate how clusters are encoded. 147 @return (type_encoding) Clustering result representation. 153 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
156 def __extract_features(self):
158 @brief Extracts features from CF-tree cluster. 164 if (len(self.
__tree.leafes) == 1):
166 for entry
in self.
__tree.leafes[0].entries:
171 for node
in self.
__tree.leafes:
175 def __decode_data(self):
177 @brief Decodes data from CF-tree features. 187 self.
__clusters[cluster_index].append(index_point);
190 def __insert_data(self):
192 @brief Inserts input data to the tree. 194 @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt. 200 self.
__tree.insert_cluster( [ point ] );
208 def __rebuild_tree(self, index_point):
210 @brief Rebuilt tree in case of maxumum number of entries is exceeded. 212 @param[in] index_point (uint): Index of point that is used as end point of re-building. 214 @return (cftree) Rebuilt tree with encoded points till specified point from input data space. 218 rebuild_result =
False;
223 while(rebuild_result
is False):
225 if (increased_diameter == 0.0):
226 increased_diameter = 1.0;
229 tree =
cftree(self.
__tree.branch_factor, self.
__tree.max_entries, increased_diameter, self.
__tree.type_measurement);
231 for index_point
in range(0, index_point + 1):
233 tree.insert_cluster([point]);
240 rebuild_result =
True;
245 def __find_nearest_cluster_features(self):
247 @brief Find pair of nearest CF entries. 249 @return (list) List of two nearest enties that are represented by list [index_point1, index_point2]. 253 minimum_distance = float(
"Inf");
257 for index_candidate1
in range(0, len(self.
__features)):
259 for index_candidate2
in range(index_candidate1 + 1, len(self.
__features)):
263 if (distance < minimum_distance):
264 minimum_distance = distance;
266 index1 = index_candidate1;
267 index2 = index_candidate2;
269 return [index1, index2];
272 def __get_nearest_feature(self, point, feature_collection):
274 @brief Find nearest entry for specified point. 276 @param[in] point (list): Pointer to point from input dataset. 277 @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point. 279 @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry. 283 minimum_distance = float(
"Inf");
284 index_nearest_feature = -1;
286 for index_entry
in range(0, len(feature_collection)):
287 point_entry =
cfentry(1, linear_sum([ point ]), square_sum([ point ]));
289 distance = feature_collection[index_entry].get_distance(point_entry, self.
__measurement_type);
290 if (distance < minimum_distance):
291 minimum_distance = distance;
292 index_nearest_feature = index_entry;
294 return (minimum_distance, index_nearest_feature);
def __extract_features(self)
Extracts features from CF-tree cluster.
def __insert_data(self)
Inserts input data to the tree.
Utils that are used by modules of pyclustering.
Module for representing clustering results.
def __rebuild_tree(self, index_point)
Rebuilt tree in case of maxumum number of entries is exceeded.
Class represents clustering algorithm BIRCH.
def process(self)
Performs cluster analysis in line with rules of BIRCH algorithm.
def __init__(self, data, number_clusters, branching_factor=5, max_node_entries=5, initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=200, diameter_multiplier=1.5, ccore=True)
Constructor of clustering algorithm BIRCH.
def __decode_data(self)
Decodes data from CF-tree features.
Clustering feature representation.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __get_nearest_feature(self, point, feature_collection)
Find nearest entry for specified point.
def __find_nearest_cluster_features(self)
Find pair of nearest CF entries.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...