birch.py
1 """!
2 
3 @brief Cluster analysis algorithm: BIRCH
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.utils import linear_sum, square_sum;
29 
30 from pyclustering.cluster.encoder import type_encoding;
31 
32 from pyclustering.container.cftree import cftree, cfentry, measurement_type;
33 
34 
35 class birch:
36  """!
37  @brief Class represents clustering algorithm BIRCH.
38 
39  Example:
40  @code
41  # sample for cluster analysis (represented by list)
42  sample = read_sample(path_to_sample);
43 
44  # create object of birch that uses CCORE for processing
45  birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
46 
47  # cluster analysis
48  birch_instance.process();
49 
50  # obtain results of clustering
51  clusters = birch_instance.get_clusters();
52  @endcode
53 
54  """
55 
56  def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, ccore = True):
57  """!
58  @brief Constructor of clustering algorithm BIRCH.
59 
60  @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
61  @param[in] number_clusters (uint): Number of clusters that should be allocated.
62  @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
63  @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
64  @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
65  @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
66  @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
67  @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
68  @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
69 
70  @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
71 
72  Example:
73  @code
74  birch_instance1 = birch(sample1, 2); # two clusters should be allocated
75  birch_instance2 = birch(sample2, 5); # five clusters should be allocated
76 
77  # three clusters should be allocated, but also each leaf node can have maximum 5
78  # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
79  birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
80  @endcode
81 
82  """
83 
84  self.__pointer_data = data;
85  self.__number_clusters = number_clusters;
86 
87  self.__measurement_type = type_measurement;
88  self.__entry_size_limit = entry_size_limit;
89  self.__diameter_multiplier = diameter_multiplier;
90  self.__ccore = ccore;
91 
92  self.__features = None;
93  self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
94 
95  self.__clusters = [];
96  self.__noise = [];
97 
98 
99  def process(self):
100  """!
101  @brief Performs cluster analysis in line with rules of BIRCH algorithm.
102 
103  @remark Results of clustering can be obtained using corresponding gets methods.
104 
105  @see get_clusters()
106 
107  """
108 
109  self.__insert_data();
110  self.__extract_features();
111 
112  # in line with specification modify hierarchical algorithm should be used for further clustering
113  current_number_clusters = len(self.__features);
114 
115  while (current_number_clusters > self.__number_clusters):
116  indexes = self.__find_nearest_cluster_features();
117 
118  self.__features[indexes[0]] += self.__features[indexes[1]];
119  self.__features.pop(indexes[1]);
120 
121  current_number_clusters = len(self.__features);
122 
123  # decode data
124  self.__decode_data();
125 
126 
127  def get_clusters(self):
128  """!
129  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
130 
131  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
132 
133  @return (list) List of allocated clusters.
134 
135  @see process()
136  @see get_noise()
137 
138  """
139 
140  return self.__clusters;
141 
142 
144  """!
145  @brief Returns clustering result representation type that indicate how clusters are encoded.
146 
147  @return (type_encoding) Clustering result representation.
148 
149  @see get_clusters()
150 
151  """
152 
153  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
154 
155 
156  def __extract_features(self):
157  """!
158  @brief Extracts features from CF-tree cluster.
159 
160  """
161 
162  self.__features = [];
163 
164  if (len(self.__tree.leafes) == 1):
165  # parameters are too general, copy all entries
166  for entry in self.__tree.leafes[0].entries:
167  self.__features.append(entry);
168 
169  else:
170  # copy all leaf clustering features
171  for node in self.__tree.leafes:
172  self.__features.append(node.feature);
173 
174 
175  def __decode_data(self):
176  """!
177  @brief Decodes data from CF-tree features.
178 
179  """
180 
181  self.__clusters = [ [] for _ in range(self.__number_clusters) ];
182  self.__noise = [];
183 
184  for index_point in range(0, len(self.__pointer_data)):
185  (_, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
186 
187  self.__clusters[cluster_index].append(index_point);
188 
189 
190  def __insert_data(self):
191  """!
192  @brief Inserts input data to the tree.
193 
194  @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
195 
196  """
197 
198  for index_point in range(0, len(self.__pointer_data)):
199  point = self.__pointer_data[index_point];
200  self.__tree.insert_cluster( [ point ] );
201 
202  if (self.__tree.amount_entries > self.__entry_size_limit):
203  self.__tree = self.__rebuild_tree(index_point);
204 
205  #self.__tree.show_feature_destibution(self.__pointer_data);
206 
207 
208  def __rebuild_tree(self, index_point):
209  """!
210  @brief Rebuilt tree in case of maxumum number of entries is exceeded.
211 
212  @param[in] index_point (uint): Index of point that is used as end point of re-building.
213 
214  @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
215 
216  """
217 
218  rebuild_result = False;
219  increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
220 
221  tree = None;
222 
223  while(rebuild_result is False):
224  # increase diameter and rebuild tree
225  if (increased_diameter == 0.0):
226  increased_diameter = 1.0;
227 
228  # build tree with update parameters
229  tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
230 
231  for index_point in range(0, index_point + 1):
232  point = self.__pointer_data[index_point];
233  tree.insert_cluster([point]);
234 
235  if (tree.amount_entries > self.__entry_size_limit):
236  increased_diameter *= self.__diameter_multiplier;
237  continue;
238 
239  # Re-build is successful.
240  rebuild_result = True;
241 
242  return tree;
243 
244 
245  def __find_nearest_cluster_features(self):
246  """!
247  @brief Find pair of nearest CF entries.
248 
249  @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
250 
251  """
252 
253  minimum_distance = float("Inf");
254  index1 = 0;
255  index2 = 0;
256 
257  for index_candidate1 in range(0, len(self.__features)):
258  feature1 = self.__features[index_candidate1];
259  for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
260  feature2 = self.__features[index_candidate2];
261 
262  distance = feature1.get_distance(feature2, self.__measurement_type);
263  if (distance < minimum_distance):
264  minimum_distance = distance;
265 
266  index1 = index_candidate1;
267  index2 = index_candidate2;
268 
269  return [index1, index2];
270 
271 
272  def __get_nearest_feature(self, point, feature_collection):
273  """!
274  @brief Find nearest entry for specified point.
275 
276  @param[in] point (list): Pointer to point from input dataset.
277  @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
278 
279  @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
280 
281  """
282 
283  minimum_distance = float("Inf");
284  index_nearest_feature = -1;
285 
286  for index_entry in range(0, len(feature_collection)):
287  point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
288 
289  distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
290  if (distance < minimum_distance):
291  minimum_distance = distance;
292  index_nearest_feature = index_entry;
293 
294  return (minimum_distance, index_nearest_feature);
def __extract_features(self)
Extracts features from CF-tree cluster.
Definition: birch.py:156
def __insert_data(self)
Inserts input data to the tree.
Definition: birch.py:190
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def __rebuild_tree(self, index_point)
Rebuilt tree in case of maxumum number of entries is exceeded.
Definition: birch.py:208
Class represents clustering algorithm BIRCH.
Definition: birch.py:35
CF-Tree representation.
Definition: cftree.py:761
def process(self)
Performs cluster analysis in line with rules of BIRCH algorithm.
Definition: birch.py:99
def __init__(self, data, number_clusters, branching_factor=5, max_node_entries=5, initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=200, diameter_multiplier=1.5, ccore=True)
Constructor of clustering algorithm BIRCH.
Definition: birch.py:56
def __decode_data(self)
Decodes data from CF-tree features.
Definition: birch.py:175
Clustering feature representation.
Definition: cftree.py:82
Data Structure: CF-Tree.
Definition: cftree.py:1
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: birch.py:143
def __get_nearest_feature(self, point, feature_collection)
Find nearest entry for specified point.
Definition: birch.py:272
def __find_nearest_cluster_features(self)
Find pair of nearest CF entries.
Definition: birch.py:245
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: birch.py:127