birch.py
1 """!
2 
3 @brief Cluster analysis algorithm: BIRCH
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.utils import linear_sum, square_sum
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.container.cftree import cftree, cfentry, measurement_type
33 
34 
35 class birch:
36  """!
37  @brief Class represents clustering algorithm BIRCH.
38 
39  Example how to extract clusters from 'OldFaithful' sample using BIRCH algorithm:
40  @code
41  from pyclustering.cluster.birch import birch, measurement_type
42  from pyclustering.cluster import cluster_visualizer
43  from pyclustering.utils import read_sample
44  from pyclustering.samples.definitions import FAMOUS_SAMPLES
45 
46  # Sample for cluster analysis (represented by list)
47  sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
48 
49  # Create BIRCH algorithm
50  birch_instance = birch(sample, 2)
51 
52  # Cluster analysis
53  birch_instance.process()
54 
55  # Obtain results of clustering
56  clusters = birch_instance.get_clusters()
57 
58  # Visualize allocated clusters
59  visualizer = cluster_visualizer()
60  visualizer.append_clusters(clusters, sample)
61  visualizer.show()
62  @endcode
63 
64  """
65 
66  def __init__(self, data, number_clusters, branching_factor=5, max_node_entries=5, initial_diameter=0.1,
67  type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
68  entry_size_limit=200,
69  diameter_multiplier=1.5,
70  ccore=True):
71  """!
72  @brief Constructor of clustering algorithm BIRCH.
73 
74  @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
75  @param[in] number_clusters (uint): Number of clusters that should be allocated.
76  @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
77  @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
78  @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
79  @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
80  @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
81  @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
82  @param[in] ccore (bool): If True than CCORE (C++ part of the library) will be used for solving the problem.
83 
84  @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
85 
86  """
87 
88  self.__pointer_data = data
89  self.__number_clusters = number_clusters
90 
91  self.__measurement_type = type_measurement
92  self.__entry_size_limit = entry_size_limit
93  self.__diameter_multiplier = diameter_multiplier
94  self.__ccore = ccore
95 
96  self.__verify_arguments()
97 
98  self.__features = None
99  self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
100 
101  self.__clusters = []
102  self.__noise = []
103 
104 
105  def process(self):
106  """!
107  @brief Performs cluster analysis in line with rules of BIRCH algorithm.
108 
109  @return (birch) Returns itself (BIRCH instance).
110 
111  @see get_clusters()
112 
113  """
114 
115  self.__insert_data()
116  self.__extract_features()
117 
118  # in line with specification modify hierarchical algorithm should be used for further clustering
119  current_number_clusters = len(self.__features)
120 
121  while current_number_clusters > self.__number_clusters:
122  indexes = self.__find_nearest_cluster_features()
123 
124  self.__features[indexes[0]] += self.__features[indexes[1]]
125  self.__features.pop(indexes[1])
126 
127  current_number_clusters = len(self.__features)
128 
129  # decode data
130  self.__decode_data()
131  return self
132 
133 
134  def get_clusters(self):
135  """!
136  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
137 
138  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
139 
140  @return (list) List of allocated clusters.
141 
142  @see process()
143  @see get_noise()
144 
145  """
146 
147  return self.__clusters
148 
149 
151  """!
152  @brief Returns clustering result representation type that indicate how clusters are encoded.
153 
154  @return (type_encoding) Clustering result representation.
155 
156  @see get_clusters()
157 
158  """
159 
160  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
161 
162 
163  def __verify_arguments(self):
164  """!
165  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
166 
167  """
168  if len(self.__pointer_data) == 0:
169  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
170 
171  if self.__number_clusters <= 0:
172  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
173  self.__number_clusters)
174 
175  if self.__entry_size_limit <= 0:
176  raise ValueError("Limit entry size (current value: '%d') should be greater than 0." %
177  self.__entry_size_limit)
178 
179 
180  def __extract_features(self):
181  """!
182  @brief Extracts features from CF-tree cluster.
183 
184  """
185 
186  self.__features = []
187 
188  if len(self.__tree.leafes) == 1:
189  # parameters are too general, copy all entries
190  for entry in self.__tree.leafes[0].entries:
191  self.__features.append(entry)
192 
193  else:
194  # copy all leaf clustering features
195  for node in self.__tree.leafes:
196  self.__features.append(node.feature)
197 
198 
199  def __decode_data(self):
200  """!
201  @brief Decodes data from CF-tree features.
202 
203  """
204 
205  self.__clusters = [[] for _ in range(self.__number_clusters)]
206  self.__noise = []
207 
208  for index_point in range(0, len(self.__pointer_data)):
209  (_, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features)
210 
211  self.__clusters[cluster_index].append(index_point)
212 
213 
214  def __insert_data(self):
215  """!
216  @brief Inserts input data to the tree.
217 
218  @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
219 
220  """
221 
222  for index_point in range(0, len(self.__pointer_data)):
223  point = self.__pointer_data[index_point]
224  self.__tree.insert_cluster([point])
225 
226  if self.__tree.amount_entries > self.__entry_size_limit:
227  self.__tree = self.__rebuild_tree(index_point)
228 
229  #self.__tree.show_feature_destibution(self.__pointer_data);
230 
231 
232  def __rebuild_tree(self, index_point):
233  """!
234  @brief Rebuilt tree in case of maxumum number of entries is exceeded.
235 
236  @param[in] index_point (uint): Index of point that is used as end point of re-building.
237 
238  @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
239 
240  """
241 
242  rebuild_result = False
243  increased_diameter = self.__tree.threshold * self.__diameter_multiplier
244 
245  tree = None
246 
247  while rebuild_result is False:
248  # increase diameter and rebuild tree
249  if increased_diameter == 0.0:
250  increased_diameter = 1.0
251 
252  # build tree with update parameters
253  tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement)
254 
255  for index_point in range(0, index_point + 1):
256  point = self.__pointer_data[index_point]
257  tree.insert_cluster([point])
258 
259  if tree.amount_entries > self.__entry_size_limit:
260  increased_diameter *= self.__diameter_multiplier
261  continue
262 
263  # Re-build is successful.
264  rebuild_result = True
265 
266  return tree
267 
268 
269  def __find_nearest_cluster_features(self):
270  """!
271  @brief Find pair of nearest CF entries.
272 
273  @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
274 
275  """
276 
277  minimum_distance = float("Inf")
278  index1 = 0
279  index2 = 0
280 
281  for index_candidate1 in range(0, len(self.__features)):
282  feature1 = self.__features[index_candidate1]
283  for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
284  feature2 = self.__features[index_candidate2]
285 
286  distance = feature1.get_distance(feature2, self.__measurement_type)
287  if distance < minimum_distance:
288  minimum_distance = distance
289 
290  index1 = index_candidate1
291  index2 = index_candidate2
292 
293  return [index1, index2]
294 
295 
296  def __get_nearest_feature(self, point, feature_collection):
297  """!
298  @brief Find nearest entry for specified point.
299 
300  @param[in] point (list): Pointer to point from input dataset.
301  @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
302 
303  @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
304 
305  """
306 
307  minimum_distance = float("Inf")
308  index_nearest_feature = -1
309 
310  for index_entry in range(0, len(feature_collection)):
311  point_entry = cfentry(1, linear_sum([ point ]), square_sum([point]))
312 
313  distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type)
314  if distance < minimum_distance:
315  minimum_distance = distance
316  index_nearest_feature = index_entry
317 
318  return minimum_distance, index_nearest_feature
def __extract_features(self)
Extracts features from CF-tree cluster.
Definition: birch.py:180
def __insert_data(self)
Inserts input data to the tree.
Definition: birch.py:214
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def __rebuild_tree(self, index_point)
Rebuilt tree in case of maxumum number of entries is exceeded.
Definition: birch.py:232
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: birch.py:163
Class represents clustering algorithm BIRCH.
Definition: birch.py:35
CF-Tree representation.
Definition: cftree.py:756
def process(self)
Performs cluster analysis in line with rules of BIRCH algorithm.
Definition: birch.py:105
def __init__(self, data, number_clusters, branching_factor=5, max_node_entries=5, initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=200, diameter_multiplier=1.5, ccore=True)
Constructor of clustering algorithm BIRCH.
Definition: birch.py:70
def __decode_data(self)
Decodes data from CF-tree features.
Definition: birch.py:199
Clustering feature representation.
Definition: cftree.py:82
Data Structure: CF-Tree.
Definition: cftree.py:1
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: birch.py:150
def __get_nearest_feature(self, point, feature_collection)
Find nearest entry for specified point.
Definition: birch.py:296
def __find_nearest_cluster_features(self)
Find pair of nearest CF entries.
Definition: birch.py:269
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: birch.py:134