birch.py
1 """!
2 
3 @brief BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) cluster analysis algorithm.
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 import numpy
28 
29 from pyclustering.cluster.agglomerative import agglomerative, type_link
30 from pyclustering.cluster.encoder import cluster_encoder, type_encoding
31 
32 from pyclustering.container.cftree import cftree, measurement_type
33 
34 
35 class birch:
36  """!
37  @brief Class represents the clustering algorithm BIRCH (Balanced Iterative Reducing and Clustering using
38  Hierarchies).
39 
40  @details BIRCH is suitable for large databases. The algorithm incrementally and dynamically clusters
41  incoming multi-dimensional metric data points using the concepts of Clustering Feature and CF tree.
42  A Clustering Feature is a triple summarizing the information that is maintained about a cluster.
43  The Clustering Feature vector is defined as a triple:
44  \f[CF=\left ( N, \overrightarrow{LS}, SS \right )\f]
45 
46  Example how to extract clusters from 'OldFaithful' sample using BIRCH algorithm:
47  @code
48  from pyclustering.cluster.birch import birch
49  from pyclustering.cluster import cluster_visualizer
50  from pyclustering.utils import read_sample
51  from pyclustering.samples.definitions import FAMOUS_SAMPLES
52 
53  # Sample for cluster analysis (represented by list)
54  sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
55 
56  # Create BIRCH algorithm
57  birch_instance = birch(sample, 2, diameter=3.0)
58 
59  # Cluster analysis
60  birch_instance.process()
61 
62  # Obtain results of clustering
63  clusters = birch_instance.get_clusters()
64 
65  # Visualize allocated clusters
66  visualizer = cluster_visualizer()
67  visualizer.append_clusters(clusters, sample)
68  visualizer.show()
69  @endcode
70 
71  Here is the clustering result produced by BIRCH algorithm:
72  @image html birch_clustering_old_faithful.png "Fig. 1. BIRCH clustering - sample 'OldFaithful'."
73 
74  Methods 'get_cf_entries' and 'get_cf_clusters' can be used to obtain information how does an input data is
75  encoded. Here is an example how the encoding information can be extracted and visualized:
76  @code
77  from pyclustering.cluster.birch import birch
78  from pyclustering.cluster import cluster_visualizer
79  from pyclustering.utils import read_sample
80  from pyclustering.samples.definitions import FCPS_SAMPLES
81 
82  # Sample 'Lsun' for cluster analysis (represented by list of points)
83  sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
84 
85  # Create BIRCH algorithm
86  birch_instance = birch(sample, 3, diameter=0.5)
87 
88  # Cluster analysis
89  birch_instance.process()
90 
91  # Obtain results of clustering
92  clusters = birch_instance.get_clusters()
93 
94  # Obtain information how does the 'Lsun' sample is encoded in the CF-tree.
95  cf_entries = birch_instance.get_cf_entries()
96  cf_clusters = birch_instance.get_cf_cluster()
97 
98  cf_centroids = [entry.get_centroid() for entry in cf_entries]
99 
100  # Visualize allocated clusters
101  visualizer = cluster_visualizer(2, 2, titles=["Encoded data by CF-entries", "Data clusters"])
102  visualizer.append_clusters(cf_clusters, cf_centroids, canvas=0)
103  visualizer.append_clusters(clusters, sample, canvas=1)
104  visualizer.show()
105  @endcode
106 
107  Here is the clustering result produced by BIRCH algorithm:
108  @image html birch_cf_encoding_lsun.png "Fig. 2. CF-tree encoding and BIRCH clustering of 'Lsun' sample."
109 
110  """
111 
112  def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5,
113  type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
114  entry_size_limit=500,
115  diameter_multiplier=1.5,
116  ccore=True):
117  """!
118  @brief Constructor of clustering algorithm BIRCH.
119 
120  @param[in] data (list): An input data represented as a list of points (objects) where each point is be represented by list of coordinates.
121  @param[in] number_clusters (uint): Amount of clusters that should be allocated.
122  @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
123  @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
124  @param[in] diameter (double): CF-entry diameter that used for CF-Tree construction, it might be increase if 'entry_size_limit' is exceeded.
125  @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
126  @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded
127  during creation then the 'diameter' is increased and CF-Tree is rebuilt.
128  @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when 'entry_size_limit' is exceeded.
129  @param[in] ccore (bool): If True than C++ part of the library is used for processing.
130 
131  """
132 
133  self.__pointer_data = data
134  self.__number_clusters = number_clusters
135 
136  self.__measurement_type = type_measurement
137  self.__entry_size_limit = entry_size_limit
138  self.__diameter_multiplier = diameter_multiplier
139  self.__ccore = ccore
140 
141  self.__verify_arguments()
142 
143  self.__features = None
144  self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement)
145 
146  self.__clusters = []
147  self.__cf_clusters = []
148 
149 
150  def process(self):
151  """!
152  @brief Performs cluster analysis in line with rules of BIRCH algorithm.
153 
154  @return (birch) Returns itself (BIRCH instance).
155 
156  @see get_clusters()
157 
158  """
159 
160  self.__insert_data()
161  self.__extract_features()
162 
163  cf_data = [feature.get_centroid() for feature in self.__features]
164 
165  algorithm = agglomerative(cf_data, self.__number_clusters, type_link.SINGLE_LINK).process()
166  self.__cf_clusters = algorithm.get_clusters()
167 
168  cf_labels = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, self.__cf_clusters, cf_data).\
169  set_encoding(type_encoding.CLUSTER_INDEX_LABELING).get_clusters()
170 
171  self.__clusters = [[] for _ in range(len(self.__cf_clusters))]
172  for index_point in range(len(self.__pointer_data)):
173  index_cf_entry = numpy.argmin(numpy.sum(numpy.square(
174  numpy.subtract(cf_data, self.__pointer_data[index_point])), axis=1))
175  index_cluster = cf_labels[index_cf_entry]
176  self.__clusters[index_cluster].append(index_point)
177 
178  return self
179 
180 
181  def get_clusters(self):
182  """!
183  @brief Returns list of allocated clusters, each cluster is represented by a list of indexes where each index
184  corresponds to a point in an input dataset.
185 
186  @return (list) List of allocated clusters.
187 
188  @see process()
189 
190  """
191 
192  return self.__clusters
193 
194 
195  def get_cf_entries(self):
196  """!
197  @brief Returns CF-entries that encodes an input dataset.
198 
199  @return (list) CF-entries that encodes an input dataset.
200 
201  @see get_cf_cluster
202 
203  """
204  return self.__features
205 
206 
207  def get_cf_cluster(self):
208  """!
209  @brief Returns list of allocated CF-entry clusters where each cluster is represented by indexes (each index
210  corresponds to CF-entry).
211 
212  @return (list) List of allocated CF-entry clusters.
213 
214  @see get_cf_entries
215 
216  """
217  return self.__cf_clusters
218 
219 
221  """!
222  @brief Returns clustering result representation type that indicate how clusters are encoded.
223 
224  @return (type_encoding) Clustering result representation.
225 
226  @see get_clusters()
227 
228  """
229 
230  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
231 
232 
233  def __verify_arguments(self):
234  """!
235  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
236 
237  """
238  if len(self.__pointer_data) == 0:
239  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
240 
241  if self.__number_clusters <= 0:
242  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
243  self.__number_clusters)
244 
245  if self.__entry_size_limit <= 0:
246  raise ValueError("Limit entry size (current value: '%d') should be greater than 0." %
247  self.__entry_size_limit)
248 
249 
250  def __extract_features(self):
251  """!
252  @brief Extracts features from CF-tree cluster.
253 
254  """
255 
256  self.__features = []
257 
258  if len(self.__tree.leafes) == 1:
259  # parameters are too general, copy all entries
260  for entry in self.__tree.leafes[0].entries:
261  self.__features.append(entry)
262 
263  else:
264  # copy all leaf clustering features
265  for leaf_node in self.__tree.leafes:
266  self.__features += leaf_node.entries
267 
268 
269  def __insert_data(self):
270  """!
271  @brief Inserts input data to the tree.
272 
273  @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
274 
275  """
276 
277  for index_point in range(0, len(self.__pointer_data)):
278  point = self.__pointer_data[index_point]
279  self.__tree.insert_point(point)
280 
281  if self.__tree.amount_entries > self.__entry_size_limit:
282  self.__tree = self.__rebuild_tree(index_point)
283 
284 
285  def __rebuild_tree(self, index_point):
286  """!
287  @brief Rebuilt tree in case of maxumum number of entries is exceeded.
288 
289  @param[in] index_point (uint): Index of point that is used as end point of re-building.
290 
291  @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
292 
293  """
294 
295  rebuild_result = False
296  increased_diameter = self.__tree.threshold * self.__diameter_multiplier
297 
298  tree = None
299 
300  while rebuild_result is False:
301  # increase diameter and rebuild tree
302  if increased_diameter == 0.0:
303  increased_diameter = 1.0
304 
305  # build tree with update parameters
306  tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement)
307 
308  for index_point in range(0, index_point + 1):
309  point = self.__pointer_data[index_point]
310  tree.insert_point(point)
311 
312  if tree.amount_entries > self.__entry_size_limit:
313  increased_diameter *= self.__diameter_multiplier
314  continue
315 
316  # Re-build is successful.
317  rebuild_result = True
318 
319  return tree
def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=500, diameter_multiplier=1.5, ccore=True)
Constructor of clustering algorithm BIRCH.
Definition: birch.py:116
def __extract_features(self)
Extracts features from CF-tree cluster.
Definition: birch.py:250
def __insert_data(self)
Inserts input data to the tree.
Definition: birch.py:269
Module for representing clustering results.
Definition: encoder.py:1
Class represents agglomerative algorithm for cluster analysis.
def get_cf_cluster(self)
Returns list of allocated CF-entry clusters where each cluster is represented by indexes (each index ...
Definition: birch.py:207
def __rebuild_tree(self, index_point)
Rebuilt tree in case of maxumum number of entries is exceeded.
Definition: birch.py:285
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: birch.py:233
Class represents the clustering algorithm BIRCH (Balanced Iterative Reducing and Clustering using Hie...
Definition: birch.py:35
CF-Tree representation.
Definition: cftree.py:699
def process(self)
Performs cluster analysis in line with rules of BIRCH algorithm.
Definition: birch.py:150
def get_cf_entries(self)
Returns CF-entries that encodes an input dataset.
Definition: birch.py:195
Data Structure: CF-Tree.
Definition: cftree.py:1
Provides service to change clustering result representation.
Definition: encoder.py:46
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: birch.py:220
Cluster analysis algorithm: agglomerative algorithm.
Definition: agglomerative.py:1
def get_clusters(self)
Returns list of allocated clusters, each cluster is represented by a list of indexes where each index...
Definition: birch.py:181