pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
birch.py
1 """!
2 
3 @brief BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) cluster analysis algorithm.
4 @details Implementation based on paper @cite article::birch::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 import numpy
13 
14 from pyclustering.cluster.agglomerative import agglomerative, type_link
15 from pyclustering.cluster.encoder import cluster_encoder, type_encoding
16 
17 from pyclustering.container.cftree import cftree, measurement_type
18 
19 
20 class birch:
21  """!
22  @brief Class represents the clustering algorithm BIRCH (Balanced Iterative Reducing and Clustering using
23  Hierarchies).
24 
25  @details BIRCH is suitable for large databases. The algorithm incrementally and dynamically clusters
26  incoming multi-dimensional metric data points using the concepts of Clustering Feature and CF tree.
27  A Clustering Feature is a triple summarizing the information that is maintained about a cluster.
28  The Clustering Feature vector is defined as a triple:
29  \f[CF=\left ( N, \overrightarrow{LS}, SS \right )\f]
30 
31  Example how to extract clusters from 'OldFaithful' sample using BIRCH algorithm:
32  @code
33  from pyclustering.cluster.birch import birch
34  from pyclustering.cluster import cluster_visualizer
35  from pyclustering.utils import read_sample
36  from pyclustering.samples.definitions import FAMOUS_SAMPLES
37 
38  # Sample for cluster analysis (represented by list)
39  sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
40 
41  # Create BIRCH algorithm
42  birch_instance = birch(sample, 2, diameter=3.0)
43 
44  # Cluster analysis
45  birch_instance.process()
46 
47  # Obtain results of clustering
48  clusters = birch_instance.get_clusters()
49 
50  # Visualize allocated clusters
51  visualizer = cluster_visualizer()
52  visualizer.append_clusters(clusters, sample)
53  visualizer.show()
54  @endcode
55 
56  Here is the clustering result produced by BIRCH algorithm:
57  @image html birch_clustering_old_faithful.png "Fig. 1. BIRCH clustering - sample 'OldFaithful'."
58 
59  Methods 'get_cf_entries' and 'get_cf_clusters' can be used to obtain information how does an input data is
60  encoded. Here is an example how the encoding information can be extracted and visualized:
61  @code
62  from pyclustering.cluster.birch import birch
63  from pyclustering.cluster import cluster_visualizer
64  from pyclustering.utils import read_sample
65  from pyclustering.samples.definitions import FCPS_SAMPLES
66 
67  # Sample 'Lsun' for cluster analysis (represented by list of points)
68  sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
69 
70  # Create BIRCH algorithm
71  birch_instance = birch(sample, 3, diameter=0.5)
72 
73  # Cluster analysis
74  birch_instance.process()
75 
76  # Obtain results of clustering
77  clusters = birch_instance.get_clusters()
78 
79  # Obtain information how does the 'Lsun' sample is encoded in the CF-tree.
80  cf_entries = birch_instance.get_cf_entries()
81  cf_clusters = birch_instance.get_cf_cluster()
82 
83  cf_centroids = [entry.get_centroid() for entry in cf_entries]
84 
85  # Visualize allocated clusters
86  visualizer = cluster_visualizer(2, 2, titles=["Encoded data by CF-entries", "Data clusters"])
87  visualizer.append_clusters(cf_clusters, cf_centroids, canvas=0)
88  visualizer.append_clusters(clusters, sample, canvas=1)
89  visualizer.show()
90  @endcode
91 
92  Here is the clustering result produced by BIRCH algorithm:
93  @image html birch_cf_encoding_lsun.png "Fig. 2. CF-tree encoding and BIRCH clustering of 'Lsun' sample."
94 
95  """
96 
97  def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5,
98  type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
99  entry_size_limit=500,
100  diameter_multiplier=1.5,
101  ccore=True):
102  """!
103  @brief Constructor of clustering algorithm BIRCH.
104 
105  @param[in] data (list): An input data represented as a list of points (objects) where each point is be represented by list of coordinates.
106  @param[in] number_clusters (uint): Amount of clusters that should be allocated.
107  @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
108  @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
109  @param[in] diameter (double): CF-entry diameter that used for CF-Tree construction, it might be increase if 'entry_size_limit' is exceeded.
110  @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
111  @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded
112  during creation then the 'diameter' is increased and CF-Tree is rebuilt.
113  @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when 'entry_size_limit' is exceeded.
114  @param[in] ccore (bool): If True than C++ part of the library is used for processing.
115 
116  """
117 
118  self.__pointer_data = data
119  self.__number_clusters = number_clusters
120 
121  self.__measurement_type = type_measurement
122  self.__entry_size_limit = entry_size_limit
123  self.__diameter_multiplier = diameter_multiplier
124  self.__ccore = ccore
125 
126  self.__verify_arguments()
127 
128  self.__features = None
129  self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement)
130 
131  self.__clusters = []
132  self.__cf_clusters = []
133 
134 
135  def process(self):
136  """!
137  @brief Performs cluster analysis in line with rules of BIRCH algorithm.
138 
139  @return (birch) Returns itself (BIRCH instance).
140 
141  @see get_clusters()
142 
143  """
144 
145  self.__insert_data()
146  self.__extract_features()
147 
148  cf_data = [feature.get_centroid() for feature in self.__features]
149 
150  algorithm = agglomerative(cf_data, self.__number_clusters, type_link.SINGLE_LINK).process()
151  self.__cf_clusters = algorithm.get_clusters()
152 
153  cf_labels = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, self.__cf_clusters, cf_data).\
154  set_encoding(type_encoding.CLUSTER_INDEX_LABELING).get_clusters()
155 
156  self.__clusters = [[] for _ in range(len(self.__cf_clusters))]
157  for index_point in range(len(self.__pointer_data)):
158  index_cf_entry = numpy.argmin(numpy.sum(numpy.square(
159  numpy.subtract(cf_data, self.__pointer_data[index_point])), axis=1))
160  index_cluster = cf_labels[index_cf_entry]
161  self.__clusters[index_cluster].append(index_point)
162 
163  return self
164 
165 
166  def get_clusters(self):
167  """!
168  @brief Returns list of allocated clusters, each cluster is represented by a list of indexes where each index
169  corresponds to a point in an input dataset.
170 
171  @return (list) List of allocated clusters.
172 
173  @see process()
174 
175  """
176 
177  return self.__clusters
178 
179 
180  def get_cf_entries(self):
181  """!
182  @brief Returns CF-entries that encodes an input dataset.
183 
184  @return (list) CF-entries that encodes an input dataset.
185 
186  @see get_cf_cluster
187 
188  """
189  return self.__features
190 
191 
192  def get_cf_cluster(self):
193  """!
194  @brief Returns list of allocated CF-entry clusters where each cluster is represented by indexes (each index
195  corresponds to CF-entry).
196 
197  @return (list) List of allocated CF-entry clusters.
198 
199  @see get_cf_entries
200 
201  """
202  return self.__cf_clusters
203 
204 
206  """!
207  @brief Returns clustering result representation type that indicate how clusters are encoded.
208 
209  @return (type_encoding) Clustering result representation.
210 
211  @see get_clusters()
212 
213  """
214 
215  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
216 
217 
218  def __verify_arguments(self):
219  """!
220  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
221 
222  """
223  if len(self.__pointer_data) == 0:
224  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
225 
226  if self.__number_clusters <= 0:
227  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
228  self.__number_clusters)
229 
230  if self.__entry_size_limit <= 0:
231  raise ValueError("Limit entry size (current value: '%d') should be greater than 0." %
232  self.__entry_size_limit)
233 
234 
235  def __extract_features(self):
236  """!
237  @brief Extracts features from CF-tree cluster.
238 
239  """
240 
241  self.__features = []
242 
243  if len(self.__tree.leafes) == 1:
244  # parameters are too general, copy all entries
245  for entry in self.__tree.leafes[0].entries:
246  self.__features.append(entry)
247 
248  else:
249  # copy all leaf clustering features
250  for leaf_node in self.__tree.leafes:
251  self.__features += leaf_node.entries
252 
253 
254  def __insert_data(self):
255  """!
256  @brief Inserts input data to the tree.
257 
258  @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
259 
260  """
261 
262  for index_point in range(0, len(self.__pointer_data)):
263  point = self.__pointer_data[index_point]
264  self.__tree.insert_point(point)
265 
266  if self.__tree.amount_entries > self.__entry_size_limit:
267  self.__tree = self.__rebuild_tree(index_point)
268 
269 
270  def __rebuild_tree(self, index_point):
271  """!
272  @brief Rebuilt tree in case of maxumum number of entries is exceeded.
273 
274  @param[in] index_point (uint): Index of point that is used as end point of re-building.
275 
276  @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
277 
278  """
279 
280  rebuild_result = False
281  increased_diameter = self.__tree.threshold * self.__diameter_multiplier
282 
283  tree = None
284 
285  while rebuild_result is False:
286  # increase diameter and rebuild tree
287  if increased_diameter == 0.0:
288  increased_diameter = 1.0
289 
290  # build tree with update parameters
291  tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement)
292 
293  for index_point in range(0, index_point + 1):
294  point = self.__pointer_data[index_point]
295  tree.insert_point(point)
296 
297  if tree.amount_entries > self.__entry_size_limit:
298  increased_diameter *= self.__diameter_multiplier
299  continue
300 
301  # Re-build is successful.
302  rebuild_result = True
303 
304  return tree
pyclustering.cluster.birch.birch.__ccore
__ccore
Definition: birch.py:120
pyclustering.cluster.birch.birch.__init__
def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=500, diameter_multiplier=1.5, ccore=True)
Constructor of clustering algorithm BIRCH.
Definition: birch.py:97
pyclustering.cluster.birch.birch.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster is represented by a list of indexes where each index...
Definition: birch.py:166
pyclustering.cluster.birch.birch
Class represents the clustering algorithm BIRCH (Balanced Iterative Reducing and Clustering using Hie...
Definition: birch.py:20
pyclustering.cluster.birch.birch.__tree
__tree
Definition: birch.py:125
pyclustering.cluster.birch.birch.__extract_features
def __extract_features(self)
Extracts features from CF-tree cluster.
Definition: birch.py:235
pyclustering.cluster.birch.birch.__pointer_data
__pointer_data
Definition: birch.py:114
pyclustering.container.cftree.cftree
CF-Tree representation.
Definition: cftree.py:684
pyclustering.cluster.agglomerative
Cluster analysis algorithm: agglomerative algorithm.
Definition: agglomerative.py:1
pyclustering.cluster.agglomerative.agglomerative
Class represents agglomerative algorithm for cluster analysis.
Definition: agglomerative.py:43
pyclustering.cluster.birch.birch.__rebuild_tree
def __rebuild_tree(self, index_point)
Rebuilt tree in case of maxumum number of entries is exceeded.
Definition: birch.py:270
pyclustering.cluster.birch.birch.get_cf_cluster
def get_cf_cluster(self)
Returns list of allocated CF-entry clusters where each cluster is represented by indexes (each index ...
Definition: birch.py:192
pyclustering.cluster.encoder.cluster_encoder
Provides service to change clustering result representation.
Definition: encoder.py:32
pyclustering.cluster.birch.birch.__clusters
__clusters
Definition: birch.py:127
pyclustering.container.cftree
Data Structure: CF-Tree.
Definition: cftree.py:1
pyclustering.cluster.birch.birch.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: birch.py:218
pyclustering.cluster.birch.birch.process
def process(self)
Performs cluster analysis in line with rules of BIRCH algorithm.
Definition: birch.py:135
pyclustering.cluster.birch.birch.__features
__features
Definition: birch.py:124
pyclustering.cluster.birch.birch.__number_clusters
__number_clusters
Definition: birch.py:115
pyclustering.cluster.birch.birch.__cf_clusters
__cf_clusters
Definition: birch.py:128
pyclustering.cluster.birch.birch.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: birch.py:205
pyclustering.cluster.birch.birch.get_cf_entries
def get_cf_entries(self)
Returns CF-entries that encodes an input dataset.
Definition: birch.py:180
pyclustering.cluster.birch.birch.__entry_size_limit
__entry_size_limit
Definition: birch.py:118
pyclustering.cluster.birch.birch.__measurement_type
__measurement_type
Definition: birch.py:117
pyclustering.cluster.birch.birch.__diameter_multiplier
__diameter_multiplier
Definition: birch.py:119
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.cluster.birch.birch.__insert_data
def __insert_data(self)
Inserts input data to the tree.
Definition: birch.py:254