bsas.py
1 """!
2 
3 @brief Cluster analysis algorithm: BSAS (Basic Sequential Algorithmic Scheme).
4 @details Implementation based on paper @cite book::pattern_recognition::2009.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.core.wrapper import ccore_library
29 from pyclustering.core.bsas_wrapper import bsas as bsas_wrapper
30 from pyclustering.core.metric_wrapper import metric_wrapper
31 
32 from pyclustering.cluster import cluster_visualizer
33 from pyclustering.cluster.encoder import type_encoding
34 
35 from pyclustering.utils.metric import type_metric, distance_metric
36 
37 
class bsas_visualizer:
    """!
    @brief Visualizer of BSAS algorithm's results.
    @details BSAS visualizer provides visualization services that are specific for BSAS algorithm.

    """

    @staticmethod
    def show_clusters(sample, clusters, representatives, **kwargs):
        """!
        @brief Display BSAS clustering results.

        @param[in] sample (list): Dataset that was used for clustering.
        @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
        @param[in] representatives (array_like): Allocated representatives correspond to clusters
                    (one representative per cluster, in the same order as 'clusters').
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').

        <b>Keyword Args:</b><br>
            - figure (figure): If 'None' then new figure is created, otherwise specified figure is used for visualization.
            - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
            - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).

        @return (figure) Figure where clusters were drawn.

        """

        figure = kwargs.get('figure', None)
        display = kwargs.get('display', True)
        offset = kwargs.get('offset', 0)

        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample, canvas=offset)

        # Attach the representative of every cluster to that cluster on the same canvas.
        # NOTE(review): '*' and 10 are forwarded as-is to 'append_cluster_attribute' - presumably
        # the marker style and marker size; confirm against 'cluster_visualizer'.
        for cluster_index in range(len(clusters)):
            visualizer.append_cluster_attribute(offset, cluster_index, [representatives[cluster_index]], '*', 10)

        return visualizer.show(figure=figure, display=display)
75 
76 
class bsas:
    """!
    @brief Class represents BSAS clustering algorithm - basic sequential algorithmic scheme.
    @details Algorithm has two mandatory parameters: maximum allowable number of clusters and threshold
              of dissimilarity or in other words maximum distance between points. Distance metric also can
              be specified using 'metric' parameters, by default 'Euclidean' distance is used.
              BSAS using following rule for updating cluster representative:

    \f[
    \vec{m}_{C_{k}}^{new}=\frac{ \left ( n_{C_{k}^{new}} - 1 \right )\vec{m}_{C_{k}}^{old} + \vec{x} }{n_{C_{k}^{new}}}
    \f]

    Clustering results of this algorithm depends on objects order in input data.

    Example:
    @code
        from pyclustering.cluster.bsas import bsas, bsas_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read data sample from 'Simple02.data'.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2)

        # Prepare algorithm's parameters.
        max_clusters = 3
        threshold = 1.0

        # Create instance of BSAS algorithm.
        bsas_instance = bsas(sample, max_clusters, threshold)
        bsas_instance.process()

        # Get clustering results.
        clusters = bsas_instance.get_clusters()
        representatives = bsas_instance.get_representatives()

        # Display results.
        bsas_visualizer.show_clusters(sample, clusters, representatives)
    @endcode

    @see pyclustering.cluster.mbsas, pyclustering.cluster.ttsas

    """

    def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs):
        """!
        @brief Creates classical BSAS algorithm.

        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] maximum_clusters: Maximum allowable number of clusters that can be allocated during processing.
        @param[in] threshold: Threshold of dissimilarity (maximum distance) between points.
        @param[in] ccore (bool): If True than CCORE (C++ part of the library) will be used for solving.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points
               (by default Euclidean metric is used).

        @exception ValueError If input data is empty, or amount of clusters is not positive, or threshold is negative.

        """

        self._data = data
        self._amount = maximum_clusters
        self._threshold = threshold
        self._metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

        # A user-defined metric is never delegated to CCORE; the Python implementation is used instead.
        self._ccore = ccore and self._metric.get_type() != type_metric.USER_DEFINED

        self._clusters = []
        self._representatives = []

        if self._ccore is True:
            self._ccore = ccore_library.workable()

        self._verify_arguments()


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of BSAS algorithm.

        @return (bsas) Returns itself (BSAS instance).

        @remark Results of clustering can be obtained using corresponding get methods.

        @see get_clusters()
        @see get_representatives()

        """

        if self._ccore is True:
            self.__process_by_ccore()
        else:
            self.__prcess_by_python()

        return self


    def __process_by_ccore(self):
        """!
        @brief Performs cluster analysis by delegating processing to the CCORE wrapper.

        """
        ccore_metric = metric_wrapper.create_instance(self._metric)
        self._clusters, self._representatives = bsas_wrapper(self._data, self._amount, self._threshold, ccore_metric.get_pointer())


    def __prcess_by_python(self):
        """!
        @brief Performs cluster analysis using pure Python implementation.
        @details The first point forms the first cluster. Every following point either creates a new cluster
                  (when it is farther than the threshold from the nearest representative and the limit of
                  clusters is not reached) or joins the nearest cluster and shifts its representative.

        """
        # Start from a clean state so that a repeated 'process()' call does not accumulate
        # results on top of the previous run.
        self._clusters = [[0]]

        # Representatives are updated in place later, therefore they must be copies: storing the
        # original point would modify the caller's dataset and would fail for tuple points.
        self._representatives = [list(self._data[0])]

        for i in range(1, len(self._data)):
            point = self._data[i]
            index_cluster, distance = self._find_nearest_cluster(point)

            if (distance > self._threshold) and (len(self._clusters) < self._amount):
                self._representatives.append(list(point))
                self._clusters.append([i])
            else:
                self._clusters[index_cluster].append(i)
                self._update_representative(index_cluster, point)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @see process()
        @see get_representatives()

        """
        return self._clusters


    def get_representatives(self):
        """!
        @brief Returns list of representatives of allocated clusters.

        @see process()
        @see get_clusters()

        """
        return self._representatives


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def _find_nearest_cluster(self, point):
        """!
        @brief Find nearest cluster to the specified point.

        @param[in] point (list): Point from dataset.

        @return (uint, double) Index of nearest cluster and distance to it. If there are no representatives
                 then (-1, inf) is returned.

        """
        index_cluster = -1
        nearest_distance = float('inf')

        for index, representative in enumerate(self._representatives):
            distance = self._metric(point, representative)

            # Strict comparison: in case of equal distances the earliest cluster wins.
            if distance < nearest_distance:
                index_cluster = index
                nearest_distance = distance

        return index_cluster, nearest_distance


    def _update_representative(self, index_cluster, point):
        """!
        @brief Update cluster representative in line with new cluster size and added point to it.
        @details The point must be already added to the cluster, because the current cluster size is used
                  as the new size in the running mean formula.

        @param[in] index_cluster (uint): Index of cluster whose representative should be updated.
        @param[in] point (list): Point that was added to cluster.

        """
        length = len(self._clusters[index_cluster])
        rep = self._representatives[index_cluster]

        # Running mean: new = ((n - 1) * old + x) / n, applied to every coordinate in place.
        for dimension in range(len(rep)):
            rep[dimension] = ((length - 1) * rep[dimension] + point[dimension]) / length


    def _verify_arguments(self):
        """!
        @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.

        @exception ValueError If input data is empty, or amount of clusters is not positive, or threshold is negative.

        """
        if len(self._data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self._data))

        if self._amount <= 0:
            raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
                             self._amount)

        if self._threshold < 0:
            # '%s' is used because threshold is usually a float: '%d' would truncate it (e.g. -0.5 -> 0)
            # and report a misleading value.
            raise ValueError("Threshold of dissimilarity (current value: '%s') between points should be greater or "
                             "equal to 0." % self._threshold)
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:390
pyclustering module for cluster analysis.
Definition: __init__.py:1
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: bsas.py:214
Class represents BSAS clustering algorithm - basic sequential algorithmic scheme. ...
Definition: bsas.py:77
def get_representatives(self)
Returns list of representatives of allocated clusters.
Definition: bsas.py:203
def process(self)
Performs cluster analysis in line with rules of BSAS algorithm.
Definition: bsas.py:150
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs)
Creates classical BSAS algorithm.
Definition: bsas.py:120
def _find_nearest_cluster(self, point)
Find nearest cluster to the specified point.
Definition: bsas.py:227
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: bsas.py:192
def __prcess_by_python(self)
Definition: bsas.py:176
def _verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: bsas.py:263
Visualizer of BSAS algorithm's results.
Definition: bsas.py:38
def show_clusters(sample, clusters, representatives, **kwargs)
Display BSAS clustering results.
Definition: bsas.py:46
def __process_by_ccore(self)
Definition: bsas.py:171
def _update_representative(self, index_cluster, point)
Update cluster representative in line with new cluster size and added point to it.
Definition: bsas.py:248