pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
bsas.py
1 """!
2 
3 @brief Cluster analysis algorithm: BSAS (Basic Sequential Algorithmic Scheme).
4 @details Implementation based on paper @cite book::pattern_recognition::2009.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 from pyclustering.core.wrapper import ccore_library
14 from pyclustering.core.bsas_wrapper import bsas as bsas_wrapper
15 from pyclustering.core.metric_wrapper import metric_wrapper
16 
17 from pyclustering.cluster import cluster_visualizer
18 from pyclustering.cluster.encoder import type_encoding
19 
20 from pyclustering.utils.metric import type_metric, distance_metric
21 
22 
24  """!
25  @brief Visualizer of BSAS algorithm's results.
26  @details BSAS visualizer provides visualization services that are specific for BSAS algorithm.
27 
28  """
29 
30  @staticmethod
31  def show_clusters(sample, clusters, representatives, **kwargs):
32  """!
33  @brief Display BSAS clustering results.
34 
35  @param[in] sample (list): Dataset that was used for clustering.
36  @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
37  @param[in] representatives (array_like): Allocated representatives correspond to clusters.
38  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
39 
40  <b>Keyword Args:</b><br>
41  - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
42  - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
43  - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
44 
45  @return (figure) Figure where clusters were drawn.
46 
47  """
48 
49  figure = kwargs.get('figure', None)
50  display = kwargs.get('display', True)
51  offset = kwargs.get('offset', 0)
52 
53  visualizer = cluster_visualizer()
54  visualizer.append_clusters(clusters, sample, canvas=offset)
55 
56  for cluster_index in range(len(clusters)):
57  visualizer.append_cluster_attribute(offset, cluster_index, [representatives[cluster_index]], '*', 10)
58 
59  return visualizer.show(figure=figure, display=display)
60 
61 
62 class bsas:
63  """!
64  @brief Class represents BSAS clustering algorithm - basic sequential algorithmic scheme.
65  @details Algorithm has two mandatory parameters: maximum allowable number of clusters and threshold
66  of dissimilarity or in other words maximum distance between points. Distance metric also can
67  be specified using 'metric' parameters, by default 'Manhattan' distance is used.
68  BSAS using following rule for updating cluster representative:
69 
70  \f[
71  \vec{m}_{C_{k}}^{new}=\frac{ \left ( n_{C_{k}^{new}} - 1 \right )\vec{m}_{C_{k}}^{old} + \vec{x} }{n_{C_{k}^{new}}}
72  \f]
73 
74  Clustering results of this algorithm depends on objects order in input data.
75 
76  Example:
77  @code
78  from pyclustering.cluster.bsas import bsas, bsas_visualizer
79  from pyclustering.utils import read_sample
80  from pyclustering.samples.definitions import SIMPLE_SAMPLES
81 
82  # Read data sample from 'Simple02.data'.
83  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2)
84 
85  # Prepare algorithm's parameters.
86  max_clusters = 3
87  threshold = 1.0
88 
89  # Create instance of BSAS algorithm.
90  bsas_instance = bsas(sample, max_clusters, threshold)
91  bsas_instance.process()
92 
93  # Get clustering results.
94  clusters = bsas_instance.get_clusters()
95  representatives = bsas_instance.get_representatives()
96 
97  # Display results.
98  bsas_visualizer.show_clusters(sample, clusters, representatives)
99  @endcode
100 
101  @see pyclustering.cluster.mbsas, pyclustering.cluster.ttsas
102 
103  """
104 
105  def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs):
106  """!
107  @brief Creates classical BSAS algorithm.
108 
109  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
110  @param[in] maximum_clusters: Maximum allowable number of clusters that can be allocated during processing.
111  @param[in] threshold: Threshold of dissimilarity (maximum distance) between points.
112  @param[in] ccore (bool): If True than CCORE (C++ part of the library) will be used for solving.
113  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
114 
115  <b>Keyword Args:</b><br>
116  - metric (distance_metric): Metric that is used for distance calculation between two points.
117 
118  """
119 
120  self._data = data
121  self._amount = maximum_clusters
122  self._threshold = threshold
123  self._metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
124  self._ccore = ccore and self._metric.get_type() != type_metric.USER_DEFINED
125 
126  self._clusters = []
127  self._representatives = []
128 
129  if self._ccore is True:
130  self._ccore = ccore_library.workable()
131 
132  self._verify_arguments()
133 
134 
135  def process(self):
136  """!
137  @brief Performs cluster analysis in line with rules of BSAS algorithm.
138 
139  @return (bsas) Returns itself (BSAS instance).
140 
141  @remark Results of clustering can be obtained using corresponding get methods.
142 
143  @see get_clusters()
144  @see get_representatives()
145 
146  """
147 
148  if self._ccore is True:
149  self.__process_by_ccore()
150  else:
151  self.__prcess_by_python()
152 
153  return self
154 
155 
156  def __process_by_ccore(self):
157  ccore_metric = metric_wrapper.create_instance(self._metric)
158  self._clusters, self._representatives = bsas_wrapper(self._data, self._amount, self._threshold, ccore_metric.get_pointer())
159 
160 
161  def __prcess_by_python(self):
162  self._clusters.append([0])
163  self._representatives.append(self._data[0])
164 
165  for i in range(1, len(self._data)):
166  point = self._data[i]
167  index_cluster, distance = self._find_nearest_cluster(point)
168 
169  if (distance > self._threshold) and (len(self._clusters) < self._amount):
170  self._representatives.append(point)
171  self._clusters.append([i])
172  else:
173  self._clusters[index_cluster].append(i)
174  self._update_representative(index_cluster, point)
175 
176 
177  def get_clusters(self):
178  """!
179  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
180 
181  @see process()
182  @see get_representatives()
183 
184  """
185  return self._clusters
186 
187 
189  """!
190  @brief Returns list of representatives of allocated clusters.
191 
192  @see process()
193  @see get_clusters()
194 
195  """
196  return self._representatives
197 
198 
200  """!
201  @brief Returns clustering result representation type that indicate how clusters are encoded.
202 
203  @return (type_encoding) Clustering result representation.
204 
205  @see get_clusters()
206 
207  """
208 
209  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
210 
211 
212  def _find_nearest_cluster(self, point):
213  """!
214  @brief Find nearest cluster to the specified point.
215 
216  @param[in] point (list): Point from dataset.
217 
218  @return (uint, double) Index of nearest cluster and distance to it.
219 
220  """
221  index_cluster = -1
222  nearest_distance = float('inf')
223 
224  for index in range(len(self._representatives)):
225  distance = self._metric(point, self._representatives[index])
226  if distance < nearest_distance:
227  index_cluster = index
228  nearest_distance = distance
229 
230  return index_cluster, nearest_distance
231 
232 
233  def _update_representative(self, index_cluster, point):
234  """!
235  @brief Update cluster representative in line with new cluster size and added point to it.
236 
237  @param[in] index_cluster (uint): Index of cluster whose representative should be updated.
238  @param[in] point (list): Point that was added to cluster.
239 
240  """
241  length = len(self._clusters[index_cluster])
242  rep = self._representatives[index_cluster]
243 
244  for dimension in range(len(rep)):
245  rep[dimension] = ( (length - 1) * rep[dimension] + point[dimension] ) / length
246 
247 
248  def _verify_arguments(self):
249  """!
250  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
251 
252  """
253  if len(self._data) == 0:
254  raise ValueError("Input data is empty (size: '%d')." % len(self._data))
255 
256  if self._amount <= 0:
257  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
258  self._amount)
259 
260  if self._threshold < 0:
261  raise ValueError("Threshold of dissimilarity (current value: '%d') between points should be greater or "
262  "equal to 0." % self._threshold)
pyclustering.cluster.bsas.bsas._verify_arguments
def _verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: bsas.py:248
pyclustering.cluster.cluster_visualizer
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:370
pyclustering.cluster.bsas.bsas.__prcess_by_python
def __prcess_by_python(self)
Definition: bsas.py:161
pyclustering.cluster.bsas.bsas.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: bsas.py:199
pyclustering.cluster.bsas.bsas.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
Definition: bsas.py:177
pyclustering.cluster.bsas.bsas._update_representative
def _update_representative(self, index_cluster, point)
Update cluster representative in line with new cluster size and added point to it.
Definition: bsas.py:233
pyclustering.cluster.bsas.bsas.get_representatives
def get_representatives(self)
Returns list of representatives of allocated clusters.
Definition: bsas.py:188
pyclustering.cluster
pyclustering module for cluster analysis.
Definition: __init__.py:1
pyclustering.cluster.bsas.bsas._amount
_amount
Definition: bsas.py:121
pyclustering.cluster.bsas.bsas._representatives
_representatives
Definition: bsas.py:127
pyclustering.cluster.bsas.bsas.__process_by_ccore
def __process_by_ccore(self)
Definition: bsas.py:156
pyclustering.cluster.bsas.bsas._metric
_metric
Definition: bsas.py:123
pyclustering.utils.metric.distance_metric
Distance metric performs distance calculation between two points in line with encapsulated function,...
Definition: metric.py:52
pyclustering.cluster.bsas.bsas._find_nearest_cluster
def _find_nearest_cluster(self, point)
Find nearest cluster to the specified point.
Definition: bsas.py:212
pyclustering.cluster.bsas.bsas.__init__
def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs)
Creates classical BSAS algorithm.
Definition: bsas.py:105
pyclustering.cluster.bsas.bsas_visualizer
Visualizer of BSAS algorithm's results.
Definition: bsas.py:23
pyclustering.cluster.bsas.bsas._ccore
_ccore
Definition: bsas.py:124
pyclustering.cluster.bsas.bsas_visualizer.show_clusters
def show_clusters(sample, clusters, representatives, **kwargs)
Display BSAS clustering results.
Definition: bsas.py:31
pyclustering.cluster.bsas.bsas._data
_data
Definition: bsas.py:120
pyclustering.cluster.bsas.bsas._threshold
_threshold
Definition: bsas.py:122
pyclustering.cluster.bsas.bsas._clusters
_clusters
Definition: bsas.py:126
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.utils.metric
Module provides various distance metrics - abstraction of the notion of distance in a metric space.
Definition: metric.py:1
pyclustering.cluster.bsas.bsas
Class represents BSAS clustering algorithm - basic sequential algorithmic scheme.
Definition: bsas.py:62
pyclustering.cluster.bsas.bsas.process
def process(self)
Performs cluster analysis in line with rules of BSAS algorithm.
Definition: bsas.py:135