pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
dbscan.py
1 """!
2 
3 @brief Cluster analysis algorithm: DBSCAN.
4 @details Implementation based on paper @cite inproceedings::dbscan::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 from pyclustering.container.kdtree import kdtree_balanced
14 
15 from pyclustering.cluster.encoder import type_encoding
16 
17 from pyclustering.core.wrapper import ccore_library
18 
19 import pyclustering.core.dbscan_wrapper as wrapper
20 
21 
22 class dbscan:
23  """!
24  @brief Class represents clustering algorithm DBSCAN.
25  @details This DBSCAN algorithm is KD-tree optimized.
26 
27  By default C/C++ pyclustering library is used for processing that significantly increases performance.
28 
29  Clustering example where DBSCAN algorithm is used to process `Chainlink` data from `FCPS` collection:
30  @code
31  from pyclustering.cluster.dbscan import dbscan
32  from pyclustering.cluster import cluster_visualizer
33  from pyclustering.utils import read_sample
34  from pyclustering.samples.definitions import FCPS_SAMPLES
35 
36  # Sample for cluster analysis.
37  sample = read_sample(FCPS_SAMPLES.SAMPLE_CHAINLINK)
38 
39  # Create DBSCAN algorithm.
40  dbscan_instance = dbscan(sample, 0.7, 3)
41 
42  # Start processing by DBSCAN.
43  dbscan_instance.process()
44 
45  # Obtain results of clustering.
46  clusters = dbscan_instance.get_clusters()
47  noise = dbscan_instance.get_noise()
48 
49  # Visualize clustering results
50  visualizer = cluster_visualizer()
51  visualizer.append_clusters(clusters, sample)
52  visualizer.append_cluster(noise, sample, marker='x')
53  visualizer.show()
54  @endcode
55 
56  """
57 
58  def __init__(self, data, eps, neighbors, ccore=True, **kwargs):
59  """!
60  @brief Constructor of clustering algorithm DBSCAN.
61 
62  @param[in] data (list): Input data that is presented as list of points or distance matrix (defined by parameter
63  'data_type', by default data is considered as a list of points).
64  @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
65  @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
66  @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
67  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
68 
69  <b>Keyword Args:</b><br>
70  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
71 
72  """
73 
74  self.__pointer_data = data
75  self.__kdtree = None
76  self.__eps = eps
77  self.__sqrt_eps = eps * eps
78  self.__neighbors = neighbors
79 
80  self.__visited = None
81  self.__belong = None
82 
83  self.__data_type = kwargs.get('data_type', 'points')
84 
85  self.__clusters = []
86  self.__noise = []
87 
88  self.__neighbor_searcher = None
89 
90  self.__initialize_ccore_state(ccore)
91 
92  self.__verify_arguments()
93 
94 
95  def __getstate__(self):
96  """!
97  @brief Returns current state of the algorithm.
98  @details It does not return internal temporal variables that are not visible for a user.
99 
100  @return (tuple) Current state of the algorithm.
101 
102  """
103  return (self.__pointer_data, self.__eps, self.__sqrt_eps, self.__neighbors, self.__visited, self.__belong,
104  self.__data_type, self.__clusters, self.__noise, self.__ccore)
105 
106 
107  def __setstate__(self, state):
108  """!
109  @brief Set current state of the algorithm.
110  @details Set state method checks if C++ pyclustering is available for the current platform, as a result `ccore`
111  state might be different if state is moved between platforms.
112 
113  """
114  self.__pointer_data, self.__eps, self.__sqrt_eps, self.__neighbors, self.__visited, self.__belong, \
115  self.__data_type, self.__clusters, self.__noise, self.__ccore = state
116 
117  self.__initialize_ccore_state(True)
118 
119 
120  def process(self):
121  """!
122  @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
123 
124  @return (dbscan) Returns itself (DBSCAN instance).
125 
126  @see get_clusters()
127  @see get_noise()
128 
129  """
130 
131  if self.__ccore is True:
132  (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, self.__data_type)
133 
134  else:
135  if self.__data_type == 'points':
136  self.__kdtree = kdtree_balanced(self.__pointer_data, range(len(self.__pointer_data)))
137 
138  self.__visited = [False] * len(self.__pointer_data)
139  self.__belong = [False] * len(self.__pointer_data)
140 
142 
143  for i in range(0, len(self.__pointer_data)):
144  if self.__visited[i] is False:
145  cluster = self.__expand_cluster(i)
146  if cluster is not None:
147  self.__clusters.append(cluster)
148 
149  for i in range(0, len(self.__pointer_data)):
150  if self.__belong[i] is False:
151  self.__noise.append(i)
152 
153  return self
154 
155 
156  def get_clusters(self):
157  """!
158  @brief Returns allocated clusters.
159 
160  @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
161 
162  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
163 
164  @see process()
165  @see get_noise()
166 
167  """
168 
169  return self.__clusters
170 
171 
172  def get_noise(self):
173  """!
174  @brief Returns allocated noise.
175 
176  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
177 
178  @return (list) List of indexes that are marked as a noise.
179 
180  @see process()
181  @see get_clusters()
182 
183  """
184 
185  return self.__noise
186 
187 
189  """!
190  @brief Returns clustering result representation type that indicate how clusters are encoded.
191 
192  @return (type_encoding) Clustering result representation.
193 
194  @see get_clusters()
195 
196  """
197 
198  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
199 
200 
201  def __verify_arguments(self):
202  """!
203  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
204 
205  """
206  if len(self.__pointer_data) == 0:
207  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
208 
209  if self.__eps < 0:
210  raise ValueError("Connectivity radius (current value: '%d') should be greater or equal to 0." % self.__eps)
211 
212 
213  def __create_neighbor_searcher(self, data_type):
214  """!
215  @brief Returns neighbor searcher in line with data type.
216 
217  @param[in] data_type (string): Data type (points or distance matrix).
218 
219  """
220  if data_type == 'points':
221  return self.__neighbor_indexes_points
222  elif data_type == 'distance_matrix':
224  else:
225  raise TypeError("Unknown type of data is specified '%s'" % data_type)
226 
227 
228  def __expand_cluster(self, index_point):
229  """!
230  @brief Expands cluster from specified point in the input data space.
231 
232  @param[in] index_point (list): Index of a point from the data.
233 
234  @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
235 
236  """
237 
238  cluster = None
239  self.__visited[index_point] = True
240  neighbors = self.__neighbor_searcher(index_point)
241 
242  if len(neighbors) >= self.__neighbors:
243  cluster = [index_point]
244 
245  self.__belong[index_point] = True
246 
247  for i in neighbors:
248  if self.__visited[i] is False:
249  self.__visited[i] = True
250 
251  next_neighbors = self.__neighbor_searcher(i)
252 
253  if len(next_neighbors) >= self.__neighbors:
254  neighbors += [k for k in next_neighbors if ( (k in neighbors) == False) and k != index_point]
255 
256  if self.__belong[i] is False:
257  cluster.append(i)
258  self.__belong[i] = True
259 
260  return cluster
261 
262 
263  def __neighbor_indexes_points(self, index_point):
264  """!
265  @brief Return neighbors of the specified object in case of sequence of points.
266 
267  @param[in] index_point (uint): Index point whose neighbors are should be found.
268 
269  @return (list) List of indexes of neighbors in line the connectivity radius.
270 
271  """
272  kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps)
273  return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point]
274 
275 
276  def __neighbor_indexes_distance_matrix(self, index_point):
277  """!
278  @brief Return neighbors of the specified object in case of distance matrix.
279 
280  @param[in] index_point (uint): Index point whose neighbors are should be found.
281 
282  @return (list) List of indexes of neighbors in line the connectivity radius.
283 
284  """
285  distances = self.__pointer_data[index_point]
286  return [index_neighbor for index_neighbor in range(len(distances))
287  if ((distances[index_neighbor] <= self.__eps) and (index_neighbor != index_point))]
288 
289 
290  def __initialize_ccore_state(self, ccore):
291  """!
292  @brief Initializes C++ pyclustering state.
293  @details Check if it is requested and if it is available for the current platform. These information is used to
294  set status of C++ pyclustering library.
295 
296  @param[in] ccore (bool):
297 
298  """
299  self.__ccore = ccore
300  if self.__ccore:
301  self.__ccore = ccore_library.workable()
pyclustering.cluster.dbscan.dbscan
Class represents clustering algorithm DBSCAN.
Definition: dbscan.py:22
pyclustering.container.kdtree
Data Structure: KD-Tree.
Definition: kdtree.py:1
pyclustering.cluster.dbscan.dbscan.__neighbors
__neighbors
Definition: dbscan.py:78
pyclustering.cluster.dbscan.dbscan.__clusters
__clusters
Definition: dbscan.py:85
pyclustering.cluster.dbscan.dbscan.__data_type
__data_type
Definition: dbscan.py:83
pyclustering.container.kdtree.kdtree_balanced
Represents balanced static KD-tree that does not provide services to add and remove nodes after initi...
Definition: kdtree.py:243
pyclustering.cluster.dbscan.dbscan.__getstate__
def __getstate__(self)
Returns current state of the algorithm.
Definition: dbscan.py:95
pyclustering.cluster.dbscan.dbscan.__pointer_data
__pointer_data
Definition: dbscan.py:74
pyclustering.cluster.dbscan.dbscan.__noise
__noise
Definition: dbscan.py:86
pyclustering.cluster.dbscan.dbscan.__ccore
__ccore
Definition: dbscan.py:115
pyclustering.cluster.dbscan.dbscan.__initialize_ccore_state
def __initialize_ccore_state(self, ccore)
Initializes C++ pyclustering state.
Definition: dbscan.py:290
pyclustering.cluster.dbscan.dbscan.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: dbscan.py:188
pyclustering.cluster.dbscan.dbscan.__sqrt_eps
__sqrt_eps
Definition: dbscan.py:77
pyclustering.cluster.dbscan.dbscan.__kdtree
__kdtree
Definition: dbscan.py:75
pyclustering.cluster.dbscan.dbscan.__init__
def __init__(self, data, eps, neighbors, ccore=True, **kwargs)
Constructor of clustering algorithm DBSCAN.
Definition: dbscan.py:58
pyclustering.cluster.dbscan.dbscan.__create_neighbor_searcher
def __create_neighbor_searcher(self, data_type)
Returns neighbor searcher in line with data type.
Definition: dbscan.py:213
pyclustering.cluster.dbscan.dbscan.__belong
__belong
Definition: dbscan.py:81
pyclustering.cluster.dbscan.dbscan.__setstate__
def __setstate__(self, state)
Set current state of the algorithm.
Definition: dbscan.py:107
pyclustering.cluster.dbscan.dbscan.get_noise
def get_noise(self)
Returns allocated noise.
Definition: dbscan.py:172
pyclustering.cluster.dbscan.dbscan.__neighbor_indexes_points
def __neighbor_indexes_points(self, index_point)
Return neighbors of the specified object in case of sequence of points.
Definition: dbscan.py:263
pyclustering.cluster.dbscan.dbscan.__neighbor_indexes_distance_matrix
def __neighbor_indexes_distance_matrix(self, index_point)
Return neighbors of the specified object in case of distance matrix.
Definition: dbscan.py:276
pyclustering.cluster.dbscan.dbscan.process
def process(self)
Performs cluster analysis in line with rules of DBSCAN algorithm.
Definition: dbscan.py:120
pyclustering.cluster.dbscan.dbscan.__visited
__visited
Definition: dbscan.py:80
pyclustering.cluster.dbscan.dbscan.__expand_cluster
def __expand_cluster(self, index_point)
Expands cluster from specified point in the input data space.
Definition: dbscan.py:228
pyclustering.cluster.dbscan.dbscan.__eps
__eps
Definition: dbscan.py:76
pyclustering.cluster.dbscan.dbscan.__neighbor_searcher
__neighbor_searcher
Definition: dbscan.py:88
pyclustering.cluster.dbscan.dbscan.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: dbscan.py:201
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.cluster.dbscan.dbscan.get_clusters
def get_clusters(self)
Returns allocated clusters.
Definition: dbscan.py:156