dbscan.py
1 """!
2 
3 @brief Cluster analysis algorithm: DBSCAN.
4 @details Implementation based on paper @cite inproceedings::dbscan::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.container.kdtree import kdtree_balanced
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.core.wrapper import ccore_library
33 
34 import pyclustering.core.dbscan_wrapper as wrapper
35 
36 
37 class dbscan:
38  """!
39  @brief Class represents clustering algorithm DBSCAN.
40  @details This DBSCAN algorithm is KD-tree optimized.
41 
42  CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
43 
44  Example:
45  @code
46  from pyclustering.cluster.dbscan import dbscan
47  from pyclustering.cluster import cluster_visualizer
48  from pyclustering.utils import read_sample
49  from pyclustering.samples.definitions import FCPS_SAMPLES
50 
51  # Sample for cluster analysis.
52  sample = read_sample(FCPS_SAMPLES.SAMPLE_CHAINLINK)
53 
54  # Create DBSCAN algorithm.
55  dbscan_instance = dbscan(sample, 0.7, 3)
56 
57  # Start processing by DBSCAN.
58  dbscan_instance.process()
59 
60  # Obtain results of clustering.
61  clusters = dbscan_instance.get_clusters()
62  noise = dbscan_instance.get_noise()
63 
64  # Visualize clustering results
65  visualizer = cluster_visualizer()
66  visualizer.append_clusters(clusters, sample)
67  visualizer.append_cluster(noise, sample, marker='x')
68  visualizer.show()
69  @endcode
70 
71  """
72 
73  def __init__(self, data, eps, neighbors, ccore=True, **kwargs):
74  """!
75  @brief Constructor of clustering algorithm DBSCAN.
76 
77  @param[in] data (list): Input data that is presented as list of points or distance matrix (defined by parameter
78  'data_type', by default data is considered as a list of points).
79  @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
80  @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
81  @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
82  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
83 
84  <b>Keyword Args:</b><br>
85  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
86 
87  """
88 
89  self.__pointer_data = data
90  self.__kdtree = None
91  self.__eps = eps
92  self.__sqrt_eps = eps * eps
93  self.__neighbors = neighbors
94 
95  self.__visited = [False] * len(self.__pointer_data)
96  self.__belong = [False] * len(self.__pointer_data)
97 
98  self.__data_type = kwargs.get('data_type', 'points')
99 
100  self.__clusters = []
101  self.__noise = []
102 
104 
105  self.__ccore = ccore
106  if self.__ccore:
107  self.__ccore = ccore_library.workable()
108 
109  self.__verify_arguments()
110 
111 
112  def process(self):
113  """!
114  @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
115 
116  @return (dbscan) Returns itself (DBSCAN instance).
117 
118  @see get_clusters()
119  @see get_noise()
120 
121  """
122 
123  if self.__ccore is True:
124  (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, self.__data_type)
125 
126  else:
127  if self.__data_type == 'points':
128  self.__kdtree = kdtree_balanced(self.__pointer_data, range(len(self.__pointer_data)))
129 
130  for i in range(0, len(self.__pointer_data)):
131  if self.__visited[i] is False:
132  cluster = self.__expand_cluster(i)
133  if cluster is not None:
134  self.__clusters.append(cluster)
135 
136  for i in range(0, len(self.__pointer_data)):
137  if self.__belong[i] is False:
138  self.__noise.append(i)
139 
140  return self
141 
142 
143  def get_clusters(self):
144  """!
145  @brief Returns allocated clusters.
146 
147  @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
148 
149  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
150 
151  @see process()
152  @see get_noise()
153 
154  """
155 
156  return self.__clusters
157 
158 
159  def get_noise(self):
160  """!
161  @brief Returns allocated noise.
162 
163  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
164 
165  @return (list) List of indexes that are marked as a noise.
166 
167  @see process()
168  @see get_clusters()
169 
170  """
171 
172  return self.__noise
173 
174 
176  """!
177  @brief Returns clustering result representation type that indicate how clusters are encoded.
178 
179  @return (type_encoding) Clustering result representation.
180 
181  @see get_clusters()
182 
183  """
184 
185  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
186 
187 
188  def __verify_arguments(self):
189  """!
190  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
191 
192  """
193  if len(self.__pointer_data) == 0:
194  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
195 
196  if self.__eps < 0:
197  raise ValueError("Connectivity radius (current value: '%d') should be greater or equal to 0." % self.__eps)
198 
199 
200  def __create_neighbor_searcher(self, data_type):
201  """!
202  @brief Returns neighbor searcher in line with data type.
203 
204  @param[in] data_type (string): Data type (points or distance matrix).
205 
206  """
207  if data_type == 'points':
208  return self.__neighbor_indexes_points
209  elif data_type == 'distance_matrix':
211  else:
212  raise TypeError("Unknown type of data is specified '%s'" % data_type)
213 
214 
215  def __expand_cluster(self, index_point):
216  """!
217  @brief Expands cluster from specified point in the input data space.
218 
219  @param[in] index_point (list): Index of a point from the data.
220 
221  @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
222 
223  """
224 
225  cluster = None
226  self.__visited[index_point] = True
227  neighbors = self.__neighbor_searcher(index_point)
228 
229  if len(neighbors) >= self.__neighbors:
230  cluster = [index_point]
231 
232  self.__belong[index_point] = True
233 
234  for i in neighbors:
235  if self.__visited[i] is False:
236  self.__visited[i] = True
237 
238  next_neighbors = self.__neighbor_searcher(i)
239 
240  if len(next_neighbors) >= self.__neighbors:
241  neighbors += [k for k in next_neighbors if ( (k in neighbors) == False) and k != index_point]
242 
243  if self.__belong[i] is False:
244  cluster.append(i)
245  self.__belong[i] = True
246 
247  return cluster
248 
249 
250  def __neighbor_indexes_points(self, index_point):
251  """!
252  @brief Return neighbors of the specified object in case of sequence of points.
253 
254  @param[in] index_point (uint): Index point whose neighbors are should be found.
255 
256  @return (list) List of indexes of neighbors in line the connectivity radius.
257 
258  """
259  kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps)
260  return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point]
261 
262 
263  def __neighbor_indexes_distance_matrix(self, index_point):
264  """!
265  @brief Return neighbors of the specified object in case of distance matrix.
266 
267  @param[in] index_point (uint): Index point whose neighbors are should be found.
268 
269  @return (list) List of indexes of neighbors in line the connectivity radius.
270 
271  """
272  distances = self.__pointer_data[index_point]
273  return [index_neighbor for index_neighbor in range(len(distances))
274  if ((distances[index_neighbor] <= self.__eps) and (index_neighbor != index_point))]
Class represents clustering algorithm DBSCAN.
Definition: dbscan.py:37
def get_clusters(self)
Returns allocated clusters.
Definition: dbscan.py:143
Module for representing clustering results.
Definition: encoder.py:1
def __init__(self, data, eps, neighbors, ccore=True, kwargs)
Constructor of clustering algorithm DBSCAN.
Definition: dbscan.py:73
def process(self)
Performs cluster analysis in line with rules of DBSCAN algorithm.
Definition: dbscan.py:112
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: dbscan.py:175
def __expand_cluster(self, index_point)
Expands cluster from specified point in the input data space.
Definition: dbscan.py:215
Represents balanced static KD-tree that does not provide services to add and remove nodes after initi...
Definition: kdtree.py:264
def __neighbor_indexes_points(self, index_point)
Return neighbors of the specified object in case of sequence of points.
Definition: dbscan.py:250
def __create_neighbor_searcher(self, data_type)
Returns neighbor searcher in line with data type.
Definition: dbscan.py:200
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: dbscan.py:188
def get_noise(self)
Returns allocated noise.
Definition: dbscan.py:159
def __neighbor_indexes_distance_matrix(self, index_point)
Return neighbors of the specified object in case of distance matrix.
Definition: dbscan.py:263
Data Structure: KD-Tree.
Definition: kdtree.py:1