dbscan.py
1 """!
2 
3 @brief Cluster analysis algorithm: DBSCAN.
4 @details Implementation based on paper @cite inproceedings::dbscan::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.container.kdtree import kdtree
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.core.wrapper import ccore_library
33 
34 import pyclustering.core.dbscan_wrapper as wrapper
35 
36 
37 class dbscan:
38  """!
39  @brief Class represents clustering algorithm DBSCAN.
40  @details This DBSCAN algorithm is KD-tree optimized.
41 
42  CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
43 
44  Example:
45  @code
46  from pyclustering.cluster.dbscan import dbscan
47  from pyclustering.cluster import cluster_visualizer
48  from pyclustering.utils import read_sample
49  from pyclustering.samples.definitions import FCPS_SAMPLES
50 
51  # Sample for cluster analysis.
52  sample = read_sample(FCPS_SAMPLES.SAMPLE_CHAINLINK)
53 
54  # Create DBSCAN algorithm.
55  dbscan_instance = dbscan(sample, 0.7, 3)
56 
57  # Start processing by DBSCAN.
58  dbscan_instance.process()
59 
60  # Obtain results of clustering.
61  clusters = dbscan_instance.get_clusters()
62  noise = dbscan_instance.get_noise()
63 
64  # Visualize clustering results
65  visualizer = cluster_visualizer()
66  visualizer.append_clusters(clusters, sample)
67  visualizer.append_cluster(noise, sample, marker='x')
68  visualizer.show()
69  @endcode
70 
71  """
72 
73  def __init__(self, data, eps, neighbors, ccore = True, **kwargs):
74  """!
75  @brief Constructor of clustering algorithm DBSCAN.
76 
77  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
78  @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
79  @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
80  @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
81  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
82 
83  <b>Keyword Args:</b><br>
84  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
85 
86  """
87 
88  self.__pointer_data = data
89  self.__kdtree = None
90  self.__eps = eps
91  self.__sqrt_eps = eps * eps
92  self.__neighbors = neighbors
93 
94  self.__visited = [False] * len(self.__pointer_data)
95  self.__belong = [False] * len(self.__pointer_data)
96 
97  self.__data_type = kwargs.get('data_type', 'points')
98 
99  self.__clusters = []
100  self.__noise = []
101 
103 
104  self.__ccore = ccore
105  if self.__ccore:
106  self.__ccore = ccore_library.workable()
107 
108 
109  def process(self):
110  """!
111  @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
112 
113  @see get_clusters()
114  @see get_noise()
115 
116  """
117 
118  if self.__ccore is True:
119  (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, self.__data_type)
120 
121  else:
122  if self.__data_type == 'points':
123  self.__kdtree = kdtree(self.__pointer_data, range(len(self.__pointer_data)))
124 
125  for i in range(0, len(self.__pointer_data)):
126  if self.__visited[i] is False:
127  cluster = self.__expand_cluster(i)
128  if cluster is not None:
129  self.__clusters.append(cluster)
130 
131  for i in range(0, len(self.__pointer_data)):
132  if self.__belong[i] is False:
133  self.__noise.append(i)
134 
135 
136  def get_clusters(self):
137  """!
138  @brief Returns allocated clusters.
139 
140  @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
141 
142  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
143 
144  @see process()
145  @see get_noise()
146 
147  """
148 
149  return self.__clusters
150 
151 
152  def get_noise(self):
153  """!
154  @brief Returns allocated noise.
155 
156  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
157 
158  @return (list) List of indexes that are marked as a noise.
159 
160  @see process()
161  @see get_clusters()
162 
163  """
164 
165  return self.__noise
166 
167 
169  """!
170  @brief Returns clustering result representation type that indicate how clusters are encoded.
171 
172  @return (type_encoding) Clustering result representation.
173 
174  @see get_clusters()
175 
176  """
177 
178  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
179 
180 
181  def __create_neighbor_searcher(self, data_type):
182  """!
183  @brief Returns neighbor searcher in line with data type.
184 
185  @param[in] data_type (string): Data type (points or distance matrix).
186 
187  """
188  if data_type == 'points':
189  return self.__neighbor_indexes_points
190  elif data_type == 'distance_matrix':
192  else:
193  raise TypeError("Unknown type of data is specified '%s'" % data_type)
194 
195 
196  def __expand_cluster(self, index_point):
197  """!
198  @brief Expands cluster from specified point in the input data space.
199 
200  @param[in] index_point (list): Index of a point from the data.
201 
202  @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
203 
204  """
205 
206  cluster = None
207  self.__visited[index_point] = True
208  neighbors = self.__neighbor_searcher(index_point)
209 
210  if len(neighbors) >= self.__neighbors:
211  cluster = [index_point]
212 
213  self.__belong[index_point] = True
214 
215  for i in neighbors:
216  if self.__visited[i] is False:
217  self.__visited[i] = True
218 
219  next_neighbors = self.__neighbor_searcher(i)
220 
221  if len(next_neighbors) >= self.__neighbors:
222  neighbors += [k for k in next_neighbors if ( (k in neighbors) == False) and k != index_point]
223 
224  if self.__belong[i] is False:
225  cluster.append(i)
226  self.__belong[i] = True
227 
228  return cluster
229 
230 
231  def __neighbor_indexes_points(self, index_point):
232  """!
233  @brief Return neighbors of the specified object in case of sequence of points.
234 
235  @param[in] index_point (uint): Index point whose neighbors are should be found.
236 
237  @return (list) List of indexes of neighbors in line the connectivity radius.
238 
239  """
240  kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps)
241  return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point]
242 
243 
244  def __neighbor_indexes_distance_matrix(self, index_point):
245  """!
246  @brief Return neighbors of the specified object in case of distance matrix.
247 
248  @param[in] index_point (uint): Index point whose neighbors are should be found.
249 
250  @return (list) List of indexes of neighbors in line the connectivity radius.
251 
252  """
253  distances = self.__pointer_data[index_point]
254  return [index_neighbor for index_neighbor in range(len(distances))
255  if ((distances[index_neighbor] <= self.__eps) and (index_neighbor != index_point))]
Class represents clustering algorithm DBSCAN.
Definition: dbscan.py:37
def get_clusters(self)
Returns allocated clusters.
Definition: dbscan.py:136
Module for representing clustering results.
Definition: encoder.py:1
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
Definition: kdtree.py:157
def __init__(self, data, eps, neighbors, ccore=True, kwargs)
Constructor of clustering algorithm DBSCAN.
Definition: dbscan.py:73
def process(self)
Performs cluster analysis in line with rules of DBSCAN algorithm.
Definition: dbscan.py:109
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: dbscan.py:168
def __expand_cluster(self, index_point)
Expands cluster from specified point in the input data space.
Definition: dbscan.py:196
def __neighbor_indexes_points(self, index_point)
Return neighbors of the specified object in case of sequence of points.
Definition: dbscan.py:231
def __create_neighbor_searcher(self, data_type)
Returns neighbor searcher in line with data type.
Definition: dbscan.py:181
def get_noise(self)
Returns allocated noise.
Definition: dbscan.py:152
def __neighbor_indexes_distance_matrix(self, index_point)
Return neighbors of the specified object in case of distance matrix.
Definition: dbscan.py:244
Data Structure: KD-Tree.
Definition: kdtree.py:1