dbscan.py
1 """!
2 
3 @brief Cluster analysis algorithm: DBSCAN.
4 @details Implementation based on paper @cite inproceedings::dbscan::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.container.kdtree import kdtree
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.core.wrapper import ccore_library
33 
34 import pyclustering.core.dbscan_wrapper as wrapper
35 
36 
37 class dbscan:
38  """!
39  @brief Class represents clustering algorithm DBSCAN.
40  @details This DBSCAN algorithm is KD-tree optimized.
41 
42  CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
43 
44  Example:
45  @code
46  # sample for cluster analysis (represented by list)
47  sample = read_sample(path_to_sample);
48 
49  # create object that uses CCORE for processing
50  dbscan_instance = dbscan(sample, 0.5, 3, True);
51 
52  # cluster analysis
53  dbscan_instance.process();
54 
55  # obtain results of clustering
56  clusters = dbscan_instance.get_clusters();
57  noise = dbscan_instance.get_noise();
58  @endcode
59 
60  """
61 
62  def __init__(self, data, eps, neighbors, ccore = True, **kwargs):
63  """!
64  @brief Constructor of clustering algorithm DBSCAN.
65 
66  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
67  @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
68  @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
69  @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
70  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
71 
72  <b>Keyword Args:</b><br>
73  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
74 
75  """
76 
77  self.__pointer_data = data
78  self.__kdtree = None
79  self.__eps = eps
80  self.__sqrt_eps = eps * eps
81  self.__neighbors = neighbors
82 
83  self.__visited = [False] * len(self.__pointer_data)
84  self.__belong = [False] * len(self.__pointer_data)
85 
86  self.__data_type = kwargs.get('data_type', 'points')
87 
88  self.__clusters = []
89  self.__noise = []
90 
92 
93  self.__ccore = ccore
94  if self.__ccore:
95  self.__ccore = ccore_library.workable()
96 
97 
98  def process(self):
99  """!
100  @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
101 
102  @see get_clusters()
103  @see get_noise()
104 
105  """
106 
107  if self.__ccore is True:
108  (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, self.__data_type)
109 
110  else:
111  if self.__data_type == 'points':
112  self.__kdtree = kdtree(self.__pointer_data, range(len(self.__pointer_data)))
113 
114  for i in range(0, len(self.__pointer_data)):
115  if self.__visited[i] is False:
116  cluster = self.__expand_cluster(i)
117  if cluster is not None:
118  self.__clusters.append(cluster)
119 
120  for i in range(0, len(self.__pointer_data)):
121  if self.__belong[i] is False:
122  self.__noise.append(i)
123 
124 
125  def get_clusters(self):
126  """!
127  @brief Returns allocated clusters.
128 
129  @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
130 
131  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
132 
133  @see process()
134  @see get_noise()
135 
136  """
137 
138  return self.__clusters
139 
140 
141  def get_noise(self):
142  """!
143  @brief Returns allocated noise.
144 
145  @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
146 
147  @return (list) List of indexes that are marked as a noise.
148 
149  @see process()
150  @see get_clusters()
151 
152  """
153 
154  return self.__noise
155 
156 
158  """!
159  @brief Returns clustering result representation type that indicate how clusters are encoded.
160 
161  @return (type_encoding) Clustering result representation.
162 
163  @see get_clusters()
164 
165  """
166 
167  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
168 
169 
170  def __create_neighbor_searcher(self, data_type):
171  """!
172  @brief Returns neighbor searcher in line with data type.
173 
174  @param[in] data_type (string): Data type (points or distance matrix).
175 
176  """
177  if data_type == 'points':
178  return self.__neighbor_indexes_points
179  elif data_type == 'distance_matrix':
181  else:
182  raise TypeError("Unknown type of data is specified '%s'" % data_type)
183 
184 
185  def __expand_cluster(self, index_point):
186  """!
187  @brief Expands cluster from specified point in the input data space.
188 
189  @param[in] index_point (list): Index of a point from the data.
190 
191  @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
192 
193  """
194 
195  cluster = None
196  self.__visited[index_point] = True
197  neighbors = self.__neighbor_searcher(index_point)
198 
199  if len(neighbors) >= self.__neighbors:
200  cluster = [ index_point ]
201 
202  self.__belong[index_point] = True
203 
204  for i in neighbors:
205  if self.__visited[i] is False:
206  self.__visited[i] = True
207 
208  next_neighbors = self.__neighbor_searcher(i)
209 
210  if len(next_neighbors) >= self.__neighbors:
211  neighbors += [k for k in next_neighbors if ( (k in neighbors) == False) and k != index_point]
212 
213  if self.__belong[i] is False:
214  cluster.append(i)
215  self.__belong[i] = True
216 
217  return cluster
218 
219 
220  def __neighbor_indexes_points(self, index_point):
221  """!
222  @brief Return neighbors of the specified object in case of sequence of points.
223 
224  @param[in] index_point (uint): Index point whose neighbors are should be found.
225 
226  @return (list) List of indexes of neighbors in line the connectivity radius.
227 
228  """
229  kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps)
230  return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point]
231 
232 
233  def __neighbor_indexes_distance_matrix(self, index_point):
234  """!
235  @brief Return neighbors of the specified object in case of distance matrix.
236 
237  @param[in] index_point (uint): Index point whose neighbors are should be found.
238 
239  @return (list) List of indexes of neighbors in line the connectivity radius.
240 
241  """
242  distances = self.__pointer_data[index_point]
243  return [index_neighbor for index_neighbor in range(len(distances))
244  if ((distances[index_neighbor] <= self.__eps) and (index_neighbor != index_point))]
Class represents clustering algorithm DBSCAN.
Definition: dbscan.py:37
def get_clusters(self)
Returns allocated clusters.
Definition: dbscan.py:125
Module for representing clustering results.
Definition: encoder.py:1
Represents KD Tree that is a space-partitioning data structure for organizing points in a k-dimension...
Definition: kdtree.py:157
def __init__(self, data, eps, neighbors, ccore=True, kwargs)
Constructor of clustering algorithm DBSCAN.
Definition: dbscan.py:62
def process(self)
Performs cluster analysis in line with rules of DBSCAN algorithm.
Definition: dbscan.py:98
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: dbscan.py:157
def __expand_cluster(self, index_point)
Expands cluster from specified point in the input data space.
Definition: dbscan.py:185
def __neighbor_indexes_points(self, index_point)
Return neighbors of the specified object in case of sequence of points.
Definition: dbscan.py:220
def __create_neighbor_searcher(self, data_type)
Returns neighbor searcher in line with data type.
Definition: dbscan.py:170
def get_noise(self)
Returns allocated noise.
Definition: dbscan.py:141
def __neighbor_indexes_distance_matrix(self, index_point)
Return neighbors of the specified object in case of distance matrix.
Definition: dbscan.py:233
Data Structure: KD-Tree.
Definition: kdtree.py:1