|
pyclustering
0.10.1
pyclustering is a Python, C++ data mining library.
|
3 @brief Cluster analysis algorithm: DBSCAN.
4 @details Implementation based on paper @cite inproceedings::dbscan::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
17 from pyclustering.core.wrapper
import ccore_library
19 import pyclustering.core.dbscan_wrapper
as wrapper
24 @brief Class represents clustering algorithm DBSCAN.
25 @details This DBSCAN algorithm is KD-tree optimized.
27 By default the C/C++ pyclustering library is used for processing, which significantly increases performance.
29 Clustering example where DBSCAN algorithm is used to process `Chainlink` data from `FCPS` collection:
31 from pyclustering.cluster.dbscan import dbscan
32 from pyclustering.cluster import cluster_visualizer
33 from pyclustering.utils import read_sample
34 from pyclustering.samples.definitions import FCPS_SAMPLES
36 # Sample for cluster analysis.
37 sample = read_sample(FCPS_SAMPLES.SAMPLE_CHAINLINK)
39 # Create DBSCAN algorithm.
40 dbscan_instance = dbscan(sample, 0.7, 3)
42 # Start processing by DBSCAN.
43 dbscan_instance.process()
45 # Obtain results of clustering.
46 clusters = dbscan_instance.get_clusters()
47 noise = dbscan_instance.get_noise()
49 # Visualize clustering results
50 visualizer = cluster_visualizer()
51 visualizer.append_clusters(clusters, sample)
52 visualizer.append_cluster(noise, sample, marker='x')
58 def __init__(self, data, eps, neighbors, ccore=True, **kwargs):
60 @brief Constructor of clustering algorithm DBSCAN.
62 @param[in] data (list): Input data that is presented as list of points or distance matrix (defined by parameter
63 'data_type', by default data is considered as a list of points).
64 @param[in] eps (double): Connectivity radius between points, points may be connected if the distance between them is less than the radius.
65 @param[in] neighbors (uint): minimum number of shared neighbors that is required to establish links between points.
66 @param[in] ccore (bool): if True then DLL CCORE (C++ solution) will be used for solving the problem.
67 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
69 <b>Keyword Args:</b><br>
70 - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
83 self.
__data_type = kwargs.get(
'data_type',
'points')
97 @brief Returns current state of the algorithm.
98 @details It does not return internal temporary variables that are not visible to a user.
100 @return (tuple) Current state of the algorithm.
109 @brief Set current state of the algorithm.
110 @details Set state method checks if C++ pyclustering is available for the current platform, as a result `ccore`
111 state might be different if state is moved between platforms.
122 @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
124 @return (dbscan) Returns itself (DBSCAN instance).
146 if cluster
is not None:
158 @brief Returns allocated clusters.
160 @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
162 @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
174 @brief Returns allocated noise.
176 @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
178 @return (list) List of indexes that are marked as a noise.
190 @brief Returns clustering result representation type that indicates how clusters are encoded.
192 @return (type_encoding) Clustering result representation.
198 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
201 def __verify_arguments(self):
203 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
207 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
210 raise ValueError(
"Connectivity radius (current value: '%d') should be greater or equal to 0." % self.
__eps)
213 def __create_neighbor_searcher(self, data_type):
215 @brief Returns neighbor searcher in line with data type.
217 @param[in] data_type (string): Data type (points or distance matrix).
220 if data_type ==
'points':
222 elif data_type ==
'distance_matrix':
225 raise TypeError(
"Unknown type of data is specified '%s'" % data_type)
228 def __expand_cluster(self, index_point):
230 @brief Expands cluster from specified point in the input data space.
232 @param[in] index_point (uint): Index of a point from the data.
234 @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
243 cluster = [index_point]
254 neighbors += [k
for k
in next_neighbors
if ( (k
in neighbors) ==
False)
and k != index_point]
263 def __neighbor_indexes_points(self, index_point):
265 @brief Return neighbors of the specified object in case of sequence of points.
267 @param[in] index_point (uint): Index of the point whose neighbors should be found.
269 @return (list) List of indexes of neighbors in line with the connectivity radius.
273 return [node_tuple[1].payload
for node_tuple
in kdnodes
if node_tuple[1].payload != index_point]
276 def __neighbor_indexes_distance_matrix(self, index_point):
278 @brief Return neighbors of the specified object in case of distance matrix.
280 @param[in] index_point (uint): Index of the point whose neighbors should be found.
282 @return (list) List of indexes of neighbors in line with the connectivity radius.
286 return [index_neighbor
for index_neighbor
in range(len(distances))
287 if ((distances[index_neighbor] <= self.
__eps)
and (index_neighbor != index_point))]
290 def __initialize_ccore_state(self, ccore):
292 @brief Initializes C++ pyclustering state.
293 @details Check if it is requested and if it is available for the current platform. This information is used to
294 set status of C++ pyclustering library.
296 @param[in] ccore (bool):
301 self.
__ccore = ccore_library.workable()
Class represents clustering algorithm DBSCAN.
Represents balanced static KD-tree that does not provide services to add and remove nodes after initi...
def __getstate__(self)
Returns current state of the algorithm.
def __initialize_ccore_state(self, ccore)
Initializes C++ pyclustering state.
def get_cluster_encoding(self)
Returns clustering result representation type that indicates how clusters are encoded.
def __init__(self, data, eps, neighbors, ccore=True, **kwargs)
Constructor of clustering algorithm DBSCAN.
def __create_neighbor_searcher(self, data_type)
Returns neighbor searcher in line with data type.
def __setstate__(self, state)
Set current state of the algorithm.
def get_noise(self)
Returns allocated noise.
def __neighbor_indexes_points(self, index_point)
Return neighbors of the specified object in case of sequence of points.
def __neighbor_indexes_distance_matrix(self, index_point)
Return neighbors of the specified object in case of distance matrix.
def process(self)
Performs cluster analysis in line with rules of DBSCAN algorithm.
def __expand_cluster(self, index_point)
Expands cluster from specified point in the input data space.
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Module for representing clustering results.
def get_clusters(self)
Returns allocated clusters.