3 @brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4 @details Implementation based on paper @cite article::optics::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
14 import matplotlib.pyplot
as plt
22 from pyclustering.core.wrapper
import ccore_library
24 import pyclustering.core.optics_wrapper
as wrapper
29 @brief Cluster ordering diagram visualizer that represents dataset graphically as density-based clustering structure.
30 @details This OPTICS algorithm is KD-tree optimized.
32 @see ordering_analyser
39 @brief Display cluster-ordering (reachability-plot) diagram.
41 @param[in] analyser (ordering_analyser): cluster-ordering analyser whose ordering diagram should be displayed.
42 @param[in] amount_clusters (uint): if it is not 'None' then it displays connectivity radius line that can used for allocation of specified amount of clusters
43 and colorize diagram by corresponding cluster colors.
45 Example demonstrates general abilities of 'ordering_visualizer' class:
47 # Display cluster-ordering diagram with connectivity radius is used for allocation of three clusters.
48 ordering_visualizer.show_ordering_diagram(analyser, 3);
50 # Display cluster-ordering diagram without radius.
51 ordering_visualizer.show_ordering_diagram(analyser);
55 ordering = analyser.cluster_ordering
56 axis = plt.subplot(111)
58 if amount_clusters
is not None:
59 radius, borders = analyser.calculate_connvectivity_radius(amount_clusters)
63 current_index_border = 0
64 for index_border
in range(len(borders)):
65 right_index_border = borders[index_border]
66 axis.bar(range(left_index_border, right_index_border), ordering[left_index_border:right_index_border], width = 1.0, color = color_list.TITLES[index_border])
67 left_index_border = right_index_border
68 current_index_border = index_border
70 axis.bar(range(left_index_border, len(ordering)), ordering[left_index_border:len(ordering)], width = 1.0, color = color_list.TITLES[current_index_border + 1])
72 plt.xlim([0, len(ordering)])
74 plt.axhline(y = radius, linewidth = 2, color =
'black')
75 plt.text(0, radius + radius * 0.03,
" Radius: " + str(round(radius, 4)) +
";\n Clusters: " + str(amount_clusters), color =
'b', fontsize = 10)
78 axis.bar(range(0, len(ordering)), ordering[0:len(ordering)], width = 1.0, color =
'black')
79 plt.xlim([0, len(ordering)])
86 @brief Analyser of cluster ordering diagram.
87 @details Using cluster-ordering it is able to connectivity radius for allocation of specified amount of clusters and
88 calculate amount of clusters using specified connectivity radius. Cluster-ordering is formed by OPTICS algorithm
89 during cluster analysis.
98 @brief (list) Returns values of dataset cluster ordering.
106 @brief Analyser of ordering diagram that is based on reachability-distances.
108 @see calculate_connvectivity_radius
116 @brief Returns length of clustering-ordering diagram.
124 @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram and marks borders of clusters using indexes of values of ordering diagram.
125 @details Parameter 'maximum_iterations' is used to protect from hanging when it is impossible to allocate specified number of clusters.
127 @param[in] amount_clusters (uint): amount of clusters that should be allocated by calculated connectivity radius.
128 @param[in] maximum_iterations (uint): maximum number of iteration for searching connectivity radius to allocated specified amount of clusters (by default it is restricted by 100 iterations).
130 @return (double, list) Value of connectivity radius and borders of clusters like (radius, borders), radius may be 'None' as well as borders may be '[]'
131 if connectivity radius hasn't been found for the specified amount of iterations.
137 upper_distance = maximum_distance
143 if amount <= amount_clusters:
144 for _
in range(maximum_iterations):
145 radius = (lower_distance + upper_distance) / 2.0
148 if amount == amount_clusters:
155 elif amount > amount_clusters:
156 lower_distance = radius
158 elif amount < amount_clusters:
159 upper_distance = radius
161 return result, borders
166 @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram and borders between them.
167 @details When growth of reachability-distances is detected than it is considered as a start point of cluster,
168 than pick is detected and after that recession is observed until new growth (that means end of the
169 current cluster and start of a new one) or end of diagram.
171 @param[in] radius (double): connectivity radius that is used for cluster allocation.
173 @return (unit, list) Amount of clusters that can be allocated by the connectivity radius on ordering diagram and borders between them using indexes
174 from ordering diagram (amount_clusters, border_clusters).
180 cluster_start =
False
182 total_similarity =
True
183 previous_cluster_distance =
None
184 previous_distance =
None
188 for index_ordering
in range(len(self.
__ordering)):
190 if distance >= radius:
191 if cluster_start
is False:
195 if index_ordering != 0:
196 cluster_borders.append(index_ordering)
199 if (distance < previous_cluster_distance)
and (cluster_pick
is False):
202 elif (distance > previous_cluster_distance)
and (cluster_pick
is True):
206 if index_ordering != 0:
207 cluster_borders.append(index_ordering)
209 previous_cluster_distance = distance
212 cluster_start =
False
215 if (previous_distance
is not None)
and (distance != previous_distance):
216 total_similarity =
False
218 previous_distance = distance
220 if (total_similarity
is True)
and (previous_distance > radius):
223 return amount_clusters, cluster_borders
228 @brief Object description that used by OPTICS algorithm for cluster analysis.
232 def __init__(self, index, core_distance = None, reachability_distance = None):
234 @brief Constructor of object description in optics terms.
236 @param[in] index (uint): Index of the object in the data set.
237 @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
238 @param[in] reachability_distance (double): Reachability distance to this object.
256 @brief Returns string representation of the optics descriptor.
265 @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure) with KD-tree optimization (ccore options is supported).
266 @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data.
267 Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
268 for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
269 bigger than real - because it will be calculated by the algorithms if requested amount of clusters is not allocated.
271 @image html optics_example_clustering.png "Scheme how does OPTICS works. At the beginning only one cluster is allocated, but two is requested. At the second step OPTICS calculates connectivity radius using cluster-ordering and performs final cluster allocation."
273 Clustering example using sample 'Chainlink':
275 from pyclustering.cluster import cluster_visualizer
276 from pyclustering.cluster.optics import optics, ordering_analyser, ordering_visualizer
277 from pyclustering.samples.definitions import FCPS_SAMPLES
278 from pyclustering.utils import read_sample
280 # Read sample for clustering from some file.
281 sample = read_sample(FCPS_SAMPLES.SAMPLE_CHAINLINK)
283 # Run cluster analysis where connectivity radius is bigger than real.
286 optics_instance = optics(sample, radius, neighbors)
288 # Performs cluster analysis.
289 optics_instance.process()
291 # Obtain results of clustering.
292 clusters = optics_instance.get_clusters()
293 noise = optics_instance.get_noise()
294 ordering = optics_instance.get_ordering()
296 # Visualize clustering results.
297 visualizer = cluster_visualizer()
298 visualizer.append_clusters(clusters, sample)
302 analyser = ordering_analyser(ordering)
303 ordering_visualizer.show_ordering_diagram(analyser, 2)
306 Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
308 from pyclustering.cluster import cluster_visualizer
309 from pyclustering.cluster.optics import optics, ordering_analyser, ordering_visualizer
310 from pyclustering.samples.definitions import FCPS_SAMPLES
311 from pyclustering.utils import read_sample
313 # Read sample for clustering from some file
314 sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
316 # Run cluster analysis where connectivity radius is bigger than real
319 amount_of_clusters = 3
320 optics_instance = optics(sample, radius, neighbors, amount_of_clusters)
322 # Performs cluster analysis
323 optics_instance.process()
325 # Obtain results of clustering
326 clusters = optics_instance.get_clusters()
327 noise = optics_instance.get_noise()
328 ordering = optics_instance.get_ordering()
330 # Visualize ordering diagram
331 analyser = ordering_analyser(ordering)
332 ordering_visualizer.show_ordering_diagram(analyser, amount_of_clusters)
334 # Visualize clustering results
335 visualizer = cluster_visualizer()
336 visualizer.append_clusters(clusters, sample)
340 Here is an example where OPTICS extracts outliers from sample 'Tetra':
342 from pyclustering.cluster import cluster_visualizer
343 from pyclustering.cluster.optics import optics
344 from pyclustering.samples.definitions import FCPS_SAMPLES
345 from pyclustering.utils import read_sample
347 # Read sample for clustering from some file.
348 sample = read_sample(FCPS_SAMPLES.SAMPLE_TETRA)
350 # Run cluster analysis where connectivity radius is bigger than real.
353 optics_instance = optics(sample, radius, neighbors)
355 # Performs cluster analysis.
356 optics_instance.process()
358 # Obtain results of clustering.
359 clusters = optics_instance.get_clusters()
360 noise = optics_instance.get_noise()
362 # Visualize clustering results (clusters and outliers).
363 visualizer = cluster_visualizer()
364 visualizer.append_clusters(clusters, sample)
365 visualizer.append_cluster(noise, sample, marker='x')
369 Visualization result of allocated clusters and outliers is presented on the image below:
370 @image html optics_noise_tetra.png "Clusters and outliers extracted by OPTICS algorithm from sample 'Tetra'."
374 def __init__(self, sample, eps, minpts, amount_clusters=None, ccore=True, **kwargs):
376 @brief Constructor of clustering algorithm OPTICS.
378 @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
379 @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
380 @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
381 @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
382 In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
383 in connectivity radius usage.
384 @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
385 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type').
387 <b>Keyword Args:</b><br>
388 - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
402 self.
__data_type = kwargs.get(
'data_type',
'points')
410 self.
__ccore = ccore_library.workable()
417 @brief Performs cluster analysis in line with rules of OPTICS algorithm.
419 @return (optics) Returns itself (OPTICS instance).
436 def __process_by_ccore(self):
438 @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
443 objects_indexes, objects_core_distances, objects_reachability_distances) = \
447 for i
in range(len(objects_indexes)):
448 if objects_core_distances[i] < 0.0:
449 objects_core_distances[i] =
None
451 if objects_reachability_distances[i] < 0.0:
452 objects_reachability_distances[i] =
None
454 optics_object =
optics_descriptor(objects_indexes[i], objects_core_distances[i], objects_reachability_distances[i])
455 optics_object.processed =
True
460 def __process_by_python(self):
462 @brief Performs cluster analysis using python code.
474 if radius
is not None:
479 def __initialize(self, sample):
481 @brief Initializes internal states and resets clustering results in line with input sample.
493 def __allocate_clusters(self):
495 @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
502 if optic_object.processed
is False:
510 @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
512 @return (list) List of allocated clusters.
526 @brief Returns list of noise that contains indexes of objects that corresponds to input data.
528 @return (list) List of allocated noise objects.
542 @brief Returns clustering ordering information about the input data set.
543 @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
545 @return (ordering_analyser) Analyser of clustering ordering.
551 @see get_optics_objects()
559 for index_object
in cluster:
561 if optics_object.reachability_distance
is not None:
562 self.
__ordering.append(optics_object.reachability_distance)
569 @brief Returns OPTICS objects where each object contains information about index of point from processed data,
570 core distance and reachability distance.
572 @return (list) OPTICS objects.
577 @see optics_descriptor
586 @brief Returns connectivity radius that is calculated and used for clustering by the algorithm.
587 @details Connectivity radius may be changed only in case of usage additional parameter of the algorithm - amount of clusters for allocation.
589 @return (double) Connectivity radius.
594 @see get_optics_objects()
603 @brief Returns clustering result representation type that indicate how clusters are encoded.
605 @return (type_encoding) Clustering result representation.
611 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
614 def __create_neighbor_searcher(self, data_type):
616 @brief Returns neighbor searcher in line with data type.
618 @param[in] data_type (string): Data type (points or distance matrix).
621 if data_type ==
'points':
623 elif data_type ==
'distance_matrix':
626 raise TypeError(
"Unknown type of data is specified '%s'" % data_type)
629 def __expand_cluster_order(self, optics_object):
631 @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
632 Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
633 Order database is updated during expanding.
635 @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
639 optics_object.processed =
True
642 optics_object.reachability_distance =
None
647 if len(neighbors_descriptor) >= self.
__minpts:
648 neighbors_descriptor.sort(key =
lambda obj: obj[1])
649 optics_object.core_distance = neighbors_descriptor[self.
__minpts - 1][1]
655 while len(order_seed) > 0:
656 optic_descriptor = order_seed[0]
657 order_seed.remove(optic_descriptor)
660 optic_descriptor.processed =
True
664 if len(neighbors_descriptor) >= self.
__minpts:
665 neighbors_descriptor.sort(key =
lambda obj: obj[1])
666 optic_descriptor.core_distance = neighbors_descriptor[self.
__minpts - 1][1]
670 optic_descriptor.core_distance =
None
673 optics_object.core_distance =
None
676 def __extract_clusters(self):
678 @brief Extract clusters and noise from order database.
687 if (optics_object.reachability_distance
is None)
or (optics_object.reachability_distance > self.
__eps):
688 if (optics_object.core_distance
is not None)
and (optics_object.core_distance <= self.
__eps):
689 self.
__clusters.append([ optics_object.index_object ])
692 self.
__noise.append(optics_object.index_object)
694 current_cluster.append(optics_object.index_object)
697 def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
699 @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
701 @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
702 @param[in] neighbors_descriptors (list): List of neighbors of core-object.
703 @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
707 for neighbor_descriptor
in neighbors_descriptors:
708 index_neighbor = neighbor_descriptor[0]
709 current_reachable_distance = neighbor_descriptor[1]
712 reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance)
714 self.
__optics_objects[index_neighbor].reachability_distance = reachable_distance
717 index_insertion = len(order_seed)
718 for index_seed
in range(0, len(order_seed)):
719 if reachable_distance < order_seed[index_seed].reachability_distance:
720 index_insertion = index_seed
726 if reachable_distance < self.
__optics_objects[index_neighbor].reachability_distance:
727 self.
__optics_objects[index_neighbor].reachability_distance = reachable_distance
728 order_seed.sort(key=
lambda obj: obj.reachability_distance)
731 def __neighbor_indexes_points(self, optic_object):
733 @brief Return neighbors of the specified object in case of sequence of points.
735 @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
737 @return (list) List of indexes of neighbors in line the connectivity radius.
741 return [[node_tuple[1].payload, math.sqrt(node_tuple[0])]
for node_tuple
in kdnodes
if
742 node_tuple[1].payload != optic_object.index_object]
745 def __neighbor_indexes_distance_matrix(self, optic_object):
747 @brief Return neighbors of the specified object in case of distance matrix.
749 @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
751 @return (list) List of indexes of neighbors in line the connectivity radius.
755 return [[index_neighbor, distances[index_neighbor]]
for index_neighbor
in range(len(distances))
756 if ((distances[index_neighbor] <= self.
__eps)
and (index_neighbor != optic_object.index_object))]
759 def __verify_arguments(self):
761 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
765 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__sample_pointer))
768 raise ValueError(
"Connectivity radius (current value: '%d') should be greater or equal to 0." % self.
__eps)
771 raise ValueError(
"Minimum number of neighbors (current value: '%d') should be greater than 0." %
775 raise ValueError(
"Amount of clusters (current value: '%d') should be greater than 0." %