3 @brief Cluster analysis algorithm: agglomerative algorithm.
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
13 from enum
import IntEnum
19 from pyclustering.core.wrapper
import ccore_library
21 import pyclustering.core.agglomerative_wrapper
as wrapper
26 @brief Enumerator of types of link between clusters.
45 @brief Class represents agglomerative algorithm for cluster analysis.
46 @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beginning and
47 step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
49 Example of agglomerative algorithm where centroid link is used:
51 from pyclustering.cluster.agglomerative import agglomerative, type_link
52 from pyclustering.cluster import cluster_visualizer
53 from pyclustering.samples.definitions import FCPS_SAMPLES
54 from pyclustering.utils import read_sample
56 # Sample for cluster analysis (represented by list)
57 sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET)
59 # Create object that uses python code only
60 agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True)
63 agglomerative_instance.process()
65 # Obtain results of clustering
66 clusters = agglomerative_instance.get_clusters()
68 # Visualize clustering results
69 visualizer = cluster_visualizer()
70 visualizer.append_clusters(clusters, sample)
74 There is example of clustering 'LSUN' sample:
76 from pyclustering.cluster.agglomerative import agglomerative, type_link
77 from pyclustering.cluster import cluster_visualizer
78 from pyclustering.samples.definitions import FCPS_SAMPLES
79 from pyclustering.utils import read_sample
81 # sample Lsun for cluster analysis
82 lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
84 # create instance of the algorithm that will use ccore library (the last argument)
85 agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True)
88 agglomerative_instance.process()
90 # get result and visualize it
91 lsun_clusters = agglomerative_instance.get_clusters()
92 visualizer = cluster_visualizer()
93 visualizer.append_clusters(lsun_clusters, lsun_sample)
97 Example of agglomerative clustering using different links:
98 @image html agglomerative_lsun_clustering_single_link.png
102 def __init__(self, data, number_clusters, link = None, ccore = True):
104 @brief Constructor of agglomerative hierarchical algorithm.
106 @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
107 [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
108 @param[in] number_clusters (uint): Number of clusters that should be allocated.
109 @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
110 @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
126 self.
__ccore = ccore_library.workable()
134 @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
136 @return (agglomerative) Returns itself (Agglomerative instance).
148 current_number_clusters = len(self.
__clusters)
152 current_number_clusters = len(self.
__clusters)
159 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
161 @remark Results of clustering can be obtained using corresponding gets methods.
163 @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
174 @brief Returns clustering result representation type that indicate how clusters are encoded.
176 @return (type_encoding) Clustering result representation.
182 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
185 def __merge_similar_clusters(self):
187 @brief Merges the most similar clusters in line with link type.
204 raise NameError(
'Not supported similarity is used')
207 def __merge_by_average_link(self):
209 @brief Merges the most similar clusters in line with average link type.
213 minimum_average_distance = float(
'Inf')
215 for index_cluster1
in range(0, len(self.
__clusters)):
216 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
219 candidate_average_distance = 0.0
220 for index_object1
in self.
__clusters[index_cluster1]:
221 for index_object2
in self.
__clusters[index_cluster2]:
224 candidate_average_distance /= (len(self.
__clusters[index_cluster1]) + len(self.
__clusters[index_cluster2]))
226 if candidate_average_distance < minimum_average_distance:
227 minimum_average_distance = candidate_average_distance
228 indexes = [index_cluster1, index_cluster2]
234 def __merge_by_centroid_link(self):
236 @brief Merges the most similar clusters in line with centroid link type.
240 minimum_centroid_distance = float(
'Inf')
243 for index1
in range(0, len(self.
__centers)):
244 for index2
in range(index1 + 1, len(self.
__centers)):
246 if distance < minimum_centroid_distance:
247 minimum_centroid_distance = distance
248 indexes = [index1, index2]
257 def __merge_by_complete_link(self):
259 @brief Merges the most similar clusters in line with complete link type.
263 minimum_complete_distance = float(
'Inf')
266 for index_cluster1
in range(0, len(self.
__clusters)):
267 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
270 if candidate_maximum_distance < minimum_complete_distance:
271 minimum_complete_distance = candidate_maximum_distance
272 indexes = [index_cluster1, index_cluster2]
278 def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
280 @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
282 @param[in] (uint) Index of the first cluster.
283 @param[in] (uint) Index of the second cluster.
285 @return The farthest euclidean distance between two clusters.
288 candidate_maximum_distance = 0.0
289 for index_object1
in self.
__clusters[index_cluster1]:
290 for index_object2
in self.
__clusters[index_cluster2]:
293 if distance > candidate_maximum_distance:
294 candidate_maximum_distance = distance
296 return candidate_maximum_distance
299 def __merge_by_signle_link(self):
301 @brief Merges the most similar clusters in line with single link type.
305 minimum_single_distance = float(
'Inf')
308 for index_cluster1
in range(0, len(self.
__clusters)):
309 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
312 if candidate_minimum_distance < minimum_single_distance:
313 minimum_single_distance = candidate_minimum_distance
314 indexes = [index_cluster1, index_cluster2]
320 def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
322 @brief Finds two nearest objects in two specified clusters and returns distance between them.
324 @param[in] (uint) Index of the first cluster.
325 @param[in] (uint) Index of the second cluster.
327 @return The nearest euclidean distance between two clusters.
330 candidate_minimum_distance = float(
'Inf')
332 for index_object1
in self.
__clusters[index_cluster1]:
333 for index_object2
in self.
__clusters[index_cluster2]:
335 if distance < candidate_minimum_distance:
336 candidate_minimum_distance = distance
338 return candidate_minimum_distance
341 def __calculate_center(self, cluster):
343 @brief Calculates new center.
345 @return (list) New value of the center of the specified cluster.
350 center = [0] * dimension
351 for index_point
in cluster:
352 for index_dimension
in range(0, dimension):
353 center[index_dimension] += self.
__pointer_data[index_point][index_dimension]
355 for index_dimension
in range(0, dimension):
356 center[index_dimension] /= len(cluster)
361 def __verify_arguments(self):
363 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
367 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
370 raise ValueError(
"Amount of cluster (current value: '%d') for allocation should be greater than 0." %