3 @brief Cluster analysis algorithm: agglomerative algorithm. 4 @details Implementation based on paper @cite book::algorithms_for_clustering_data. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 28 from enum
import IntEnum;
34 from pyclustering.core.wrapper
import ccore_library
36 import pyclustering.core.agglomerative_wrapper
as wrapper;
41 @brief Enumerator of types of link between clusters. 60 @brief Class represents agglomerative algorithm for cluster analysis. 61 @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and 62 step by step finds the best pair of clusters for merge until required amount of clusters is obtained. 64 Example of agglomerative algorithm where centroid link is used: 66 from pyclustering.cluster.agglomerative import agglomerative, type_link 67 from pyclustering.cluster import cluster_visualizer 68 from pyclustering.samples.definitions import FCPS_SAMPLES 69 from pyclustering.utils import read_sample 71 # Sample for cluster analysis (represented by list) 72 sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET) 74 # Create object that uses python code only 75 agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True) 78 agglomerative_instance.process() 80 # Obtain results of clustering 81 clusters = agglomerative_instance.get_clusters() 83 # Visualize clustering results 84 visualizer = cluster_visualizer() 85 visualizer.append_clusters(clusters, sample) 89 There is example of clustering 'LSUN' sample: 91 from pyclustering.cluster.agglomerative import agglomerative, type_link 92 from pyclustering.cluster import cluster_visualizer 93 from pyclustering.samples.definitions import FCPS_SAMPLES 94 from pyclustering.utils import read_sample 96 # sample Lsun for cluster analysis 97 lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN) 99 # create instance of the algorithm that will use ccore library (the last argument) 100 agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True) 103 agglomerative_instance.process() 105 # get result and visualize it 106 lsun_clusters = agglomerative_instance.get_clusters() 107 visualizer = cluster_visualizer() 108 visualizer.append_clusters(lsun_clusters, lsun_sample) 112 Example of agglomerative clustering using different links: 113 @image html agglomerative_lsun_clustering_single_link.png 117 def __init__(self, data, number_clusters, link = None, ccore = True):
119 @brief Constructor of agglomerative hierarchical algorithm. 121 @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example 122 [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]]. 123 @param[in] number_clusters (uint): Number of clusters that should be allocated. 124 @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. 125 @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False'). 141 self.
__ccore = ccore_library.workable()
149 @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. 151 @return (agglomerative) Returns itself (Agglomerative instance). 163 current_number_clusters = len(self.
__clusters)
167 current_number_clusters = len(self.
__clusters)
174 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 176 @remark Results of clustering can be obtained using corresponding gets methods. 178 @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. 189 @brief Returns clustering result representation type that indicate how clusters are encoded. 191 @return (type_encoding) Clustering result representation. 197 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
200 def __merge_similar_clusters(self):
202 @brief Merges the most similar clusters in line with link type. 219 raise NameError(
'Not supported similarity is used')
222 def __merge_by_average_link(self):
224 @brief Merges the most similar clusters in line with average link type. 228 minimum_average_distance = float(
'Inf')
230 for index_cluster1
in range(0, len(self.
__clusters)):
231 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
234 candidate_average_distance = 0.0
235 for index_object1
in self.
__clusters[index_cluster1]:
236 for index_object2
in self.
__clusters[index_cluster2]:
239 candidate_average_distance /= (len(self.
__clusters[index_cluster1]) + len(self.
__clusters[index_cluster2]))
241 if candidate_average_distance < minimum_average_distance:
242 minimum_average_distance = candidate_average_distance
243 indexes = [index_cluster1, index_cluster2]
249 def __merge_by_centroid_link(self):
251 @brief Merges the most similar clusters in line with centroid link type. 255 minimum_centroid_distance = float(
'Inf')
258 for index1
in range(0, len(self.
__centers)):
259 for index2
in range(index1 + 1, len(self.
__centers)):
261 if distance < minimum_centroid_distance:
262 minimum_centroid_distance = distance
263 indexes = [index1, index2]
272 def __merge_by_complete_link(self):
274 @brief Merges the most similar clusters in line with complete link type. 278 minimum_complete_distance = float(
'Inf')
281 for index_cluster1
in range(0, len(self.
__clusters)):
282 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
285 if candidate_maximum_distance < minimum_complete_distance:
286 minimum_complete_distance = candidate_maximum_distance
287 indexes = [index_cluster1, index_cluster2]
293 def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
295 @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. 297 @param[in] (uint) Index of the first cluster. 298 @param[in] (uint) Index of the second cluster. 300 @return The farthest euclidean distance between two clusters. 303 candidate_maximum_distance = 0.0
304 for index_object1
in self.
__clusters[index_cluster1]:
305 for index_object2
in self.
__clusters[index_cluster2]:
308 if distance > candidate_maximum_distance:
309 candidate_maximum_distance = distance
311 return candidate_maximum_distance
314 def __merge_by_signle_link(self):
316 @brief Merges the most similar clusters in line with single link type. 320 minimum_single_distance = float(
'Inf')
323 for index_cluster1
in range(0, len(self.
__clusters)):
324 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
327 if candidate_minimum_distance < minimum_single_distance:
328 minimum_single_distance = candidate_minimum_distance
329 indexes = [index_cluster1, index_cluster2]
335 def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
337 @brief Finds two nearest objects in two specified clusters and returns distance between them. 339 @param[in] (uint) Index of the first cluster. 340 @param[in] (uint) Index of the second cluster. 342 @return The nearest euclidean distance between two clusters. 345 candidate_minimum_distance = float(
'Inf')
347 for index_object1
in self.
__clusters[index_cluster1]:
348 for index_object2
in self.
__clusters[index_cluster2]:
350 if distance < candidate_minimum_distance:
351 candidate_minimum_distance = distance
353 return candidate_minimum_distance
356 def __calculate_center(self, cluster):
358 @brief Calculates new center. 360 @return (list) New value of the center of the specified cluster. 365 center = [0] * dimension
366 for index_point
in cluster:
367 for index_dimension
in range(0, dimension):
368 center[index_dimension] += self.
__pointer_data[index_point][index_dimension]
370 for index_dimension
in range(0, dimension):
371 center[index_dimension] /= len(cluster)
376 def __verify_arguments(self):
378 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness. 382 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
385 raise ValueError(
"Amount of cluster (current value: '%d') for allocation should be greater than 0." %
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them...
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Utils that are used by modules of pyclustering.
Module for representing clustering results.
Enumerator of types of link between clusters.
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Class represents agglomerative algorithm for cluster analysis.
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
def __calculate_center(self, cluster)
Calculates new center.
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.