3 @brief Cluster analysis algorithm: agglomerative algorithm. 4 @details Implementation based on paper @cite book::algorithms_for_clustering_data. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 28 from enum
import IntEnum;
34 from pyclustering.core.wrapper
import ccore_library
36 import pyclustering.core.agglomerative_wrapper
as wrapper;
41 @brief Enumerator of types of link between clusters. 60 @brief Class represents agglomerative algorithm for cluster analysis. 61 @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and 62 step by step finds the best pair of clusters for merge until required amount of clusters is obtained. 64 Example of agglomerative algorithm where centroid link is used: 66 from pyclustering.cluster.agglomerative import agglomerative, type_link 67 from pyclustering.cluster import cluster_visualizer 68 from pyclustering.samples.definitions import FCPS_SAMPLES 69 from pyclustering.utils import read_sample 71 # Sample for cluster analysis (represented by list) 72 sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET) 74 # Create object that uses python code only 75 agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True) 78 agglomerative_instance.process() 80 # Obtain results of clustering 81 clusters = agglomerative_instance.get_clusters() 83 # Visualize clustering results 84 visualizer = cluster_visualizer() 85 visualizer.append_clusters(clusters, sample) 89 There is example of clustering 'LSUN' sample: 91 from pyclustering.cluster.agglomerative import agglomerative, type_link 92 from pyclustering.cluster import cluster_visualizer 93 from pyclustering.samples.definitions import FCPS_SAMPLES 94 from pyclustering.utils import read_sample 96 # sample Lsun for cluster analysis 97 lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN) 99 # create instance of the algorithm that will use ccore library (the last argument) 100 agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True) 103 agglomerative_instance.process() 105 # get result and visualize it 106 lsun_clusters = agglomerative_instance.get_clusters() 107 visualizer = cluster_visualizer() 108 visualizer.append_clusters(lsun_clusters, lsun_sample) 112 Example of agglomerative clustering using different links: 113 @image html agglomerative_lsun_clustering_single_link.png 117 def __init__(self, data, number_clusters, link = None, ccore = True):
119 @brief Constructor of agglomerative hierarchical algorithm. 121 @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example 122 [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]]. 123 @param[in] number_clusters (uint): Number of clusters that should be allocated. 124 @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. 125 @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False'). 139 self.
__ccore = ccore_library.workable()
147 @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. 159 current_number_clusters = len(self.
__clusters);
163 current_number_clusters = len(self.
__clusters);
168 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 170 @remark Results of clustering can be obtained using corresponding gets methods. 172 @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. 183 @brief Returns clustering result representation type that indicate how clusters are encoded. 185 @return (type_encoding) Clustering result representation. 191 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
194 def __merge_similar_clusters(self):
196 @brief Merges the most similar clusters in line with link type. 213 raise NameError(
'Not supported similarity is used');
216 def __merge_by_average_link(self):
218 @brief Merges the most similar clusters in line with average link type. 222 minimum_average_distance = float(
'Inf');
224 for index_cluster1
in range(0, len(self.
__clusters)):
225 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
228 candidate_average_distance = 0.0;
229 for index_object1
in self.
__clusters[index_cluster1]:
230 for index_object2
in self.
__clusters[index_cluster2]:
233 candidate_average_distance /= (len(self.
__clusters[index_cluster1]) + len(self.
__clusters[index_cluster2]));
235 if (candidate_average_distance < minimum_average_distance):
236 minimum_average_distance = candidate_average_distance;
237 indexes = [index_cluster1, index_cluster2];
243 def __merge_by_centroid_link(self):
245 @brief Merges the most similar clusters in line with centroid link type. 249 minimum_centroid_distance = float(
'Inf');
252 for index1
in range(0, len(self.
__centers)):
253 for index2
in range(index1 + 1, len(self.
__centers)):
255 if (distance < minimum_centroid_distance):
256 minimum_centroid_distance = distance;
257 indexes = [index1, index2];
266 def __merge_by_complete_link(self):
268 @brief Merges the most similar clusters in line with complete link type. 272 minimum_complete_distance = float(
'Inf');
275 for index_cluster1
in range(0, len(self.
__clusters)):
276 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
279 if (candidate_maximum_distance < minimum_complete_distance):
280 minimum_complete_distance = candidate_maximum_distance;
281 indexes = [index_cluster1, index_cluster2];
287 def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
289 @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. 291 @param[in] (uint) Index of the first cluster. 292 @param[in] (uint) Index of the second cluster. 294 @return The farthest euclidean distance between two clusters. 297 candidate_maximum_distance = 0.0;
298 for index_object1
in self.
__clusters[index_cluster1]:
299 for index_object2
in self.
__clusters[index_cluster2]:
302 if (distance > candidate_maximum_distance):
303 candidate_maximum_distance = distance;
305 return candidate_maximum_distance;
308 def __merge_by_signle_link(self):
310 @brief Merges the most similar clusters in line with single link type. 314 minimum_single_distance = float(
'Inf');
317 for index_cluster1
in range(0, len(self.
__clusters)):
318 for index_cluster2
in range(index_cluster1 + 1, len(self.
__clusters)):
321 if (candidate_minimum_distance < minimum_single_distance):
322 minimum_single_distance = candidate_minimum_distance;
323 indexes = [index_cluster1, index_cluster2];
329 def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
331 @brief Finds two nearest objects in two specified clusters and returns distance between them. 333 @param[in] (uint) Index of the first cluster. 334 @param[in] (uint) Index of the second cluster. 336 @return The nearest euclidean distance between two clusters. 339 candidate_minimum_distance = float(
'Inf');
341 for index_object1
in self.
__clusters[index_cluster1]:
342 for index_object2
in self.
__clusters[index_cluster2]:
344 if (distance < candidate_minimum_distance):
345 candidate_minimum_distance = distance;
347 return candidate_minimum_distance;
350 def __calculate_center(self, cluster):
352 @brief Calculates new center. 354 @return (list) New value of the center of the specified cluster. 359 center = [0] * dimension;
360 for index_point
in cluster:
361 for index_dimension
in range(0, dimension):
362 center[index_dimension] += self.
__pointer_data[index_point][index_dimension];
364 for index_dimension
in range(0, dimension):
365 center[index_dimension] /= len(cluster);
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them...
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Utils that are used by modules of pyclustering.
Module for representing clustering results.
Enumerator of types of link between clusters.
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Class represents agglomerative algorithm for cluster analysis.
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
def __calculate_center(self, cluster)
Calculates new center.
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.