@brief Cluster analysis algorithm: K-Medoids.
@details Implementation based on paper @cite inproceedings::cluster::kmedoids::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@copyright BSD-3-Clause
import numpy

from pyclustering.cluster.encoder import type_encoding
from pyclustering.utils.metric import distance_metric, type_metric

import pyclustering.core.kmedoids_wrapper as wrapper

from pyclustering.core.wrapper import ccore_library
from pyclustering.core.metric_wrapper import metric_wrapper
@brief Class represents the clustering algorithm K-Medoids (PAM algorithm).
@details PAM is a partitioning clustering algorithm that uses medoids instead of centers, as in the case of the K-Means
          algorithm. A medoid is the object with the smallest dissimilarity to all other objects in its cluster. The complexity
          of the PAM algorithm is \f$O\left ( k\left ( n-k \right )^{2} \right )\f$.
The following example shows how the PAM algorithm is used to cluster the 'TwoDiamonds' data:
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

# Load list of points for cluster analysis.
sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

# Initialize initial medoids using the K-Means++ algorithm.
initial_medoids = kmeans_plusplus_initializer(sample, 2).initialize(return_index=True)

# Create instance of K-Medoids (PAM) algorithm.
kmedoids_instance = kmedoids(sample, initial_medoids)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()

# Print allocated clusters.
print("Clusters:", clusters)

# Display clustering results.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.append_cluster(initial_medoids, sample, markersize=12, marker='*', color='gray')
visualizer.append_cluster(medoids, sample, markersize=14, marker='*', color='black')
visualizer.show()
@image html pam_clustering_two_diamonds.png "Fig. 1. K-Medoids (PAM) clustering results for the 'TwoDiamonds' sample."
The metric for distance calculation between points can be specified via the additional parameter 'metric':
from pyclustering.utils.metric import distance_metric, type_metric

# Create Minkowski distance metric with degree equal to '2'.
metric = distance_metric(type_metric.MINKOWSKI, degree=2)

# Create K-Medoids algorithm with the specific distance metric.
kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()
A distance matrix can be used instead of a sequence of points to increase performance; for that purpose the parameter 'data_type' should be used:
from pyclustering.utils import calculate_distance_matrix, read_sample

# Calculate distance matrix for the sample.
sample = read_sample(path_to_sample)
matrix = calculate_distance_matrix(sample)

# Create K-Medoids algorithm for processing the distance matrix instead of points.
kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix')

# Run cluster analysis and obtain results.
kmedoids_instance.process()

clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()
def __init__(self, data, initial_index_medoids, tolerance=0.0001, ccore=True, **kwargs):

@brief Constructor of the K-Medoids clustering algorithm.

@param[in] data (list): Input data that is presented as a list of points (objects); each point should be represented by a list or tuple.
@param[in] initial_index_medoids (list): Indexes of initial medoids (indexes of points in the input data).
@param[in] tolerance (double): Stop condition: if the maximum value of the distance change of the cluster medoids is less than the tolerance, then the algorithm stops processing.
@param[in] ccore (bool): If True, then the CCORE library (C++ pyclustering library) is used for clustering instead of the Python code.
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
<b>Keyword Args:</b><br>
    - metric (distance_metric): Metric that is used for distance calculation between two points.
    - data_type (string): Data type of the input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
    - itermax (uint): Maximum number of iterations for cluster analysis (see the usage sketch below).
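A minimal usage sketch of the keyword arguments; the sample, the medoid indexes and the chosen metric below are illustrative placeholders, not values prescribed by the library:

from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import distance_metric, type_metric

# Illustrative two-dimensional sample and initial medoid indexes.
sample = [[1.0, 1.2], [1.1, 1.0], [5.0, 5.2], [5.1, 5.0]]
initial_medoids = [0, 2]

# Manhattan distance, input treated as a list of points, at most 100 iterations.
kmedoids_instance = kmedoids(sample, initial_medoids,
                             metric=distance_metric(type_metric.MANHATTAN),
                             data_type='points',
                             itermax=100)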
self.__data_type = kwargs.get('data_type', 'points')
self.__itermax = kwargs.get('itermax', 200)

# The C++ core can be used only when the metric is not a user-defined Python callable.
self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
if self.__ccore:
    self.__ccore = ccore_library.workable()
@brief Performs cluster analysis in line with the rules of the K-Medoids algorithm.

@return (kmedoids) Returns itself (K-Medoids instance).

@remark Results of clustering can be obtained using the corresponding get methods.
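Because the method returns the instance itself, calls can be chained; a small illustrative sketch (the initial medoid indexes here are arbitrary):

from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

# 'process()' returns the K-Medoids instance, so the result getters can be chained directly.
clusters = kmedoids(sample, [5, 400]).process().get_clusters()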
ccore_metric = metric_wrapper.create_instance(self.__metric)

changes = float('inf')
previous_deviation, current_deviation = float('inf'), float('inf')

# Apply the best swap and track how much the total deviation improved.
if swap_cost != float('inf'):
    previous_deviation = current_deviation
    current_deviation = self.__update_clusters()
    changes = previous_deviation - current_deviation
@brief Calculates the closest cluster for each point.

@param[in] points (array_like): Points for which the closest clusters are calculated.

@return (list) List of closest clusters for each point. Each cluster is denoted by its index. An empty
          collection is returned if the 'process()' method was not called.

An example of how to calculate (or predict) the closest cluster for specified points:
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.utils import read_sample

# Load list of points for cluster analysis.
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

# Initial medoids for sample 'Simple3'.
initial_medoids = [4, 12, 25, 37]

# Create instance of K-Medoids algorithm with the prepared medoids.
kmedoids_instance = kmedoids(sample, initial_medoids)

# Run cluster analysis.
kmedoids_instance.process()

# Calculate the closest cluster for the following two points.
points = [[0.35, 0.5], [2.5, 2.0]]
closest_clusters = kmedoids_instance.predict(points)
print(closest_clusters)
# Distance from every queried point to every medoid; the closest medoid defines the predicted cluster.
differences = numpy.zeros((len(points), len(medoids)))
for index_point in range(len(points)):
    differences[index_point] = [self.__metric(points[index_point], center) for center in medoids]

return numpy.argmin(differences, axis=1)
@brief Returns a list of allocated clusters; each cluster contains indexes of objects from the input data.
@brief Returns a list of medoids of the allocated clusters, represented by indexes from the input data.
@brief Returns the clustering result representation type that indicates how the clusters are encoded.

@return (type_encoding) Clustering result representation.
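The returned encoding can be passed to the cluster encoder to convert the result into another representation; a hedged sketch assuming the standard 'pyclustering.cluster.encoder' module:

from pyclustering.cluster.encoder import cluster_encoder, type_encoding

# Convert index-list clusters to a flat list of labels (one label per input point).
encoder = cluster_encoder(kmedoids_instance.get_cluster_encoding(),
                          kmedoids_instance.get_clusters(), sample)
encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
labels = encoder.get_clusters()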
return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
def __verify_arguments(self):

@brief Verifies input parameters of the algorithm and throws an exception in case of incorrectness.

raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))

raise ValueError("Initial medoids are empty (size: '%d')." % len(self.__pointer_data))

raise ValueError("Tolerance (current value: '%f') should be greater or equal to 0." % self.__tolerance)

raise ValueError("Maximum iterations (current value: '%d') should be greater or equal to 0." % self.__itermax)
def __create_distance_calculator(self):

@brief Creates a distance calculator in line with the algorithm's parameters.

@return (callable) Distance calculator.
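For illustration only, the returned callable behaves roughly like one of the following sketches, depending on 'data_type' ('metric', 'data' and 'matrix' stand in for the corresponding instance attributes):

# 'points': compute the metric between the two points referenced by index.
calculator = lambda index1, index2: metric(data[index1], data[index2])

# 'distance_matrix': read the precomputed distance directly from the matrix.
calculator = lambda index1, index2: matrix[index1][index2]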
return lambda index1, index2: self.__pointer_data.item((index1, index2))

raise TypeError("Unknown type of data is specified '%s'" % self.__data_type)
def __update_clusters(self):

@brief Calculates the distance from each point to each cluster medoid.
@details Each point is captured by the cluster with the nearest medoid; as a result the clusters are updated.

@return (double) Total deviation (sum of distances from each point to its closest medoid).
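The total deviation corresponds to \f$\sum_{i=1}^{n}\min_{m \in M} d\left ( x_{i}, m \right )\f$, where \f$M\f$ is the current set of medoids and \f$d\f$ is the configured distance metric.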
total_deviation = 0.0

# Track the distances from the current point to its nearest and second-nearest medoids.
dist_optim_first = float('Inf')
dist_optim_second = float('Inf')

if dist < dist_optim_first:
    dist_optim_second = dist_optim_first

    dist_optim_first = dist
elif dist < dist_optim_second:
    dist_optim_second = dist

total_deviation += dist_optim_first
self.__clusters[index_optim].append(index_point)
self.__labels[index_point] = index_optim

return total_deviation
def __swap_medoids(self):

@brief Swaps an existing medoid with non-medoid points in order to find the most optimal medoid.

@return (double) Cost that is needed to swap two medoids.
# Track the best (lowest-cost) swap found across all clusters and candidate medoids.
optimal_swap_cost = float('inf')
optimal_index_cluster = None
optimal_index_medoid = None

for index_cluster in range(len(self.__clusters)):

    if swap_cost < optimal_swap_cost:
        optimal_swap_cost = swap_cost
        optimal_index_cluster = index_cluster
        optimal_index_medoid = candidate_medoid_index

if optimal_index_cluster is not None:

return optimal_swap_cost
def __calculate_swap_cost(self, index_candidate, cluster_index):

@brief Calculates the cost of swapping `index_candidate` with the current medoid of cluster `cluster_index`.

@param[in] index_candidate (uint): Index of the point that is considered as a medoid candidate.
@param[in] cluster_index (uint): Index of the cluster whose current medoid is used for the calculation.

@return (double) Cost that is needed to swap the medoids.
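Informally, and only as a hedged sketch of the usual PAM-style accounting (the exact bookkeeping follows the branches in the code below), the swap cost can be read as \f$\Delta = \sum_{p \in C}\left ( \min\left ( d(p, c), d_{2}(p) \right ) - d_{1}(p) \right ) + \sum_{p \notin C}\min\left ( d(p, c) - d_{1}(p), 0 \right )\f$, where \f$c\f$ is the candidate point, \f$C\f$ is the cluster `cluster_index`, and \f$d_{1}(p)\f$, \f$d_{2}(p)\f$ are the distances from \f$p\f$ to its nearest and second-nearest medoids.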
if index_point == index_candidate:
    continue    # the candidate itself does not contribute to the swap cost

if self.__labels[index_point] == cluster_index: