3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
20 import pyclustering.core.kmedians_wrapper
as wrapper
22 from pyclustering.core.wrapper
import ccore_library
23 from pyclustering.core.metric_wrapper
import metric_wrapper
28 @brief Class represents clustering algorithm K-Medians.
29 @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
33 from pyclustering.cluster.kmedians import kmedians
34 from pyclustering.cluster import cluster_visualizer
35 from pyclustering.utils import read_sample
36 from pyclustering.samples.definitions import FCPS_SAMPLES
38 # Load list of points for cluster analysis.
39 sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
41 # Create instance of K-Medians algorithm.
42 initial_medians = [[0.0, 0.1], [2.5, 0.7]]
43 kmedians_instance = kmedians(sample, initial_medians)
45 # Run cluster analysis and obtain results.
46 kmedians_instance.process()
47 clusters = kmedians_instance.get_clusters()
48 medians = kmedians_instance.get_medians()
50 # Visualize clustering results.
51 visualizer = cluster_visualizer()
52 visualizer.append_clusters(clusters, sample)
53 visualizer.append_cluster(initial_medians, marker='*', markersize=10)
54 visualizer.append_cluster(medians, marker='*', markersize=10)
60 def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, **kwargs):
62 @brief Constructor of clustering algorithm K-Medians.
64 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
65 @param[in] initial_medians (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
66 @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
67 @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
68 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').
70 <b>Keyword Args:</b><br>
71 - metric (distance_metric): Metric that is used for distance calculation between two points.
72 - itermax (uint): Maximum number of iterations for cluster analysis.
77 self.
__medians = numpy.array(initial_medians)
81 self.
__itermax = kwargs.get(
'itermax', 100)
86 self.
__ccore = ccore
and self.
__metric.get_type() != type_metric.USER_DEFINED
88 self.
__ccore = ccore_library.workable()
95 @brief Performs cluster analysis in line with rules of K-Medians algorithm.
97 @return (kmedians) Returns itself (K-Medians instance).
99 @remark Results of clustering can be obtained using corresponding get methods.
107 ccore_metric = metric_wrapper.create_instance(self.
__metric)
111 changes = float(
'inf')
115 raise NameError(
'Dimension of the input data and dimension of the initial medians must be equal.')
122 changes = max([self.
__metric(self.
__medians[index], updated_centers[index])
for index
in range(len(updated_centers))])
135 @brief Calculates the closest cluster to each point.
137 @param[in] points (array_like): Points for which closest clusters are calculated.
139 @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
140 collection if 'process()' method was not called.
142 An example how to calculate (or predict) the closest cluster to specified points.
144 from pyclustering.cluster.kmedians import kmedians
145 from pyclustering.samples.definitions import SIMPLE_SAMPLES
146 from pyclustering.utils import read_sample
148 # Load list of points for cluster analysis.
149 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
151 # Initial centers for sample 'Simple3'.
152 initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
154 # Create instance of K-Medians algorithm with prepared centers.
155 kmedians_instance = kmedians(sample, initial_medians)
157 # Run cluster analysis.
158 kmedians_instance.process()
160 # Calculate the closest cluster to following two points.
161 points = [[0.25, 0.2], [2.5, 4.0]]
162 closest_clusters = kmedians_instance.predict(points)
163 print(closest_clusters)
171 differences = numpy.zeros((len(points), len(self.
__medians)))
172 for index_point
in range(len(points)):
173 differences[index_point] = [ self.
__metric(points[index_point], center)
for center
in self.
__medians ]
175 return numpy.argmin(differences, axis=1)
180 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
192 @brief Returns list of centers of allocated clusters.
207 @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
208 @details Sum of metric errors is calculated using distance between point and its center:
209 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
221 @brief Returns clustering result representation type that indicate how clusters are encoded.
223 @return (type_encoding) Clustering result representation.
229 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
232 def __update_clusters(self):
234 @brief Calculate Manhattan distance to each point from the each cluster.
235 @details Nearest points are captured by according clusters and as a result clusters are updated.
237 @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
241 clusters = [[]
for i
in range(len(self.
__medians))]
249 if (dist < dist_optim)
or (index == 0):
253 clusters[index_optim].append(index_point)
256 clusters = [cluster
for cluster
in clusters
if len(cluster) > 0]
261 def __calculate_total_wce(self):
263 @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm.
270 for index_cluster
in range(len(self.
__clusters)):
271 for index_point
in self.
__clusters[index_cluster]:
272 self.
__total_wce += dataset_differences[index_cluster][index_point]
275 def __calculate_dataset_difference(self, amount_clusters):
277 @brief Calculate distance from each point to each cluster center.
281 dataset_differences = numpy.zeros((amount_clusters, len(self.
__pointer_data)))
282 for index_center
in range(amount_clusters):
283 if self.
__metric.get_type() != type_metric.USER_DEFINED:
286 dataset_differences[index_center] = [self.
__metric(point, self.
__medians[index_center])
289 return dataset_differences
292 def __update_medians(self):
294 @brief Calculate medians of clusters in line with contained objects.
296 @return (list) list of medians for current number of clusters.
300 medians = [[]
for i
in range(len(self.
__clusters))]
303 medians[index] = [0.0
for i
in range(len(self.
__pointer_data[0]))]
309 relative_index_median = int(math.floor((length_cluster - 1) / 2))
310 index_median = sorted_cluster[relative_index_median]
312 if (length_cluster % 2) == 0:
313 index_median_second = sorted_cluster[relative_index_median + 1]
314 medians[index][index_dimension] = (self.
__pointer_data[index_median][index_dimension] + self.
__pointer_data[index_median_second][index_dimension]) / 2.0
317 medians[index][index_dimension] = self.
__pointer_data[index_median][index_dimension]
322 def __verify_arguments(self):
324 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
328 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
331 raise TypeError(
"Input data element does not have attribute '__getitem__'.")
334 raise ValueError(
"Initial medians are empty (size: '%d')." % len(self.
__medians))
336 if not hasattr(self.
__medians[0],
"__getitem__"):
337 raise TypeError(
"Initial medians element does not have attribute '__getitem__'.")
340 raise ValueError(
"Tolerance (current value: '%d') should be greater or equal to 0." %
344 raise ValueError(
"Maximum iterations (current value: '%d') should be greater or equal to 0." %