3 @brief Cluster analysis algorithm: K-Medians 4 @details Implementation based on paper @cite book::algorithms_for_clustering_data. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 35 import pyclustering.core.kmedians_wrapper
as wrapper
37 from pyclustering.core.wrapper
import ccore_library
38 from pyclustering.core.metric_wrapper
import metric_wrapper
43 @brief Class represents clustering algorithm K-Medians. 44 @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids. 48 from pyclustering.cluster.kmedians import kmedians 49 from pyclustering.cluster import cluster_visualizer 50 from pyclustering.utils import read_sample 51 from pyclustering.samples.definitions import FCPS_SAMPLES 53 # Load list of points for cluster analysis. 54 sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS) 56 # Create instance of K-Medians algorithm. 57 initial_medians = [[0.0, 0.1], [2.5, 0.7]] 58 kmedians_instance = kmedians(sample, initial_medians) 60 # Run cluster analysis and obtain results. 61 kmedians_instance.process() 62 clusters = kmedians_instance.get_clusters() 63 medians = kmedians_instance.get_medians() 65 # Visualize clustering results. 66 visualizer = cluster_visualizer() 67 visualizer.append_clusters(clusters, sample) 68 visualizer.append_cluster(initial_medians, marker='*', markersize=10) 69 visualizer.append_cluster(medians, marker='*', markersize=10) 75 def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, **kwargs):
77 @brief Constructor of clustering algorithm K-Medians. 79 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. 80 @param[in] initial_medians (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...]. 81 @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing 82 @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not. 83 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax'). 85 <b>Keyword Args:</b><br> 86 - metric (distance_metric): Metric that is used for distance calculation between two points. 87 - itermax (uint): Maximum number of iterations for cluster analysis. 92 self.
__medians = numpy.array(initial_medians)
96 self.
__itermax = kwargs.get(
'itermax', 100)
101 self.
__ccore = ccore
and self.
__metric.get_type() != type_metric.USER_DEFINED
103 self.
__ccore = ccore_library.workable()
110 @brief Performs cluster analysis in line with rules of K-Medians algorithm. 112 @return (kmedians) Returns itself (K-Medians instance). 114 @remark Results of clustering can be obtained using corresponding get methods. 122 ccore_metric = metric_wrapper.create_instance(self.
__metric)
126 changes = float(
'inf')
130 raise NameError(
'Dimension of the input data and dimension of the initial medians must be equal.')
137 changes = max([self.
__metric(self.
__medians[index], updated_centers[index])
for index
in range(len(updated_centers))])
150 @brief Calculates the closest cluster to each point. 152 @param[in] points (array_like): Points for which closest clusters are calculated. 154 @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty 155 collection if 'process()' method was not called. 157 An example how to calculate (or predict) the closest cluster to specified points. 159 from pyclustering.cluster.kmedians import kmedians 160 from pyclustering.samples.definitions import SIMPLE_SAMPLES 161 from pyclustering.utils import read_sample 163 # Load list of points for cluster analysis. 164 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3) 166 # Initial centers for sample 'Simple3'. 167 initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]] 169 # Create instance of K-Medians algorithm with prepared centers. 170 kmedians_instance = kmedians(sample, initial_medians) 172 # Run cluster analysis. 173 kmedians_instance.process() 175 # Calculate the closest cluster to following two points. 176 points = [[0.25, 0.2], [2.5, 4.0]] 177 closest_clusters = kmedians_instance.predict(points) 178 print(closest_clusters) 186 differences = numpy.zeros((len(points), len(self.
__medians)))
187 for index_point
in range(len(points)):
188 differences[index_point] = [ self.
__metric(points[index_point], center)
for center
in self.
__medians ]
190 return numpy.argmin(differences, axis=1)
195 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 207 @brief Returns list of centers of allocated clusters. 222 @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors). 223 @details Sum of metric errors is calculated using distance between point and its center: 224 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f] 236 @brief Returns clustering result representation type that indicate how clusters are encoded. 238 @return (type_encoding) Clustering result representation. 244 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
247 def __update_clusters(self):
249 @brief Calculate Manhattan distance to each point from the each cluster. 250 @details Nearest points are captured by according clusters and as a result clusters are updated. 252 @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data. 256 clusters = [[]
for i
in range(len(self.
__medians))]
264 if (dist < dist_optim)
or (index == 0):
268 clusters[index_optim].append(index_point)
271 clusters = [cluster
for cluster
in clusters
if len(cluster) > 0]
276 def __calculate_total_wce(self):
278 @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm. 285 for index_cluster
in range(len(self.
__clusters)):
286 for index_point
in self.
__clusters[index_cluster]:
287 self.
__total_wce += dataset_differences[index_cluster][index_point]
290 def __calculate_dataset_difference(self, amount_clusters):
292 @brief Calculate distance from each point to each cluster center. 296 dataset_differences = numpy.zeros((amount_clusters, len(self.
__pointer_data)))
297 for index_center
in range(amount_clusters):
298 if self.
__metric.get_type() != type_metric.USER_DEFINED:
301 dataset_differences[index_center] = [self.
__metric(point, self.
__medians[index_center])
304 return dataset_differences
307 def __update_medians(self):
309 @brief Calculate medians of clusters in line with contained objects. 311 @return (list) list of medians for current number of clusters. 315 medians = [[]
for i
in range(len(self.
__clusters))]
318 medians[index] = [0.0
for i
in range(len(self.
__pointer_data[0]))]
324 relative_index_median = int(math.floor((length_cluster - 1) / 2))
325 index_median = sorted_cluster[relative_index_median]
327 if (length_cluster % 2) == 0:
328 index_median_second = sorted_cluster[relative_index_median + 1]
329 medians[index][index_dimension] = (self.
__pointer_data[index_median][index_dimension] + self.
__pointer_data[index_median_second][index_dimension]) / 2.0
332 medians[index][index_dimension] = self.
__pointer_data[index_median][index_dimension]
337 def __verify_arguments(self):
339 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness. 343 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__pointer_data))
346 raise TypeError(
"Input data element does not have attribute '__getitem__'.")
349 raise ValueError(
"Initial medians are empty (size: '%d')." % len(self.
__medians))
351 if not hasattr(self.
__medians[0],
"__getitem__"):
352 raise TypeError(
"Initial medians element does not have attribute '__getitem__'.")
355 raise ValueError(
"Tolerance (current value: '%d') should be greater or equal to 0." %
359 raise ValueError(
"Maximum iterations (current value: '%d') should be greater or equal to 0." %
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Module for representing clustering results.
Distance metric performs distance calculation between two points in line with encapsulated function...