3 @brief Cluster analysis algorithm: K-Medians 4 @details Implementation based on paper @cite book::algorithms_for_clustering_data. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 35 import pyclustering.core.kmedians_wrapper
as wrapper
37 from pyclustering.core.wrapper
import ccore_library
38 from pyclustering.core.metric_wrapper
import metric_wrapper
43 @brief Class represents clustering algorithm K-Medians. 44 @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids. 48 from pyclustering.cluster.kmedians import kmedians 49 from pyclustering.cluster import cluster_visualizer 50 from pyclustering.utils import read_sample 51 from pyclustering.samples.definitions import FCPS_SAMPLES 53 # Load list of points for cluster analysis. 54 sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS) 56 # Create instance of K-Medians algorithm. 57 initial_medians = [[0.0, 0.1], [2.5, 0.7]] 58 kmedians_instance = kmedians(sample, initial_medians) 60 # Run cluster analysis and obtain results. 61 kmedians_instance.process() 62 clusters = kmedians_instance.get_clusters() 63 medians = kmedians_instance.get_medians() 65 # Visualize clustering results. 66 visualizer = cluster_visualizer() 67 visualizer.append_clusters(clusters, sample) 68 visualizer.append_cluster(initial_medians, marker='*', markersize=10) 69 visualizer.append_cluster(medians, marker='*', markersize=10) 75 def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
77 @brief Constructor of clustering algorithm K-Medians. 79 @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. 80 @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...]. 81 @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing 82 @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not. 83 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax'). 85 <b>Keyword Args:</b><br> 86 - metric (distance_metric): Metric that is used for distance calculation between two points. 87 - itermax (uint): Maximum number of iterations for cluster analysis. 92 self.
__medians = numpy.array(initial_centers)
96 self.
__itermax = kwargs.get(
'itermax', 100)
101 self.
__ccore = ccore
and self.
__metric.get_type() != type_metric.USER_DEFINED
103 self.
__ccore = ccore_library.workable()
108 @brief Performs cluster analysis in line with rules of K-Medians algorithm. 110 @return (kmedians) Returns itself (K-Medians instance). 112 @remark Results of clustering can be obtained using corresponding get methods. 120 ccore_metric = metric_wrapper.create_instance(self.
__metric)
124 changes = float(
'inf')
128 raise NameError(
'Dimension of the input data and dimension of the initial medians must be equal.')
135 changes = max([self.
__metric(self.
__medians[index], updated_centers[index])
for index
in range(len(updated_centers))])
148 @brief Calculates the closest cluster to each point. 150 @param[in] points (array_like): Points for which closest clusters are calculated. 152 @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty 153 collection if 'process()' method was not called. 155 An example how to calculate (or predict) the closest cluster to specified points. 157 from pyclustering.cluster.kmedians import kmedians 158 from pyclustering.samples.definitions import SIMPLE_SAMPLES 159 from pyclustering.utils import read_sample 161 # Load list of points for cluster analysis. 162 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3) 164 # Initial centers for sample 'Simple3'. 165 initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]] 167 # Create instance of K-Medians algorithm with prepared centers. 168 kmedians_instance = kmedians(sample, initial_medians) 170 # Run cluster analysis. 171 kmedians_instance.process() 173 # Calculate the closest cluster to following two points. 174 points = [[0.25, 0.2], [2.5, 4.0]] 175 closest_clusters = kmedians_instance.predict(points) 176 print(closest_clusters) 184 differences = numpy.zeros((len(points), len(self.
__medians)))
185 for index_point
in range(len(points)):
186 differences[index_point] = [ self.
__metric(points[index_point], center)
for center
in self.
__medians ]
188 return numpy.argmin(differences, axis=1)
193 @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. 205 @brief Returns list of centers of allocated clusters. 220 @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors). 221 @details Sum of metric errors is calculated using distance between point and its center: 222 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f] 234 @brief Returns clustering result representation type that indicate how clusters are encoded. 236 @return (type_encoding) Clustering result representation. 242 return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
245 def __update_clusters(self):
247 @brief Calculate Manhattan distance to each point from the each cluster. 248 @details Nearest points are captured by according clusters and as a result clusters are updated. 250 @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data. 254 clusters = [[]
for i
in range(len(self.
__medians))]
262 if (dist < dist_optim)
or (index == 0):
266 clusters[index_optim].append(index_point)
269 clusters = [cluster
for cluster
in clusters
if len(cluster) > 0]
274 def __calculate_total_wce(self):
276 @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm. 283 for index_cluster
in range(len(self.
__clusters)):
284 for index_point
in self.
__clusters[index_cluster]:
285 self.
__total_wce += dataset_differences[index_cluster][index_point]
288 def __calculate_dataset_difference(self, amount_clusters):
290 @brief Calculate distance from each point to each cluster center. 294 dataset_differences = numpy.zeros((amount_clusters, len(self.
__pointer_data)))
295 for index_center
in range(amount_clusters):
296 if self.
__metric.get_type() != type_metric.USER_DEFINED:
299 dataset_differences[index_center] = [self.
__metric(point, self.
__medians[index_center])
302 return dataset_differences
305 def __update_medians(self):
307 @brief Calculate medians of clusters in line with contained objects. 309 @return (list) list of medians for current number of clusters. 313 medians = [[]
for i
in range(len(self.
__clusters))]
316 medians[index] = [0.0
for i
in range(len(self.
__pointer_data[0]))]
322 relative_index_median = int(math.floor((length_cluster - 1) / 2))
323 index_median = sorted_cluster[relative_index_median]
325 if (length_cluster % 2) == 0:
326 index_median_second = sorted_cluster[relative_index_median + 1]
327 medians[index][index_dimension] = (self.
__pointer_data[index_median][index_dimension] + self.
__pointer_data[index_median_second][index_dimension]) / 2.0
330 medians[index][index_dimension] = self.
__pointer_data[index_median][index_dimension]
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Module for representing clustering results.
Distance metric performs distance calculation between two points in line with encapsulated function...