pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
kmedians.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 import math
14 import numpy
15 
16 from pyclustering.cluster.encoder import type_encoding
17 
18 from pyclustering.utils.metric import distance_metric, type_metric
19 
20 import pyclustering.core.kmedians_wrapper as wrapper
21 
22 from pyclustering.core.wrapper import ccore_library
23 from pyclustering.core.metric_wrapper import metric_wrapper
24 
25 
26 class kmedians:
27  """!
28  @brief Class represents clustering algorithm K-Medians.
29  @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
30 
31  Example:
32  @code
33  from pyclustering.cluster.kmedians import kmedians
34  from pyclustering.cluster import cluster_visualizer
35  from pyclustering.utils import read_sample
36  from pyclustering.samples.definitions import FCPS_SAMPLES
37 
38  # Load list of points for cluster analysis.
39  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
40 
41  # Create instance of K-Medians algorithm.
42  initial_medians = [[0.0, 0.1], [2.5, 0.7]]
43  kmedians_instance = kmedians(sample, initial_medians)
44 
45  # Run cluster analysis and obtain results.
46  kmedians_instance.process()
47  clusters = kmedians_instance.get_clusters()
48  medians = kmedians_instance.get_medians()
49 
50  # Visualize clustering results.
51  visualizer = cluster_visualizer()
52  visualizer.append_clusters(clusters, sample)
53  visualizer.append_cluster(initial_medians, marker='*', markersize=10)
54  visualizer.append_cluster(medians, marker='*', markersize=10)
55  visualizer.show()
56  @endcode
57 
58  """
59 
60  def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, **kwargs):
61  """!
62  @brief Constructor of clustering algorithm K-Medians.
63 
64  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
65  @param[in] initial_medians (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
66  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
67  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
68  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').
69 
70  <b>Keyword Args:</b><br>
71  - metric (distance_metric): Metric that is used for distance calculation between two points.
72  - itermax (uint): Maximum number of iterations for cluster analysis.
73 
74  """
75  self.__pointer_data = numpy.array(data)
76  self.__clusters = []
77  self.__medians = numpy.array(initial_medians)
78  self.__tolerance = tolerance
79  self.__total_wce = 0
80 
81  self.__itermax = kwargs.get('itermax', 100)
82  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
83  if self.__metric is None:
84  self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
85 
86  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
87  if self.__ccore:
88  self.__ccore = ccore_library.workable()
89 
90  self.__verify_arguments()
91 
92 
93  def process(self):
94  """!
95  @brief Performs cluster analysis in line with rules of K-Medians algorithm.
96 
97  @return (kmedians) Returns itself (K-Medians instance).
98 
99  @remark Results of clustering can be obtained using corresponding get methods.
100 
101  @see get_clusters()
102  @see get_medians()
103 
104  """
105 
106  if self.__ccore is True:
107  ccore_metric = metric_wrapper.create_instance(self.__metric)
108  self.__clusters, self.__medians = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, self.__itermax, ccore_metric.get_pointer())
109 
110  else:
111  changes = float('inf')
112 
113  # Check for dimension
114  if len(self.__pointer_data[0]) != len(self.__medians[0]):
115  raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
116 
117  iterations = 0
118  while changes > self.__tolerance and iterations < self.__itermax:
119  self.__clusters = self.__update_clusters()
120  updated_centers = self.__update_medians()
121 
122  changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
123 
124  self.__medians = updated_centers
125 
126  iterations += 1
127 
128  self.__calculate_total_wce()
129 
130  return self
131 
132 
133  def predict(self, points):
134  """!
135  @brief Calculates the closest cluster to each point.
136 
137  @param[in] points (array_like): Points for which closest clusters are calculated.
138 
139  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
140  collection if 'process()' method was not called.
141 
142  An example how to calculate (or predict) the closest cluster to specified points.
143  @code
144  from pyclustering.cluster.kmedians import kmedians
145  from pyclustering.samples.definitions import SIMPLE_SAMPLES
146  from pyclustering.utils import read_sample
147 
148  # Load list of points for cluster analysis.
149  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
150 
151  # Initial centers for sample 'Simple3'.
152  initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
153 
154  # Create instance of K-Medians algorithm with prepared centers.
155  kmedians_instance = kmedians(sample, initial_medians)
156 
157  # Run cluster analysis.
158  kmedians_instance.process()
159 
160  # Calculate the closest cluster to following two points.
161  points = [[0.25, 0.2], [2.5, 4.0]]
162  closest_clusters = kmedians_instance.predict(points)
163  print(closest_clusters)
164  @endcode
165 
166  """
167 
168  if len(self.__clusters) == 0:
169  return []
170 
171  differences = numpy.zeros((len(points), len(self.__medians)))
172  for index_point in range(len(points)):
173  differences[index_point] = [ self.__metric(points[index_point], center) for center in self.__medians ]
174 
175  return numpy.argmin(differences, axis=1)
176 
177 
178  def get_clusters(self):
179  """!
180  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
181 
182  @see process()
183  @see get_medians()
184 
185  """
186 
187  return self.__clusters
188 
189 
190  def get_medians(self):
191  """!
192  @brief Returns list of centers of allocated clusters.
193 
194  @see process()
195  @see get_clusters()
196 
197  """
198 
199  if isinstance(self.__medians, list):
200  return self.__medians
201 
202  return self.__medians.tolist()
203 
204 
205  def get_total_wce(self):
206  """!
207  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
208  @details Sum of metric errors is calculated using distance between point and its center:
209  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
210 
211  @see process()
212  @see get_clusters()
213 
214  """
215 
216  return self.__total_wce
217 
218 
220  """!
221  @brief Returns clustering result representation type that indicate how clusters are encoded.
222 
223  @return (type_encoding) Clustering result representation.
224 
225  @see get_clusters()
226 
227  """
228 
229  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
230 
231 
232  def __update_clusters(self):
233  """!
234  @brief Calculate Manhattan distance to each point from the each cluster.
235  @details Nearest points are captured by according clusters and as a result clusters are updated.
236 
237  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
238 
239  """
240 
241  clusters = [[] for i in range(len(self.__medians))]
242  for index_point in range(len(self.__pointer_data)):
243  index_optim = -1
244  dist_optim = 0.0
245 
246  for index in range(len(self.__medians)):
247  dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
248 
249  if (dist < dist_optim) or (index == 0):
250  index_optim = index
251  dist_optim = dist
252 
253  clusters[index_optim].append(index_point)
254 
255  # If cluster is not able to capture object it should be removed
256  clusters = [cluster for cluster in clusters if len(cluster) > 0]
257 
258  return clusters
259 
260 
261  def __calculate_total_wce(self):
262  """!
263  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm.
264 
265  """
266 
267  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
268 
269  self.__total_wce = 0
270  for index_cluster in range(len(self.__clusters)):
271  for index_point in self.__clusters[index_cluster]:
272  self.__total_wce += dataset_differences[index_cluster][index_point]
273 
274 
275  def __calculate_dataset_difference(self, amount_clusters):
276  """!
277  @brief Calculate distance from each point to each cluster center.
278 
279  """
280  self.__metric.enable_numpy_usage()
281  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
282  for index_center in range(amount_clusters):
283  if self.__metric.get_type() != type_metric.USER_DEFINED:
284  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__medians[index_center])
285  else:
286  dataset_differences[index_center] = [self.__metric(point, self.__medians[index_center])
287  for point in self.__pointer_data]
288  self.__metric.disable_numpy_usage()
289  return dataset_differences
290 
291 
292  def __update_medians(self):
293  """!
294  @brief Calculate medians of clusters in line with contained objects.
295 
296  @return (list) list of medians for current number of clusters.
297 
298  """
299 
300  medians = [[] for i in range(len(self.__clusters))]
301 
302  for index in range(len(self.__clusters)):
303  medians[index] = [0.0 for i in range(len(self.__pointer_data[0]))]
304  length_cluster = len(self.__clusters[index])
305 
306  for index_dimension in range(len(self.__pointer_data[0])):
307  sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
308 
309  relative_index_median = int(math.floor((length_cluster - 1) / 2))
310  index_median = sorted_cluster[relative_index_median]
311 
312  if (length_cluster % 2) == 0:
313  index_median_second = sorted_cluster[relative_index_median + 1]
314  medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
315 
316  else:
317  medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
318 
319  return medians
320 
321 
322  def __verify_arguments(self):
323  """!
324  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
325 
326  """
327  if len(self.__pointer_data) == 0:
328  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
329 
330  if not hasattr(self.__pointer_data[0], "__getitem__"):
331  raise TypeError("Input data element does not have attribute '__getitem__'.")
332 
333  if len(self.__medians) == 0:
334  raise ValueError("Initial medians are empty (size: '%d')." % len(self.__medians))
335 
336  if not hasattr(self.__medians[0], "__getitem__"):
337  raise TypeError("Initial medians element does not have attribute '__getitem__'.")
338 
339  if self.__tolerance < 0:
340  raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
341  self.__tolerance)
342 
343  if self.__itermax < 0:
344  raise ValueError("Maximum iterations (current value: '%d') should be greater or equal to 0." %
345  self.__itermax)
pyclustering.cluster.kmedians.kmedians.get_medians
def get_medians(self)
Returns list of centers of allocated clusters.
Definition: kmedians.py:190
pyclustering.cluster.kmedians.kmedians.__pointer_data
__pointer_data
Definition: kmedians.py:75
pyclustering.cluster.kmedians.kmedians.predict
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmedians.py:133
pyclustering.cluster.kmedians.kmedians.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: kmedians.py:322
pyclustering.cluster.kmedians.kmedians.__medians
__medians
Definition: kmedians.py:77
pyclustering.cluster.kmedians.kmedians
Class represents clustering algorithm K-Medians.
Definition: kmedians.py:26
pyclustering.cluster.kmedians.kmedians.__total_wce
__total_wce
Definition: kmedians.py:79
pyclustering.utils.metric.distance_metric
Distance metric performs distance calculation between two points in line with encapsulated function,...
Definition: metric.py:52
pyclustering.cluster.kmedians.kmedians.__update_clusters
def __update_clusters(self)
Calculate Manhattan distance to each point from the each cluster.
Definition: kmedians.py:232
pyclustering.cluster.kmedians.kmedians.__calculate_dataset_difference
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmedians.py:275
pyclustering.cluster.kmedians.kmedians.__metric
__metric
Definition: kmedians.py:82
pyclustering.cluster.kmedians.kmedians.__clusters
__clusters
Definition: kmedians.py:76
pyclustering.cluster.kmedians.kmedians.__init__
def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, **kwargs)
Constructor of clustering algorithm K-Medians.
Definition: kmedians.py:60
pyclustering.cluster.kmedians.kmedians.process
def process(self)
Performs cluster analysis in line with rules of K-Medians algorithm.
Definition: kmedians.py:93
pyclustering.cluster.kmedians.kmedians.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedians.py:219
pyclustering.cluster.kmedians.kmedians.get_total_wce
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmedians.py:205
pyclustering.cluster.kmedians.kmedians.__calculate_total_wce
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorith...
Definition: kmedians.py:261
pyclustering.cluster.kmedians.kmedians.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
Definition: kmedians.py:178
pyclustering.cluster.kmedians.kmedians.__update_medians
def __update_medians(self)
Calculate medians of clusters in line with contained objects.
Definition: kmedians.py:292
pyclustering.cluster.kmedians.kmedians.__tolerance
__tolerance
Definition: kmedians.py:78
pyclustering.cluster.kmedians.kmedians.__itermax
__itermax
Definition: kmedians.py:81
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.utils.metric
Module provides various distance metrics - abstraction of the notion of distance in a metric space.
Definition: metric.py:1
pyclustering.cluster.kmedians.kmedians.__ccore
__ccore
Definition: kmedians.py:86