kmedians.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import math
29 import numpy
30 
31 from pyclustering.cluster.encoder import type_encoding
32 
33 from pyclustering.utils.metric import distance_metric, type_metric
34 
35 import pyclustering.core.kmedians_wrapper as wrapper
36 
37 from pyclustering.core.wrapper import ccore_library
38 from pyclustering.core.metric_wrapper import metric_wrapper
39 
40 
41 class kmedians:
42  """!
43  @brief Class represents clustering algorithm K-Medians.
44  @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
45 
46  Example:
47  @code
48  from pyclustering.cluster.kmedians import kmedians
49  from pyclustering.cluster import cluster_visualizer
50  from pyclustering.utils import read_sample
51  from pyclustering.samples.definitions import FCPS_SAMPLES
52 
53  # Load list of points for cluster analysis.
54  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
55 
56  # Create instance of K-Medians algorithm.
57  initial_medians = [[0.0, 0.1], [2.5, 0.7]]
58  kmedians_instance = kmedians(sample, initial_medians)
59 
60  # Run cluster analysis and obtain results.
61  kmedians_instance.process()
62  clusters = kmedians_instance.get_clusters()
63  medians = kmedians_instance.get_medians()
64 
65  # Visualize clustering results.
66  visualizer = cluster_visualizer()
67  visualizer.append_clusters(clusters, sample)
68  visualizer.append_cluster(initial_medians, marker='*', markersize=10)
69  visualizer.append_cluster(medians, marker='*', markersize=10)
70  visualizer.show()
71  @endcode
72 
73  """
74 
75  def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, **kwargs):
76  """!
77  @brief Constructor of clustering algorithm K-Medians.
78 
79  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
80  @param[in] initial_medians (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
81  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
82  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
83  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').
84 
85  <b>Keyword Args:</b><br>
86  - metric (distance_metric): Metric that is used for distance calculation between two points.
87  - itermax (uint): Maximum number of iterations for cluster analysis.
88 
89  """
90  self.__pointer_data = numpy.array(data)
91  self.__clusters = []
92  self.__medians = numpy.array(initial_medians)
93  self.__tolerance = tolerance
94  self.__total_wce = 0
95 
96  self.__itermax = kwargs.get('itermax', 100)
97  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
98  if self.__metric is None:
99  self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
100 
101  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
102  if self.__ccore:
103  self.__ccore = ccore_library.workable()
104 
105  self.__verify_arguments()
106 
107 
108  def process(self):
109  """!
110  @brief Performs cluster analysis in line with rules of K-Medians algorithm.
111 
112  @return (kmedians) Returns itself (K-Medians instance).
113 
114  @remark Results of clustering can be obtained using corresponding get methods.
115 
116  @see get_clusters()
117  @see get_medians()
118 
119  """
120 
121  if self.__ccore is True:
122  ccore_metric = metric_wrapper.create_instance(self.__metric)
123  self.__clusters, self.__medians = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, self.__itermax, ccore_metric.get_pointer())
124 
125  else:
126  changes = float('inf')
127 
128  # Check for dimension
129  if len(self.__pointer_data[0]) != len(self.__medians[0]):
130  raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
131 
132  iterations = 0
133  while changes > self.__tolerance and iterations < self.__itermax:
134  self.__clusters = self.__update_clusters()
135  updated_centers = self.__update_medians()
136 
137  changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
138 
139  self.__medians = updated_centers
140 
141  iterations += 1
142 
143  self.__calculate_total_wce()
144 
145  return self
146 
147 
148  def predict(self, points):
149  """!
150  @brief Calculates the closest cluster to each point.
151 
152  @param[in] points (array_like): Points for which closest clusters are calculated.
153 
154  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
155  collection if 'process()' method was not called.
156 
157  An example how to calculate (or predict) the closest cluster to specified points.
158  @code
159  from pyclustering.cluster.kmedians import kmedians
160  from pyclustering.samples.definitions import SIMPLE_SAMPLES
161  from pyclustering.utils import read_sample
162 
163  # Load list of points for cluster analysis.
164  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
165 
166  # Initial centers for sample 'Simple3'.
167  initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
168 
169  # Create instance of K-Medians algorithm with prepared centers.
170  kmedians_instance = kmedians(sample, initial_medians)
171 
172  # Run cluster analysis.
173  kmedians_instance.process()
174 
175  # Calculate the closest cluster to following two points.
176  points = [[0.25, 0.2], [2.5, 4.0]]
177  closest_clusters = kmedians_instance.predict(points)
178  print(closest_clusters)
179  @endcode
180 
181  """
182 
183  if len(self.__clusters) == 0:
184  return []
185 
186  differences = numpy.zeros((len(points), len(self.__medians)))
187  for index_point in range(len(points)):
188  differences[index_point] = [ self.__metric(points[index_point], center) for center in self.__medians ]
189 
190  return numpy.argmin(differences, axis=1)
191 
192 
193  def get_clusters(self):
194  """!
195  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
196 
197  @see process()
198  @see get_medians()
199 
200  """
201 
202  return self.__clusters
203 
204 
205  def get_medians(self):
206  """!
207  @brief Returns list of centers of allocated clusters.
208 
209  @see process()
210  @see get_clusters()
211 
212  """
213 
214  if isinstance(self.__medians, list):
215  return self.__medians
216 
217  return self.__medians.tolist()
218 
219 
220  def get_total_wce(self):
221  """!
222  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
223  @details Sum of metric errors is calculated using distance between point and its center:
224  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
225 
226  @see process()
227  @see get_clusters()
228 
229  """
230 
231  return self.__total_wce
232 
233 
235  """!
236  @brief Returns clustering result representation type that indicate how clusters are encoded.
237 
238  @return (type_encoding) Clustering result representation.
239 
240  @see get_clusters()
241 
242  """
243 
244  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
245 
246 
247  def __update_clusters(self):
248  """!
249  @brief Calculate Manhattan distance to each point from the each cluster.
250  @details Nearest points are captured by according clusters and as a result clusters are updated.
251 
252  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
253 
254  """
255 
256  clusters = [[] for i in range(len(self.__medians))]
257  for index_point in range(len(self.__pointer_data)):
258  index_optim = -1
259  dist_optim = 0.0
260 
261  for index in range(len(self.__medians)):
262  dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
263 
264  if (dist < dist_optim) or (index == 0):
265  index_optim = index
266  dist_optim = dist
267 
268  clusters[index_optim].append(index_point)
269 
270  # If cluster is not able to capture object it should be removed
271  clusters = [cluster for cluster in clusters if len(cluster) > 0]
272 
273  return clusters
274 
275 
276  def __calculate_total_wce(self):
277  """!
278  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm.
279 
280  """
281 
282  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
283 
284  self.__total_wce = 0
285  for index_cluster in range(len(self.__clusters)):
286  for index_point in self.__clusters[index_cluster]:
287  self.__total_wce += dataset_differences[index_cluster][index_point]
288 
289 
290  def __calculate_dataset_difference(self, amount_clusters):
291  """!
292  @brief Calculate distance from each point to each cluster center.
293 
294  """
295  self.__metric.enable_numpy_usage()
296  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
297  for index_center in range(amount_clusters):
298  if self.__metric.get_type() != type_metric.USER_DEFINED:
299  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__medians[index_center])
300  else:
301  dataset_differences[index_center] = [self.__metric(point, self.__medians[index_center])
302  for point in self.__pointer_data]
303  self.__metric.disable_numpy_usage()
304  return dataset_differences
305 
306 
307  def __update_medians(self):
308  """!
309  @brief Calculate medians of clusters in line with contained objects.
310 
311  @return (list) list of medians for current number of clusters.
312 
313  """
314 
315  medians = [[] for i in range(len(self.__clusters))]
316 
317  for index in range(len(self.__clusters)):
318  medians[index] = [0.0 for i in range(len(self.__pointer_data[0]))]
319  length_cluster = len(self.__clusters[index])
320 
321  for index_dimension in range(len(self.__pointer_data[0])):
322  sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
323 
324  relative_index_median = int(math.floor((length_cluster - 1) / 2))
325  index_median = sorted_cluster[relative_index_median]
326 
327  if (length_cluster % 2) == 0:
328  index_median_second = sorted_cluster[relative_index_median + 1]
329  medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
330 
331  else:
332  medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
333 
334  return medians
335 
336 
337  def __verify_arguments(self):
338  """!
339  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
340 
341  """
342  if len(self.__pointer_data) == 0:
343  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
344 
345  if not hasattr(self.__pointer_data[0], "__getitem__"):
346  raise TypeError("Input data element does not have attribute '__getitem__'.")
347 
348  if len(self.__medians) == 0:
349  raise ValueError("Initial medians are empty (size: '%d')." % len(self.__medians))
350 
351  if not hasattr(self.__medians[0], "__getitem__"):
352  raise TypeError("Initial medians element does not have attribute '__getitem__'.")
353 
354  if self.__tolerance < 0:
355  raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
356  self.__tolerance)
357 
358  if self.__itermax < 0:
359  raise ValueError("Maximum iterations (current value: '%d') should be greater or equal to 0." %
360  self.__itermax)
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: kmedians.py:337
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmedians.py:148
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
def __update_clusters(self)
Calculate Manhattan distance to each point from the each cluster.
Definition: kmedians.py:247
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmedians.py:290
def process(self)
Performs cluster analysis in line with rules of K-Medians algorithm.
Definition: kmedians.py:108
def __init__(self, data, initial_medians, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medians.
Definition: kmedians.py:75
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedians.py:234
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorith...
Definition: kmedians.py:276
Class represents clustering algorithm K-Medians.
Definition: kmedians.py:41
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmedians.py:220
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedians.py:193
def get_medians(self)
Returns list of centers of allocated clusters.
Definition: kmedians.py:205
def __update_medians(self)
Calculate medians of clusters in line with contained objects.
Definition: kmedians.py:307