kmedians.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import math
29 import numpy
30 
31 from pyclustering.cluster.encoder import type_encoding
32 
33 from pyclustering.utils.metric import distance_metric, type_metric
34 
35 import pyclustering.core.kmedians_wrapper as wrapper
36 
37 from pyclustering.core.wrapper import ccore_library
38 from pyclustering.core.metric_wrapper import metric_wrapper
39 
40 
41 class kmedians:
42  """!
43  @brief Class represents clustering algorithm K-Medians.
44  @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
45 
46  Example:
47  @code
48  from pyclustering.cluster.kmedians import kmedians
49  from pyclustering.cluster import cluster_visualizer
50  from pyclustering.utils import read_sample
51  from pyclustering.samples.definitions import FCPS_SAMPLES
52 
53  # Load list of points for cluster analysis.
54  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
55 
56  # Create instance of K-Medians algorithm.
57  initial_medians = [[0.0, 0.1], [2.5, 0.7]]
58  kmedians_instance = kmedians(sample, initial_medians)
59 
60  # Run cluster analysis and obtain results.
61  kmedians_instance.process()
62  clusters = kmedians_instance.get_clusters()
63  medians = kmedians_instance.get_medians()
64 
65  # Visualize clustering results.
66  visualizer = cluster_visualizer()
67  visualizer.append_clusters(clusters, sample)
68  visualizer.append_cluster(initial_medians, marker='*', markersize=10)
69  visualizer.append_cluster(medians, marker='*', markersize=10)
70  visualizer.show()
71  @endcode
72 
73  """
74 
75  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
76  """!
77  @brief Constructor of clustering algorithm K-Medians.
78 
79  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
80  @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
81  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
82  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
83  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').
84 
85  <b>Keyword Args:</b><br>
86  - metric (distance_metric): Metric that is used for distance calculation between two points.
87  - itermax (uint): Maximum number of iterations for cluster analysis.
88 
89  """
90  self.__pointer_data = numpy.array(data)
91  self.__clusters = []
92  self.__medians = numpy.array(initial_centers)
93  self.__tolerance = tolerance
94  self.__total_wce = 0
95 
96  self.__itermax = kwargs.get('itermax', 100)
97  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
98  if self.__metric is None:
99  self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
100 
101  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
102  if self.__ccore:
103  self.__ccore = ccore_library.workable()
104 
105 
106  def process(self):
107  """!
108  @brief Performs cluster analysis in line with rules of K-Medians algorithm.
109 
110  @return (kmedians) Returns itself (K-Medians instance).
111 
112  @remark Results of clustering can be obtained using corresponding get methods.
113 
114  @see get_clusters()
115  @see get_medians()
116 
117  """
118 
119  if self.__ccore is True:
120  ccore_metric = metric_wrapper.create_instance(self.__metric)
121  self.__clusters, self.__medians = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, self.__itermax, ccore_metric.get_pointer())
122 
123  else:
124  changes = float('inf')
125 
126  # Check for dimension
127  if len(self.__pointer_data[0]) != len(self.__medians[0]):
128  raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
129 
130  iterations = 0
131  while changes > self.__tolerance and iterations < self.__itermax:
132  self.__clusters = self.__update_clusters()
133  updated_centers = self.__update_medians()
134 
135  changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
136 
137  self.__medians = updated_centers
138 
139  iterations += 1
140 
141  self.__calculate_total_wce()
142 
143  return self
144 
145 
146  def predict(self, points):
147  """!
148  @brief Calculates the closest cluster to each point.
149 
150  @param[in] points (array_like): Points for which closest clusters are calculated.
151 
152  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
153  collection if 'process()' method was not called.
154 
155  An example how to calculate (or predict) the closest cluster to specified points.
156  @code
157  from pyclustering.cluster.kmedians import kmedians
158  from pyclustering.samples.definitions import SIMPLE_SAMPLES
159  from pyclustering.utils import read_sample
160 
161  # Load list of points for cluster analysis.
162  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
163 
164  # Initial centers for sample 'Simple3'.
165  initial_medians = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
166 
167  # Create instance of K-Medians algorithm with prepared centers.
168  kmedians_instance = kmedians(sample, initial_medians)
169 
170  # Run cluster analysis.
171  kmedians_instance.process()
172 
173  # Calculate the closest cluster to following two points.
174  points = [[0.25, 0.2], [2.5, 4.0]]
175  closest_clusters = kmedians_instance.predict(points)
176  print(closest_clusters)
177  @endcode
178 
179  """
180 
181  if len(self.__clusters) == 0:
182  return []
183 
184  differences = numpy.zeros((len(points), len(self.__medians)))
185  for index_point in range(len(points)):
186  differences[index_point] = [ self.__metric(points[index_point], center) for center in self.__medians ]
187 
188  return numpy.argmin(differences, axis=1)
189 
190 
191  def get_clusters(self):
192  """!
193  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
194 
195  @see process()
196  @see get_medians()
197 
198  """
199 
200  return self.__clusters
201 
202 
203  def get_medians(self):
204  """!
205  @brief Returns list of centers of allocated clusters.
206 
207  @see process()
208  @see get_clusters()
209 
210  """
211 
212  if isinstance(self.__medians, list):
213  return self.__medians
214 
215  return self.__medians.tolist()
216 
217 
218  def get_total_wce(self):
219  """!
220  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
221  @details Sum of metric errors is calculated using distance between point and its center:
222  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
223 
224  @see process()
225  @see get_clusters()
226 
227  """
228 
229  return self.__total_wce
230 
231 
233  """!
234  @brief Returns clustering result representation type that indicate how clusters are encoded.
235 
236  @return (type_encoding) Clustering result representation.
237 
238  @see get_clusters()
239 
240  """
241 
242  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
243 
244 
245  def __update_clusters(self):
246  """!
247  @brief Calculate Manhattan distance to each point from the each cluster.
248  @details Nearest points are captured by according clusters and as a result clusters are updated.
249 
250  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
251 
252  """
253 
254  clusters = [[] for i in range(len(self.__medians))]
255  for index_point in range(len(self.__pointer_data)):
256  index_optim = -1
257  dist_optim = 0.0
258 
259  for index in range(len(self.__medians)):
260  dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
261 
262  if (dist < dist_optim) or (index == 0):
263  index_optim = index
264  dist_optim = dist
265 
266  clusters[index_optim].append(index_point)
267 
268  # If cluster is not able to capture object it should be removed
269  clusters = [cluster for cluster in clusters if len(cluster) > 0]
270 
271  return clusters
272 
273 
274  def __calculate_total_wce(self):
275  """!
276  @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorithm.
277 
278  """
279 
280  dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))
281 
282  self.__total_wce = 0
283  for index_cluster in range(len(self.__clusters)):
284  for index_point in self.__clusters[index_cluster]:
285  self.__total_wce += dataset_differences[index_cluster][index_point]
286 
287 
288  def __calculate_dataset_difference(self, amount_clusters):
289  """!
290  @brief Calculate distance from each point to each cluster center.
291 
292  """
293  self.__metric.enable_numpy_usage()
294  dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
295  for index_center in range(amount_clusters):
296  if self.__metric.get_type() != type_metric.USER_DEFINED:
297  dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__medians[index_center])
298  else:
299  dataset_differences[index_center] = [self.__metric(point, self.__medians[index_center])
300  for point in self.__pointer_data]
301  self.__metric.disable_numpy_usage()
302  return dataset_differences
303 
304 
305  def __update_medians(self):
306  """!
307  @brief Calculate medians of clusters in line with contained objects.
308 
309  @return (list) list of medians for current number of clusters.
310 
311  """
312 
313  medians = [[] for i in range(len(self.__clusters))]
314 
315  for index in range(len(self.__clusters)):
316  medians[index] = [0.0 for i in range(len(self.__pointer_data[0]))]
317  length_cluster = len(self.__clusters[index])
318 
319  for index_dimension in range(len(self.__pointer_data[0])):
320  sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
321 
322  relative_index_median = int(math.floor((length_cluster - 1) / 2))
323  index_median = sorted_cluster[relative_index_median]
324 
325  if (length_cluster % 2) == 0:
326  index_median_second = sorted_cluster[relative_index_median + 1]
327  medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
328 
329  else:
330  medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
331 
332  return medians
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmedians.py:146
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
def __update_clusters(self)
Calculate Manhattan distance to each point from the each cluster.
Definition: kmedians.py:245
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmedians.py:288
def process(self)
Performs cluster analysis in line with rules of K-Medians algorithm.
Definition: kmedians.py:106
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedians.py:232
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medians.
Definition: kmedians.py:75
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Medians algorith...
Definition: kmedians.py:274
Class represents clustering algorithm K-Medians.
Definition: kmedians.py:41
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmedians.py:218
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedians.py:191
def get_medians(self)
Returns list of centers of allocated clusters.
Definition: kmedians.py:203
def __update_medians(self)
Calculate medians of clusters in line with contained objects.
Definition: kmedians.py:305