kmedoids.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medoids.
4 @details Implementation based on papers @cite book::algorithms_for_clustering_data, @cite book::finding_groups_in_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import medoid
33 from pyclustering.utils.metric import distance_metric, type_metric
34 
35 import pyclustering.core.kmedoids_wrapper as wrapper
36 
37 from pyclustering.core.wrapper import ccore_library
38 from pyclustering.core.metric_wrapper import metric_wrapper
39 
40 
41 class kmedoids:
42  """!
43  @brief Class represents clustering algorithm K-Medoids.
44  @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
45  K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
46  input data space).
47 
48  Clustering example:
49  @code
50  from pyclustering.cluster.kmedoids import kmedoids
51  from pyclustering.cluster import cluster_visualizer
52  from pyclustering.utils import read_sample
53  from pyclustering.samples.definitions import FCPS_SAMPLES
54 
55  # Load list of points for cluster analysis.
56  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
57 
58  # Set random initial medoids.
59  initial_medoids = [1, 500]
60 
61  # Create instance of K-Medoids algorithm.
62  kmedoids_instance = kmedoids(sample, initial_medoids)
63 
64  # Run cluster analysis and obtain results.
65  kmedoids_instance.process()
66  clusters = kmedoids_instance.get_clusters()
67 
68  # Show allocated clusters.
69  print(clusters)
70 
71  # Display clusters.
72  visualizer = cluster_visualizer()
73  visualizer.append_clusters(clusters, sample)
74  visualizer.show()
75  @endcode
76 
77  Metric for calculation distance between points can be specified by parameter additional 'metric':
78  @code
79  # create Minkowski distance metric with degree equals to '2'
80  metric = distance_metric(type_metric.MINKOWSKI, degree=2)
81 
82  # create K-Medoids algorithm with specific distance metric
83  kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric)
84 
85  # run cluster analysis and obtain results
86  kmedoids_instance.process()
87  clusters = kmedoids_instance.get_clusters()
88  @endcode
89 
90  Distance matrix can be used instead of sequence of points to increase performance and for that purpose parameter 'data_type' should be used:
91  @code
92  # calculate distance matrix for sample
93  sample = read_sample(path_to_sample)
94  matrix = calculate_distance_matrix(sample)
95 
96  # create K-Medoids algorithm for processing distance matrix instead of points
97  kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
98 
99  # run cluster analysis and obtain results
100  kmedoids_instance.process()
101 
102  clusters = kmedoids_instance.get_clusters()
103  medoids = kmedoids_instance.get_medoids()
104  @endcode
105 
106  """
107 
108 
109  def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
110  """!
111  @brief Constructor of clustering algorithm K-Medoids.
112 
113  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
114  @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
115  @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
116  @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
117  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
118 
119  <b>Keyword Args:</b><br>
120  - metric (distance_metric): Metric that is used for distance calculation between two points.
121  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
122  - itermax (uint): Maximum number of iteration for cluster analysis.
123 
124  """
125  self.__pointer_data = data
126  self.__clusters = []
127  self.__medoid_indexes = initial_index_medoids
128  self.__tolerance = tolerance
129 
130  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
131  self.__data_type = kwargs.get('data_type', 'points')
132  self.__itermax = kwargs.get('itermax', 200)
133 
135 
136  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
137  if self.__ccore:
138  self.__ccore = ccore_library.workable()
139 
140  self.__verify_arguments()
141 
142 
143  def process(self):
144  """!
145  @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
146 
147  @return (kmedoids) Returns itself (K-Medoids instance).
148 
149  @remark Results of clustering can be obtained using corresponding get methods.
150 
151  @see get_clusters()
152  @see get_medoids()
153 
154  """
155 
156  if self.__ccore is True:
157  ccore_metric = metric_wrapper.create_instance(self.__metric)
158  self.__clusters, self.__medoid_indexes = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)
159 
160  else:
161  changes = float('inf')
162  iterations = 0
163 
164  while changes > self.__tolerance and iterations < self.__itermax:
165  self.__clusters = self.__update_clusters()
166  update_medoid_indexes = self.__update_medoids()
167 
168  changes = max([self.__distance_calculator(self.__medoid_indexes[index], update_medoid_indexes[index]) for index in range(len(update_medoid_indexes))])
169 
170  self.__medoid_indexes = update_medoid_indexes
171 
172  iterations += 1
173 
174  return self
175 
176 
177  def predict(self, points):
178  """!
179  @brief Calculates the closest cluster to each point.
180 
181  @param[in] points (array_like): Points for which closest clusters are calculated.
182 
183  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
184  collection if 'process()' method was not called.
185 
186  An example how to calculate (or predict) the closest cluster to specified points.
187  @code
188  from pyclustering.cluster.kmedoids import kmedoids
189  from pyclustering.samples.definitions import SIMPLE_SAMPLES
190  from pyclustering.utils import read_sample
191 
192  # Load list of points for cluster analysis.
193  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
194 
195  # Initial medoids for sample 'Simple3'.
196  initial_medoids = [4, 12, 25, 37]
197 
198  # Create instance of K-Medoids algorithm with prepared centers.
199  kmedoids_instance = kmedoids(sample, initial_medoids)
200 
201  # Run cluster analysis.
202  kmedoids_instance.process()
203 
204  # Calculate the closest cluster to following two points.
205  points = [[0.35, 0.5], [2.5, 2.0]]
206  closest_clusters = kmedoids_instance.predict(points)
207  print(closest_clusters)
208  @endcode
209 
210  """
211 
212  if len(self.__clusters) == 0:
213  return []
214 
215  medoids = [ self.__pointer_data[index] for index in self.__medoid_indexes ]
216  differences = numpy.zeros((len(points), len(medoids)))
217  for index_point in range(len(points)):
218  differences[index_point] = [ self.__metric(points[index_point], center) for center in medoids ]
219 
220  return numpy.argmin(differences, axis=1)
221 
222 
223  def get_clusters(self):
224  """!
225  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
226 
227  @see process()
228  @see get_medoids()
229 
230  """
231 
232  return self.__clusters
233 
234 
235  def get_medoids(self):
236  """!
237  @brief Returns list of medoids of allocated clusters represented by indexes from the input data.
238 
239  @see process()
240  @see get_clusters()
241 
242  """
243 
244  return self.__medoid_indexes
245 
246 
248  """!
249  @brief Returns clustering result representation type that indicate how clusters are encoded.
250 
251  @return (type_encoding) Clustering result representation.
252 
253  @see get_clusters()
254 
255  """
256 
257  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
258 
259 
260  def __verify_arguments(self):
261  """!
262  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
263 
264  """
265  if len(self.__pointer_data) == 0:
266  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
267 
268  if len(self.__medoid_indexes) == 0:
269  raise ValueError("Initial medoids are empty (size: '%d')." % len(self.__pointer_data))
270 
271  if self.__tolerance < 0:
272  raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
273  self.__tolerance)
274 
275  if self.__itermax < 0:
276  raise ValueError("Maximum iterations (current value: '%d') should be greater or equal to 0." %
277  self.__tolerance)
278 
279 
280  def __create_distance_calculator(self):
281  """!
282  @brief Creates distance calculator in line with algorithms parameters.
283 
284  @return (callable) Distance calculator.
285 
286  """
287  if self.__data_type == 'points':
288  return lambda index1, index2: self.__metric(self.__pointer_data[index1], self.__pointer_data[index2])
289 
290  elif self.__data_type == 'distance_matrix':
291  if isinstance(self.__pointer_data, numpy.matrix):
292  return lambda index1, index2: self.__pointer_data.item((index1, index2))
293 
294  return lambda index1, index2: self.__pointer_data[index1][index2]
295 
296  else:
297  raise TypeError("Unknown type of data is specified '%s'" % self.__data_type)
298 
299 
300  def __update_clusters(self):
301  """!
302  @brief Calculate distance to each point from the each cluster.
303  @details Nearest points are captured by according clusters and as a result clusters are updated.
304 
305  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
306 
307  """
308 
309  clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoid_indexes))]
310  for index_point in range(len(self.__pointer_data)):
311  if index_point in self.__medoid_indexes:
312  continue
313 
314  index_optim = -1
315  dist_optim = float('Inf')
316 
317  for index in range(len(self.__medoid_indexes)):
318  dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])
319 
320  if dist < dist_optim:
321  index_optim = index
322  dist_optim = dist
323 
324  clusters[index_optim].append(index_point)
325 
326  return clusters
327 
328 
329  def __update_medoids(self):
330  """!
331  @brief Find medoids of clusters in line with contained objects.
332 
333  @return (list) list of medoids for current number of clusters.
334 
335  """
336 
337  medoid_indexes = [-1] * len(self.__clusters)
338 
339  for index in range(len(self.__clusters)):
340  medoid_index = medoid(self.__pointer_data, self.__clusters[index], metric=self.__metric, data_type=self.__data_type)
341  medoid_indexes[index] = medoid_index
342 
343  return medoid_indexes
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def __update_clusters(self)
Calculate distance to each point from the each cluster.
Definition: kmedoids.py:300
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __update_medoids(self)
Find medoids of clusters in line with contained objects.
Definition: kmedoids.py:329
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedoids.py:247
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medoids.
Definition: kmedoids.py:109
def __create_distance_calculator(self)
Creates distance calculator in line with algorithms parameters.
Definition: kmedoids.py:280
Class represents clustering algorithm K-Medoids.
Definition: kmedoids.py:41
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedoids.py:223
def process(self)
Performs cluster analysis in line with rules of K-Medoids algorithm.
Definition: kmedoids.py:143
def get_medoids(self)
Returns list of medoids of allocated clusters represented by indexes from the input data...
Definition: kmedoids.py:235
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: kmedoids.py:260
def predict(self, points)
Calculates the closest cluster to each point.
Definition: kmedoids.py:177