kmedoids.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medoids.
4 @details Implementation based on papers @cite book::algorithms_for_clustering_data, @cite book::finding_groups_in_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import medoid
33 from pyclustering.utils.metric import distance_metric, type_metric
34 
35 import pyclustering.core.kmedoids_wrapper as wrapper
36 
37 from pyclustering.core.wrapper import ccore_library
38 from pyclustering.core.metric_wrapper import metric_wrapper
39 
40 
41 class kmedoids:
42  """!
43  @brief Class represents clustering algorithm K-Medoids.
44  @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
45  K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
46  input data space).
47 
48  Clustering example:
49  @code
50  from pyclustering.cluster.kmedoids import kmedoids
51  from pyclustering.cluster import cluster_visualizer
52  from pyclustering.utils import read_sample
53  from pyclustering.samples.definitions import FCPS_SAMPLES
54 
55  # Load list of points for cluster analysis.
56  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
57 
58  # Set random initial medoids.
59  initial_medoids = [1, 500]
60 
61  # Create instance of K-Medoids algorithm.
62  kmedoids_instance = kmedoids(sample, initial_medoids)
63 
64  # Run cluster analysis and obtain results.
65  kmedoids_instance.process()
66  clusters = kmedoids_instance.get_clusters()
67 
68  # Show allocated clusters.
69  print(clusters)
70 
71  # Display clusters.
72  visualizer = cluster_visualizer()
73  visualizer.append_clusters(clusters, sample)
74  visualizer.show()
75  @endcode
76 
77  Metric for calculation distance between points can be specified by parameter additional 'metric':
78  @code
79  # create Minkowski distance metric with degree equals to '2'
80  metric = distance_metric(type_metric.MINKOWSKI, degree=2)
81 
82  # create K-Medoids algorithm with specific distance metric
83  kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric)
84 
85  # run cluster analysis and obtain results
86  kmedoids_instance.process()
87  clusters = kmedoids_instance.get_clusters()
88  @endcode
89 
90  Distance matrix can be used instead of sequence of points to increase performance and for that purpose parameter 'data_type' should be used:
91  @code
92  # calculate distance matrix for sample
93  sample = read_sample(path_to_sample)
94  matrix = calculate_distance_matrix(sample)
95 
96  # create K-Medoids algorithm for processing distance matrix instead of points
97  kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
98 
99  # run cluster analysis and obtain results
100  kmedoids_instance.process()
101 
102  clusters = kmedoids_instance.get_clusters()
103  medoids = kmedoids_instance.get_medoids()
104  @endcode
105 
106  """
107 
108 
109  def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
110  """!
111  @brief Constructor of clustering algorithm K-Medoids.
112 
113  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
114  @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
115  @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
116  @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
117  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
118 
119  <b>Keyword Args:</b><br>
120  - metric (distance_metric): Metric that is used for distance calculation between two points.
121  - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
122  - itermax (uint): Maximum number of iteration for cluster analysis.
123 
124  """
125  self.__pointer_data = data
126  self.__clusters = []
127  self.__medoid_indexes = initial_index_medoids
128  self.__tolerance = tolerance
129 
130  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
131  self.__data_type = kwargs.get('data_type', 'points')
132  self.__itermax = kwargs.get('itermax', 200)
133 
135 
136  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
137  if self.__ccore:
138  self.__ccore = ccore_library.workable()
139 
140 
141  def process(self):
142  """!
143  @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
144 
145  @return (kmedoids) Returns itself (K-Medoids instance).
146 
147  @remark Results of clustering can be obtained using corresponding get methods.
148 
149  @see get_clusters()
150  @see get_medoids()
151 
152  """
153 
154  if self.__ccore is True:
155  ccore_metric = metric_wrapper.create_instance(self.__metric)
156  self.__clusters, self.__medoid_indexes = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)
157 
158  else:
159  changes = float('inf')
160  iterations = 0
161 
162  while changes > self.__tolerance and iterations < self.__itermax:
163  self.__clusters = self.__update_clusters()
164  update_medoid_indexes = self.__update_medoids()
165 
166  changes = max([self.__distance_calculator(self.__medoid_indexes[index], update_medoid_indexes[index]) for index in range(len(update_medoid_indexes))])
167 
168  self.__medoid_indexes = update_medoid_indexes
169 
170  iterations += 1
171 
172  return self
173 
174 
175  def get_clusters(self):
176  """!
177  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
178 
179  @see process()
180  @see get_medoids()
181 
182  """
183 
184  return self.__clusters
185 
186 
187  def get_medoids(self):
188  """!
189  @brief Returns list of medoids of allocated clusters represented by indexes from the input data.
190 
191  @see process()
192  @see get_clusters()
193 
194  """
195 
196  return self.__medoid_indexes
197 
198 
200  """!
201  @brief Returns clustering result representation type that indicate how clusters are encoded.
202 
203  @return (type_encoding) Clustering result representation.
204 
205  @see get_clusters()
206 
207  """
208 
209  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
210 
211 
212  def __create_distance_calculator(self):
213  """!
214  @brief Creates distance calculator in line with algorithms parameters.
215 
216  @return (callable) Distance calculator.
217 
218  """
219  if self.__data_type == 'points':
220  return lambda index1, index2: self.__metric(self.__pointer_data[index1], self.__pointer_data[index2])
221 
222  elif self.__data_type == 'distance_matrix':
223  if isinstance(self.__pointer_data, numpy.matrix):
224  return lambda index1, index2: self.__pointer_data.item((index1, index2))
225 
226  return lambda index1, index2: self.__pointer_data[index1][index2]
227 
228  else:
229  raise TypeError("Unknown type of data is specified '%s'" % self.__data_type)
230 
231 
232  def __update_clusters(self):
233  """!
234  @brief Calculate distance to each point from the each cluster.
235  @details Nearest points are captured by according clusters and as a result clusters are updated.
236 
237  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
238 
239  """
240 
241  clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoid_indexes))]
242  for index_point in range(len(self.__pointer_data)):
243  if index_point in self.__medoid_indexes:
244  continue
245 
246  index_optim = -1
247  dist_optim = float('Inf')
248 
249  for index in range(len(self.__medoid_indexes)):
250  dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])
251 
252  if dist < dist_optim:
253  index_optim = index
254  dist_optim = dist
255 
256  clusters[index_optim].append(index_point)
257 
258  return clusters
259 
260 
261  def __update_medoids(self):
262  """!
263  @brief Find medoids of clusters in line with contained objects.
264 
265  @return (list) list of medoids for current number of clusters.
266 
267  """
268 
269  medoid_indexes = [-1] * len(self.__clusters)
270 
271  for index in range(len(self.__clusters)):
272  medoid_index = medoid(self.__pointer_data, self.__clusters[index], metric=self.__metric, data_type=self.__data_type)
273  medoid_indexes[index] = medoid_index
274 
275  return medoid_indexes
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
def __update_clusters(self)
Calculate distance to each point from the each cluster.
Definition: kmedoids.py:232
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __update_medoids(self)
Find medoids of clusters in line with contained objects.
Definition: kmedoids.py:261
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedoids.py:199
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medoids.
Definition: kmedoids.py:109
def __create_distance_calculator(self)
Creates distance calculator in line with algorithms parameters.
Definition: kmedoids.py:212
Class represents clustering algorithm K-Medoids.
Definition: kmedoids.py:41
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedoids.py:175
def process(self)
Performs cluster analysis in line with rules of K-Medoids algorithm.
Definition: kmedoids.py:141
def get_medoids(self)
Returns list of medoids of allocated clusters represented by indexes from the input data...
Definition: kmedoids.py:187