kmedians.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import math
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils.metric import distance_metric, type_metric
33 
34 import pyclustering.core.kmedians_wrapper as wrapper
35 
36 from pyclustering.core.wrapper import ccore_library
37 from pyclustering.core.metric_wrapper import metric_wrapper
38 
39 
40 class kmedians:
41  """!
42  @brief Class represents clustering algorithm K-Medians.
43  @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
44 
45  CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
46 
47  Example:
48  @code
49  # load list of points for cluster analysis
50  sample = read_sample(path);
51 
52  # create instance of K-Medians algorithm
53  kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
54 
55  # run cluster analysis and obtain results
56  kmedians_instance.process();
57  kmedians_instance.get_clusters();
58  @endcode
59 
60  """
61 
62  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
63  """!
64  @brief Constructor of clustering algorithm K-Medians.
65 
66  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
67  @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
68  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
69  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
70  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
71 
72  <b>Keyword Args:</b><br>
73  - metric (distance_metric): Metric that is used for distance calculation between two points.
74 
75  """
76  self.__pointer_data = data
77  self.__clusters = []
78  self.__medians = initial_centers[:]
79  self.__tolerance = tolerance
80 
81  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
82  if self.__metric is None:
83  self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
84 
85  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
86  if self.__ccore:
87  self.__ccore = ccore_library.workable()
88 
89 
90  def process(self):
91  """!
92  @brief Performs cluster analysis in line with rules of K-Medians algorithm.
93 
94  @return (kmedians) Returns itself (K-Medians instance).
95 
96  @remark Results of clustering can be obtained using corresponding get methods.
97 
98  @see get_clusters()
99  @see get_medians()
100 
101  """
102 
103  if self.__ccore is True:
104  ccore_metric = metric_wrapper.create_instance(self.__metric)
105  self.__clusters, self.__medians = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, ccore_metric.get_pointer())
106 
107  else:
108  changes = float('inf')
109 
110  # Check for dimension
111  if len(self.__pointer_data[0]) != len(self.__medians[0]):
112  raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
113 
114  while changes > self.__tolerance:
115  self.__clusters = self.__update_clusters()
116  updated_centers = self.__update_medians()
117 
118  changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
119 
120  self.__medians = updated_centers
121 
122  return self
123 
124 
125  def get_clusters(self):
126  """!
127  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
128 
129  @see process()
130  @see get_medians()
131 
132  """
133 
134  return self.__clusters
135 
136 
137  def get_medians(self):
138  """!
139  @brief Returns list of centers of allocated clusters.
140 
141  @see process()
142  @see get_clusters()
143 
144  """
145 
146  return self.__medians
147 
148 
150  """!
151  @brief Returns clustering result representation type that indicate how clusters are encoded.
152 
153  @return (type_encoding) Clustering result representation.
154 
155  @see get_clusters()
156 
157  """
158 
159  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
160 
161 
162  def __update_clusters(self):
163  """!
164  @brief Calculate Manhattan distance to each point from the each cluster.
165  @details Nearest points are captured by according clusters and as a result clusters are updated.
166 
167  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
168 
169  """
170 
171  clusters = [[] for i in range(len(self.__medians))]
172  for index_point in range(len(self.__pointer_data)):
173  index_optim = -1
174  dist_optim = 0.0
175 
176  for index in range(len(self.__medians)):
177  dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
178 
179  if (dist < dist_optim) or (index is 0):
180  index_optim = index
181  dist_optim = dist
182 
183  clusters[index_optim].append(index_point)
184 
185  # If cluster is not able to capture object it should be removed
186  clusters = [cluster for cluster in clusters if len(cluster) > 0]
187 
188  return clusters
189 
190 
191  def __update_medians(self):
192  """!
193  @brief Calculate medians of clusters in line with contained objects.
194 
195  @return (list) list of medians for current number of clusters.
196 
197  """
198 
199  medians = [[] for i in range(len(self.__clusters))]
200 
201  for index in range(len(self.__clusters)):
202  medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))]
203  length_cluster = len(self.__clusters[index])
204 
205  for index_dimension in range(len(self.__pointer_data[0])):
206  sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
207 
208  relative_index_median = int(math.floor((length_cluster - 1) / 2))
209  index_median = sorted_cluster[relative_index_median]
210 
211  if (length_cluster % 2) == 0:
212  index_median_second = sorted_cluster[relative_index_median + 1]
213  medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
214 
215  else:
216  medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
217 
218  return medians
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:58
def __update_clusters(self)
Calculate Manhattan distance to each point from the each cluster.
Definition: kmedians.py:162
def process(self)
Performs cluster analysis in line with rules of K-Medians algorithm.
Definition: kmedians.py:90
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedians.py:149
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medians.
Definition: kmedians.py:62
Class represents clustering algorithm K-Medians.
Definition: kmedians.py:40
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedians.py:125
def get_medians(self)
Returns list of centers of allocated clusters.
Definition: kmedians.py:137
def __update_medians(self)
Calculate medians of clusters in line with contained objects.
Definition: kmedians.py:191