kmedians.py
1 """!
2 
3 @brief Cluster analysis algorithm: K-Medians
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import math
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils.metric import distance_metric, type_metric
33 
34 import pyclustering.core.kmedians_wrapper as wrapper
35 
36 from pyclustering.core.wrapper import ccore_library
37 from pyclustering.core.metric_wrapper import metric_wrapper
38 
39 
40 class kmedians:
41  """!
42  @brief Class represents clustering algorithm K-Medians.
43  @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
44 
45  Example:
46  @code
47  from pyclustering.cluster.kmedians import kmedians
48  from pyclustering.cluster import cluster_visualizer
49  from pyclustering.utils import read_sample
50  from pyclustering.samples.definitions import FCPS_SAMPLES
51 
52  # Load list of points for cluster analysis.
53  sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
54 
55  # Create instance of K-Medians algorithm.
56  initial_medians = [[0.0, 0.1], [2.5, 0.7]]
57  kmedians_instance = kmedians(sample, initial_medians)
58 
59  # Run cluster analysis and obtain results.
60  kmedians_instance.process()
61  clusters = kmedians_instance.get_clusters()
62  medians = kmedians_instance.get_medians()
63 
64  # Visualize clustering results.
65  visualizer = cluster_visualizer()
66  visualizer.append_clusters(clusters, sample)
67  visualizer.append_cluster(initial_medians, marker='*', markersize=10)
68  visualizer.append_cluster(medians, marker='*', markersize=10)
69  visualizer.show()
70  @endcode
71 
72  """
73 
74  def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
75  """!
76  @brief Constructor of clustering algorithm K-Medians.
77 
78  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
79  @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
80  @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
81  @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
82  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').
83 
84  <b>Keyword Args:</b><br>
85  - metric (distance_metric): Metric that is used for distance calculation between two points.
86  - itermax (uint): Maximum number of iterations for cluster analysis.
87 
88  """
89  self.__pointer_data = data
90  self.__clusters = []
91  self.__medians = initial_centers[:]
92  self.__tolerance = tolerance
93 
94  self.__itermax = kwargs.get('itermax', 100)
95  self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
96  if self.__metric is None:
97  self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
98 
99  self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
100  if self.__ccore:
101  self.__ccore = ccore_library.workable()
102 
103 
104  def process(self):
105  """!
106  @brief Performs cluster analysis in line with rules of K-Medians algorithm.
107 
108  @return (kmedians) Returns itself (K-Medians instance).
109 
110  @remark Results of clustering can be obtained using corresponding get methods.
111 
112  @see get_clusters()
113  @see get_medians()
114 
115  """
116 
117  if self.__ccore is True:
118  ccore_metric = metric_wrapper.create_instance(self.__metric)
119  self.__clusters, self.__medians = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, self.__itermax, ccore_metric.get_pointer())
120 
121  else:
122  changes = float('inf')
123 
124  # Check for dimension
125  if len(self.__pointer_data[0]) != len(self.__medians[0]):
126  raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
127 
128  iterations = 0
129  while changes > self.__tolerance and iterations < self.__itermax:
130  self.__clusters = self.__update_clusters()
131  updated_centers = self.__update_medians()
132 
133  changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
134 
135  self.__medians = updated_centers
136 
137  iterations += 1
138 
139  return self
140 
141 
142  def get_clusters(self):
143  """!
144  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
145 
146  @see process()
147  @see get_medians()
148 
149  """
150 
151  return self.__clusters
152 
153 
154  def get_medians(self):
155  """!
156  @brief Returns list of centers of allocated clusters.
157 
158  @see process()
159  @see get_clusters()
160 
161  """
162 
163  return self.__medians
164 
165 
167  """!
168  @brief Returns clustering result representation type that indicate how clusters are encoded.
169 
170  @return (type_encoding) Clustering result representation.
171 
172  @see get_clusters()
173 
174  """
175 
176  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
177 
178 
179  def __update_clusters(self):
180  """!
181  @brief Calculate Manhattan distance to each point from the each cluster.
182  @details Nearest points are captured by according clusters and as a result clusters are updated.
183 
184  @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
185 
186  """
187 
188  clusters = [[] for i in range(len(self.__medians))]
189  for index_point in range(len(self.__pointer_data)):
190  index_optim = -1
191  dist_optim = 0.0
192 
193  for index in range(len(self.__medians)):
194  dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
195 
196  if (dist < dist_optim) or (index == 0):
197  index_optim = index
198  dist_optim = dist
199 
200  clusters[index_optim].append(index_point)
201 
202  # If cluster is not able to capture object it should be removed
203  clusters = [cluster for cluster in clusters if len(cluster) > 0]
204 
205  return clusters
206 
207 
208  def __update_medians(self):
209  """!
210  @brief Calculate medians of clusters in line with contained objects.
211 
212  @return (list) list of medians for current number of clusters.
213 
214  """
215 
216  medians = [[] for i in range(len(self.__clusters))]
217 
218  for index in range(len(self.__clusters)):
219  medians[index] = [0.0 for i in range(len(self.__pointer_data[0]))]
220  length_cluster = len(self.__clusters[index])
221 
222  for index_dimension in range(len(self.__pointer_data[0])):
223  sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
224 
225  relative_index_median = int(math.floor((length_cluster - 1) / 2))
226  index_median = sorted_cluster[relative_index_median]
227 
228  if (length_cluster % 2) == 0:
229  index_median_second = sorted_cluster[relative_index_median + 1]
230  medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
231 
232  else:
233  medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
234 
235  return medians
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1
Module for representing clustering results.
Definition: encoder.py:1
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:64
def __update_clusters(self)
Calculate Manhattan distance to each point from the each cluster.
Definition: kmedians.py:179
def process(self)
Performs cluster analysis in line with rules of K-Medians algorithm.
Definition: kmedians.py:104
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmedians.py:166
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, kwargs)
Constructor of clustering algorithm K-Medians.
Definition: kmedians.py:74
Class represents clustering algorithm K-Medians.
Definition: kmedians.py:40
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmedians.py:142
def get_medians(self)
Returns list of centers of allocated clusters.
Definition: kmedians.py:154
def __update_medians(self)
Calculate medians of clusters in line with contained objects.
Definition: kmedians.py:208