center_initializer.py
1 """!
2 
3 @brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4 @details Implementation based on paper @cite article::kmeans++::1.
5 
6 @authors Andrei Novikov, Aleksey Kukushkin (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @see pyclustering.cluster.kmeans
11 @see puclustering.cluster.xmeans
12 
13 @cond GNU_PUBLIC_LICENSE
14  PyClustering is free software: you can redistribute it and/or modify
15  it under the terms of the GNU General Public License as published by
16  the Free Software Foundation, either version 3 of the License, or
17  (at your option) any later version.
18 
19  PyClustering is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  GNU General Public License for more details.
23 
24  You should have received a copy of the GNU General Public License
25  along with this program. If not, see <http://www.gnu.org/licenses/>.
26 @endcond
27 
28 """
29 
30 
31 import numpy
32 import random
33 import warnings
34 
35 
37  """!
38  @brief Random center initializer is for generation specified amount of random of centers for specified data.
39 
40  """
41 
42  def __init__(self, data, amount_centers):
43  """!
44  @brief Creates instance of random center initializer.
45 
46  @param[in] data (list): List of points where each point is represented by list of coordinates.
47  @param[in] amount_centers (unit): Amount of centers that should be initialized.
48 
49  """
50 
51  self.__data = data
52  self.__amount = amount_centers
53  self.__available_indexes = set(list(range(len(self.__data))))
54 
55  if self.__amount <= 0:
56  raise ValueError("Amount of cluster centers should be at least 1.")
57 
58  if self.__amount > len(self.__data):
59  raise ValueError("Amount of cluster centers '%d' should be less than data size." % self.__amount)
60 
61 
62  def initialize(self, **kwargs):
63  """!
64  @brief Generates random centers in line with input parameters.
65 
66  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
67 
68  <b>Keyword Args:</b><br>
69  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
70 
71  @return (list) List of initialized initial centers.
72  If argument 'return_index' is False then returns list of points.
73  If argument 'return_index' is True then returns list of indexes.
74 
75  """
76  return_index = kwargs.get('return_index', False)
77  if self.__amount == len(self.__data):
78  if return_index:
79  return list(range(len(self.__data)))
80  return self.__data[:]
81 
82  return [self.__create_center(return_index) for _ in range(self.__amount)]
83 
84 
85  def __create_center(self, return_index):
86  """!
87  @brief Generates and returns random center.
88 
89  @param[in] return_index (bool): If True then returns index of point from input data instead of point itself.
90 
91  """
92  random_index_point = random.randint(0, len(self.__data[0]))
93  if random_index_point not in self.__available_indexes:
94  random_index_point = self.__available_indexes.pop()
95  else:
96  self.__available_indexes.remove(random_index_point)
97 
98  if return_index:
99  return random_index_point
100  return self.__data[random_index_point]
101 
102 
103 
105  """!
106  @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
107  @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
108  initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
109  out optimal initial centers.
110 
111  Algorithm can be divided into three steps. The first center is chosen from input data randomly with
112  uniform distribution at the first step. At the second, probability to being center is calculated for each point:
113  \f[p_{i}=\frac{D(x_{i})}{\sum_{j=0}^{N}D(x_{j})}\f]
114  where \f$D(x_{i})\f$ is a distance from point \f$i\f$ to the closest center. Using this probabilities next center
115  is chosen. The last step is repeated until required amount of centers is initialized.
116 
117  Pyclustering implementation of the algorithm provides feature to consider several candidates on the second
118  step, for example:
119 
120  @code
121  amount_centers = 4;
122  amount_candidates = 3;
123  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
124  @endcode
125 
126  If the farthest points should be used as centers then special constant 'FARTHEST_CENTER_CANDIDATE' should be used
127  for that purpose, for example:
128  @code
129  amount_centers = 4;
130  amount_candidates = kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE;
131  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
132  @endcode
133 
134  There is an example of initial centers that were calculated by the K-Means++ method:
135 
136  @image html kmeans_plusplus_initializer_results.png
137 
138  Code example where initial centers are prepared for K-Means algorithm:
139  @code
140  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
141  from pyclustering.cluster.kmeans import kmeans
142  from pyclustering.cluster import cluster_visualizer
143  from pyclustering.utils import read_sample
144  from pyclustering.samples.definitions import SIMPLE_SAMPLES
145 
146  # Read data 'SampleSimple3' from Simple Sample collection.
147  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
148 
149  # Calculate initial centers using K-Means++ method.
150  centers = kmeans_plusplus_initializer(sample, 4, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
151 
152  # Display initial centers.
153  visualizer = cluster_visualizer()
154  visualizer.append_cluster(sample)
155  visualizer.append_cluster(centers, marker='*', markersize=10)
156  visualizer.show()
157 
158  # Perform cluster analysis using K-Means algorithm with initial centers.
159  kmeans_instance = kmeans(sample, centers)
160 
161  # Run clustering process and obtain result.
162  kmeans_instance.process()
163  clusters = kmeans_instance.get_clusters()
164  @endcode
165 
166  """
167 
168 
169 
170  FARTHEST_CENTER_CANDIDATE = "farthest"
171 
172 
173  def __init__(self, data, amount_centers, amount_candidates = 1):
174  """!
175  @brief Creates K-Means++ center initializer instance.
176 
177  @param[in] data (array_like): List of points where each point is represented by list of coordinates.
178  @param[in] amount_centers (uint): Amount of centers that should be initialized.
179  @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
180  be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.
181 
182  @see FARTHEST_CENTER_CANDIDATE
183 
184  """
185 
186  self.__data = numpy.array(data)
187  self.__amount = amount_centers
188  self.__candidates = amount_candidates
189  self.__free_indexes = set(range(len(self.__data)))
190 
191  self.__check_parameters()
192 
193 
194  def __check_parameters(self):
195  """!
196  @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
197 
198  """
199  if (self.__amount <= 0) or (self.__amount > len(self.__data)):
200  raise AttributeError("Amount of cluster centers '" + str(self.__amount) + "' should be at least 1 and "
201  "should be less or equal to amount of points in data.")
202 
203  if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
204  if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
205  raise AttributeError("Amount of center candidates '" + str(self.__candidates) + "' should be at least 1 "
206  "and should be less or equal to amount of points in data.")
207 
208  if len(self.__data) == 0:
209  raise AttributeError("Data is empty.")
210 
211 
212  def __calculate_shortest_distances(self, data, centers, index_representation):
213  """!
214  @brief Calculates distance from each data point to nearest center.
215 
216  @param[in] data (numpy.array): Array of points for that initialization is performed.
217  @param[in] centers (numpy.array): Array of points or indexes that represents centers.
218  @param[in] index_representation (bool): If 'True' then index representation is used in 'centers', otherwise
219  points itself are used for representation.
220 
221  @return (numpy.array) List of distances to closest center for each data point.
222 
223  """
224 
225  dataset_differences = numpy.zeros((len(centers), len(data)))
226  for index_center in range(len(centers)):
227  if index_representation:
228  center = data[centers[index_center]]
229  else:
230  center = centers[index_center]
231 
232  dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
233  for index_other_centers in centers:
234  dataset_differences[index_center][index_other_centers] = numpy.nan
235 
236  with warnings.catch_warnings():
237  numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
238  shortest_distances = numpy.nanmin(dataset_differences, axis=0)
239 
240  return shortest_distances
241 
242 
243  def __get_next_center(self, centers, return_index):
244  """!
245  @brief Calculates the next center for the data.
246 
247  @param[in] centers (array_like): Current initialized centers.
248  @param[in] return_index (bool): If True then return center's index instead of point.
249 
250  @return (array_like) Next initialized center.<br>
251  (uint) Index of next initialized center if return_index is True.
252 
253  """
254 
255  distances = self.__calculate_shortest_distances(self.__data, centers, return_index)
256 
257  if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
258  center_index = numpy.nanargmax(distances)
259  else:
260  probabilities = self.__calculate_probabilities(distances)
261  center_index = self.__get_probable_center(distances, probabilities)
262 
263  if return_index:
264  return center_index
265 
266  return self.__data[center_index]
267 
268 
269  def __get_initial_center(self, return_index):
270  """!
271  @brief Choose randomly first center.
272 
273  @param[in] return_index (bool): If True then return center's index instead of point.
274 
275  @return (array_like) First center.<br>
276  (uint) Index of first center.
277 
278  """
279 
280  index_center = random.randint(0, len(self.__data) - 1)
281  if return_index:
282  return index_center
283 
284  return self.__data[index_center]
285 
286 
287  def __calculate_probabilities(self, distances):
288  """!
289  @brief Calculates cumulative probabilities of being center of each point.
290 
291  @param[in] distances (array_like): Distances from each point to closest center.
292 
293  @return (array_like) Cumulative probabilities of being center of each point.
294 
295  """
296 
297  total_distance = numpy.sum(distances)
298  if total_distance != 0.0:
299  probabilities = distances / total_distance
300  return numpy.cumsum(probabilities)
301  else:
302  return numpy.zeros(len(distances))
303 
304 
305  def __get_probable_center(self, distances, probabilities):
306  """!
307  @brief Calculates the next probable center considering amount candidates.
308 
309  @param[in] distances (array_like): Distances from each point to closest center.
310  @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
311 
312  @return (uint) Index point that is next initialized center.
313 
314  """
315 
316  index_best_candidate = -1
317  for _ in range(self.__candidates):
318  candidate_probability = random.random()
319  index_candidate = 0
320 
321  for index_object in range(len(probabilities)):
322  if candidate_probability < probabilities[index_object]:
323  index_candidate = index_object
324  break
325 
326  if index_best_candidate == -1:
327  index_best_candidate = next(iter(self.__free_indexes))
328  elif distances[index_best_candidate] < distances[index_candidate]:
329  index_best_candidate = index_candidate
330 
331  return index_best_candidate
332 
333 
334  def initialize(self, **kwargs):
335  """!
336  @brief Calculates initial centers using K-Means++ method.
337 
338  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
339 
340  <b>Keyword Args:</b><br>
341  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
342 
343  @return (list) List of initialized initial centers.
344  If argument 'return_index' is False then returns list of points.
345  If argument 'return_index' is True then returns list of indexes.
346 
347  """
348 
349  return_index = kwargs.get('return_index', False)
350 
351  index_point = self.__get_initial_center(True)
352  centers = [index_point]
353  self.__free_indexes.remove(index_point)
354 
355  # For each next center
356  for _ in range(1, self.__amount):
357  index_point = self.__get_next_center(centers, True)
358  centers.append(index_point)
359  self.__free_indexes.remove(index_point)
360 
361  if not return_index:
362  centers = [self.__data[index] for index in centers]
363 
364  return centers
def __calculate_probabilities(self, distances)
Calculates cumulative probabilities of being center of each point.
def initialize(self, kwargs)
Calculates initial centers using K-Means++ method.
def __create_center(self, return_index)
Generates and returns random center.
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def initialize(self, kwargs)
Generates random centers in line with input parameters.
def __get_probable_center(self, distances, probabilities)
Calculates the next probable center considering amount candidates.
Random center initializer is for generation specified amount of random of centers for specified data...
def __check_parameters(self)
Checks input parameters of the algorithm and if something wrong then corresponding exception is throw...
def __init__(self, data, amount_centers)
Creates instance of random center initializer.
def __init__(self, data, amount_centers, amount_candidates=1)
Creates K-Means++ center initializer instance.
def __get_initial_center(self, return_index)
Choose randomly first center.
def __get_next_center(self, centers, return_index)
Calculates the next center for the data.
def __calculate_shortest_distances(self, data, centers, index_representation)
Calculates distance from each data point to nearest center.