center_initializer.py
1 """!
2 
3 @brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4 @details Implementation based on paper @cite article::kmeans++::1.
5 
6 @authors Andrei Novikov, Aleksey Kukushkin (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @see pyclustering.cluster.kmeans
11 @see puclustering.cluster.xmeans
12 
13 @cond GNU_PUBLIC_LICENSE
14  PyClustering is free software: you can redistribute it and/or modify
15  it under the terms of the GNU General Public License as published by
16  the Free Software Foundation, either version 3 of the License, or
17  (at your option) any later version.
18 
19  PyClustering is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  GNU General Public License for more details.
23 
24  You should have received a copy of the GNU General Public License
25  along with this program. If not, see <http://www.gnu.org/licenses/>.
26 @endcond
27 
28 """
29 
30 
31 import numpy
32 import random
33 import warnings
34 
35 
37  """!
38  @brief Random center initializer is for generation specified amount of random of centers for specified data.
39 
40  """
41 
42  def __init__(self, data, amount_centers):
43  """!
44  @brief Creates instance of random center initializer.
45 
46  @param[in] data (list): List of points where each point is represented by list of coordinates.
47  @param[in] amount_centers (unit): Amount of centers that should be initialized.
48 
49  """
50 
51  self.__data = data
52  self.__amount = amount_centers
53  self.__available_indexes = set(list(range(len(self.__data))))
54 
55  if self.__amount <= 0:
56  raise ValueError("Amount of cluster centers should be at least 1.")
57 
58  if self.__amount > len(self.__data):
59  raise ValueError("Amount of cluster centers '%d' should be less than data size." % self.__amount)
60 
61 
62  def initialize(self, **kwargs):
63  """!
64  @brief Generates random centers in line with input parameters.
65 
66  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
67 
68  <b>Keyword Args:</b><br>
69  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
70 
71  @return (list) List of initialized initial centers.
72  If argument 'return_index' is False then returns list of points.
73  If argument 'return_index' is True then returns list of indexes.
74 
75  """
76  return_index = kwargs.get('return_index', False)
77  if self.__amount == len(self.__data):
78  if return_index:
79  return list(range(len(self.__data)))
80  return self.__data[:]
81 
82  return [self.__create_center(return_index) for _ in range(self.__amount)]
83 
84 
85  def __create_center(self, return_index):
86  """!
87  @brief Generates and returns random center.
88 
89  @param[in] return_index (bool): If True then returns index of point from input data instead of point itself.
90 
91  """
92  random_index_point = random.randint(0, len(self.__data[0]))
93  if random_index_point not in self.__available_indexes:
94  random_index_point = self.__available_indexes.pop()
95  else:
96  self.__available_indexes.remove(random_index_point)
97 
98  if return_index:
99  return random_index_point
100  return self.__data[random_index_point]
101 
102 
103 
105  """!
106  @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
107  @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
108  initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
109  out optimal initial centers.
110 
111  Algorithm can be divided into three steps. The first center is chosen from input data randomly with
112  uniform distribution at the first step. At the second, probability to being center is calculated for each point:
113  \f[p_{i}=\frac{D(x_{i})}{\sum_{j=0}^{N}D(x_{j})}\f]
114  where \f$D(x_{i})\f$ is a distance from point \f$i\f$ to the closest center. Using this probabilities next center
115  is chosen. The last step is repeated until required amount of centers is initialized.
116 
117  Pyclustering implementation of the algorithm provides feature to consider several candidates on the second
118  step, for example:
119 
120  @code
121  amount_centers = 4;
122  amount_candidates = 3;
123  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
124  @endcode
125 
126  If the farthest points should be used as centers then special constant 'FARTHEST_CENTER_CANDIDATE' should be used
127  for that purpose, for example:
128  @code
129  amount_centers = 4;
130  amount_candidates = kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE;
131  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
132  @endcode
133 
134  There is an example of initial centers that were calculated by the K-Means++ method:
135 
136  @image html kmeans_plusplus_initializer_results.png
137 
138  Code example where initial centers are prepared for K-Means algorithm:
139  @code
140  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
141  from pyclustering.cluster.kmeans import kmeans
142  from pyclustering.cluster import cluster_visualizer
143  from pyclustering.utils import read_sample
144  from pyclustering.samples.definitions import SIMPLE_SAMPLES
145 
146  # Read data 'SampleSimple3' from Simple Sample collection.
147  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
148 
149  # Calculate initial centers using K-Means++ method.
150  centers = kmeans_plusplus_initializer(sample, 4, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
151 
152  # Display initial centers.
153  visualizer = cluster_visualizer()
154  visualizer.append_cluster(sample)
155  visualizer.append_cluster(centers, marker='*', markersize=10)
156  visualizer.show()
157 
158  # Perform cluster analysis using K-Means algorithm with initial centers.
159  kmeans_instance = kmeans(sample, centers)
160 
161  # Run clustering process and obtain result.
162  kmeans_instance.process()
163  clusters = kmeans_instance.get_clusters()
164  @endcode
165 
166  """
167 
168 
169 
170  FARTHEST_CENTER_CANDIDATE = "farthest"
171 
172 
173  def __init__(self, data, amount_centers, amount_candidates=None):
174  """!
175  @brief Creates K-Means++ center initializer instance.
176 
177  @param[in] data (array_like): List of points where each point is represented by list of coordinates.
178  @param[in] amount_centers (uint): Amount of centers that should be initialized.
179  @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points
180  (with the highest probability) should be considered as centers then special constant should be used
181  'FARTHEST_CENTER_CANDIDATE'.
182 
183  @see FARTHEST_CENTER_CANDIDATE
184 
185  """
186 
187  self.__data = numpy.array(data)
188  self.__amount = amount_centers
189  self.__free_indexes = set(range(len(self.__data)))
190 
191  if amount_candidates is None:
192  self.__candidates = 3
193  if self.__candidates > len(self.__data):
194  self.__candidates = len(self.__data)
195  else:
196  self.__candidates = amount_candidates
197 
198  self.__check_parameters()
199 
200  random.seed()
201 
202 
203  def __check_parameters(self):
204  """!
205  @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
206 
207  """
208  if (self.__amount <= 0) or (self.__amount > len(self.__data)):
209  raise ValueError("Amount of cluster centers '" + str(self.__amount) + "' should be at least 1 and "
210  "should be less or equal to amount of points in data.")
211 
212  if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
213  if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
214  raise ValueError("Amount of center candidates '" + str(self.__candidates) + "' should be at least 1 "
215  "and should be less or equal to amount of points in data.")
216 
217  if len(self.__data) == 0:
218  raise ValueError("Data is empty.")
219 
220 
221  def __calculate_shortest_distances(self, data, centers):
222  """!
223  @brief Calculates distance from each data point to nearest center.
224 
225  @param[in] data (numpy.array): Array of points for that initialization is performed.
226  @param[in] centers (numpy.array): Array of indexes that represents centers.
227 
228  @return (numpy.array) List of distances to closest center for each data point.
229 
230  """
231 
232  dataset_differences = numpy.zeros((len(centers), len(data)))
233  for index_center in range(len(centers)):
234  center = data[centers[index_center]]
235 
236  dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
237 
238  with warnings.catch_warnings():
239  numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
240  shortest_distances = numpy.nanmin(dataset_differences, axis=0)
241 
242  return shortest_distances
243 
244 
245  def __get_next_center(self, centers):
246  """!
247  @brief Calculates the next center for the data.
248 
249  @param[in] centers (array_like): Current initialized centers represented by indexes.
250 
251  @return (array_like) Next initialized center.<br>
252  (uint) Index of next initialized center if return_index is True.
253 
254  """
255 
256  distances = self.__calculate_shortest_distances(self.__data, centers)
257 
258  if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
259  for index_point in centers:
260  distances[index_point] = numpy.nan
261  center_index = numpy.nanargmax(distances)
262  else:
263  probabilities = self.__calculate_probabilities(distances)
264  center_index = self.__get_probable_center(distances, probabilities)
265 
266  return center_index
267 
268 
269  def __get_initial_center(self, return_index):
270  """!
271  @brief Choose randomly first center.
272 
273  @param[in] return_index (bool): If True then return center's index instead of point.
274 
275  @return (array_like) First center.<br>
276  (uint) Index of first center.
277 
278  """
279 
280  index_center = random.randint(0, len(self.__data) - 1)
281  if return_index:
282  return index_center
283 
284  return self.__data[index_center]
285 
286 
287  def __calculate_probabilities(self, distances):
288  """!
289  @brief Calculates cumulative probabilities of being center of each point.
290 
291  @param[in] distances (array_like): Distances from each point to closest center.
292 
293  @return (array_like) Cumulative probabilities of being center of each point.
294 
295  """
296 
297  total_distance = numpy.sum(distances)
298  if total_distance != 0.0:
299  probabilities = distances / total_distance
300  return numpy.cumsum(probabilities)
301  else:
302  return numpy.zeros(len(distances))
303 
304 
305  def __get_probable_center(self, distances, probabilities):
306  """!
307  @brief Calculates the next probable center considering amount candidates.
308 
309  @param[in] distances (array_like): Distances from each point to closest center.
310  @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
311 
312  @return (uint) Index point that is next initialized center.
313 
314  """
315 
316  index_best_candidate = 0
317  for i in range(self.__candidates):
318  candidate_probability = random.random()
319  index_candidate = -1
320 
321  for index_object in range(len(probabilities)):
322  if candidate_probability < probabilities[index_object]:
323  index_candidate = index_object
324  break
325 
326  if index_candidate == -1:
327  index_best_candidate = next(iter(self.__free_indexes))
328  elif distances[index_best_candidate] < distances[index_candidate]:
329  index_best_candidate = index_candidate
330 
331  return index_best_candidate
332 
333 
334  def initialize(self, **kwargs):
335  """!
336  @brief Calculates initial centers using K-Means++ method.
337 
338  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
339 
340  <b>Keyword Args:</b><br>
341  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
342 
343  @return (list) List of initialized initial centers.
344  If argument 'return_index' is False then returns list of points.
345  If argument 'return_index' is True then returns list of indexes.
346 
347  """
348 
349  return_index = kwargs.get('return_index', False)
350 
351  index_point = self.__get_initial_center(True)
352  centers = [index_point]
353  self.__free_indexes.remove(index_point)
354 
355  # For each next center
356  for _ in range(1, self.__amount):
357  index_point = self.__get_next_center(centers)
358  centers.append(index_point)
359  self.__free_indexes.remove(index_point)
360 
361  if not return_index:
362  centers = [self.__data[index] for index in centers]
363 
364  return centers
def __calculate_probabilities(self, distances)
Calculates cumulative probabilities of being center of each point.
def initialize(self, kwargs)
Calculates initial centers using K-Means++ method.
def __create_center(self, return_index)
Generates and returns random center.
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def initialize(self, kwargs)
Generates random centers in line with input parameters.
def __get_probable_center(self, distances, probabilities)
Calculates the next probable center considering amount candidates.
Random center initializer is for generation specified amount of random of centers for specified data...
def __check_parameters(self)
Checks input parameters of the algorithm and if something wrong then corresponding exception is throw...
def __init__(self, data, amount_centers, amount_candidates=None)
Creates K-Means++ center initializer instance.
def __init__(self, data, amount_centers)
Creates instance of random center initializer.
def __get_next_center(self, centers)
Calculates the next center for the data.
def __get_initial_center(self, return_index)
Choose randomly first center.
def __calculate_shortest_distances(self, data, centers)
Calculates distance from each data point to nearest center.