center_initializer.py
1 """!
2 
3 @brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4 @details Implementation based on paper @cite article::kmeans++::1.
5 
6 @authors Andrei Novikov, Aleksey Kukushkin (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @see pyclustering.cluster.kmeans
11 @see puclustering.cluster.xmeans
12 
13 @cond GNU_PUBLIC_LICENSE
14  PyClustering is free software: you can redistribute it and/or modify
15  it under the terms of the GNU General Public License as published by
16  the Free Software Foundation, either version 3 of the License, or
17  (at your option) any later version.
18 
19  PyClustering is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  GNU General Public License for more details.
23 
24  You should have received a copy of the GNU General Public License
25  along with this program. If not, see <http://www.gnu.org/licenses/>.
26 @endcond
27 
28 """
29 
30 
31 import numpy
32 import random
33 import warnings
34 
35 
37  """!
38  @brief Random center initializer is for generation specified amount of random of centers for specified data.
39 
40  """
41 
42  def __init__(self, data, amount_centers, **kwargs):
43  """!
44  @brief Creates instance of random center initializer.
45 
46  @param[in] data (list): List of points where each point is represented by list of coordinates.
47  @param[in] amount_centers (unit): Amount of centers that should be initialized.
48  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'random_state').
49 
50  <b>Keyword Args:</b><br>
51  - random_state (int): Seed for random state (by default is `None`, current system time is used).
52 
53  """
54 
55  self.__data = data
56  self.__amount = amount_centers
57  self.__available_indexes = set(list(range(len(self.__data))))
58 
59  random.seed(kwargs.get('random_state', None))
60 
61  if self.__amount <= 0:
62  raise ValueError("Amount of cluster centers should be at least 1.")
63 
64  if self.__amount > len(self.__data):
65  raise ValueError("Amount of cluster centers '%d' should be less than data size." % self.__amount)
66 
67 
68  def initialize(self, **kwargs):
69  """!
70  @brief Generates random centers in line with input parameters.
71 
72  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
73 
74  <b>Keyword Args:</b><br>
75  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
76 
77  @return (list) List of initialized initial centers.
78  If argument 'return_index' is False then returns list of points.
79  If argument 'return_index' is True then returns list of indexes.
80 
81  """
82  return_index = kwargs.get('return_index', False)
83  if self.__amount == len(self.__data):
84  if return_index:
85  return list(range(len(self.__data)))
86  return self.__data[:]
87 
88  return [self.__create_center(return_index) for _ in range(self.__amount)]
89 
90 
91  def __create_center(self, return_index):
92  """!
93  @brief Generates and returns random center.
94 
95  @param[in] return_index (bool): If True then returns index of point from input data instead of point itself.
96 
97  """
98  random_index_point = random.randint(0, len(self.__data))
99  if random_index_point not in self.__available_indexes:
100  random_index_point = self.__available_indexes.pop()
101  else:
102  self.__available_indexes.remove(random_index_point)
103 
104  if return_index:
105  return random_index_point
106  return self.__data[random_index_point]
107 
108 
109 
111  """!
112  @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
113  @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
114  initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
115  out optimal initial centers.
116 
117  Algorithm can be divided into three steps. The first center is chosen from input data randomly with
118  uniform distribution at the first step. At the second, probability to being center is calculated for each point:
119  \f[p_{i}=\frac{D(x_{i})}{\sum_{j=0}^{N}D(x_{j})}\f]
120  where \f$D(x_{i})\f$ is a distance from point \f$i\f$ to the closest center. Using this probabilities next center
121  is chosen. The last step is repeated until required amount of centers is initialized.
122 
123  Pyclustering implementation of the algorithm provides feature to consider several candidates on the second
124  step, for example:
125 
126  @code
127  amount_centers = 4;
128  amount_candidates = 3;
129  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
130  @endcode
131 
132  If the farthest points should be used as centers then special constant 'FARTHEST_CENTER_CANDIDATE' should be used
133  for that purpose, for example:
134  @code
135  amount_centers = 4;
136  amount_candidates = kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE;
137  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
138  @endcode
139 
140  There is an example of initial centers that were calculated by the K-Means++ method:
141 
142  @image html kmeans_plusplus_initializer_results.png
143 
144  Code example where initial centers are prepared for K-Means algorithm:
145  @code
146  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
147  from pyclustering.cluster.kmeans import kmeans
148  from pyclustering.cluster import cluster_visualizer
149  from pyclustering.utils import read_sample
150  from pyclustering.samples.definitions import SIMPLE_SAMPLES
151 
152  # Read data 'SampleSimple3' from Simple Sample collection.
153  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
154 
155  # Calculate initial centers using K-Means++ method.
156  centers = kmeans_plusplus_initializer(sample, 4, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
157 
158  # Display initial centers.
159  visualizer = cluster_visualizer()
160  visualizer.append_cluster(sample)
161  visualizer.append_cluster(centers, marker='*', markersize=10)
162  visualizer.show()
163 
164  # Perform cluster analysis using K-Means algorithm with initial centers.
165  kmeans_instance = kmeans(sample, centers)
166 
167  # Run clustering process and obtain result.
168  kmeans_instance.process()
169  clusters = kmeans_instance.get_clusters()
170  @endcode
171 
172  """
173 
174 
175 
176  FARTHEST_CENTER_CANDIDATE = "farthest"
177 
178 
179  def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
180  """!
181  @brief Creates K-Means++ center initializer instance.
182 
183  @param[in] data (array_like): List of points where each point is represented by list of coordinates.
184  @param[in] amount_centers (uint): Amount of centers that should be initialized.
185  @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points
186  (with the highest probability) should be considered as centers then special constant should be used
187  'FARTHEST_CENTER_CANDIDATE'. By default the amount of candidates is 3.
188  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'random_state').
189 
190  <b>Keyword Args:</b><br>
191  - random_state (int): Seed for random state (by default is `None`, current system time is used).
192 
193  @see FARTHEST_CENTER_CANDIDATE
194 
195  """
196 
197  self.__data = numpy.array(data)
198  self.__amount = amount_centers
199  self.__free_indexes = set(range(len(self.__data)))
200 
201  if amount_candidates is None:
202  self.__candidates = 3
203  if self.__candidates > len(self.__data):
204  self.__candidates = len(self.__data)
205  else:
206  self.__candidates = amount_candidates
207 
208  self.__check_parameters()
209 
210  random.seed(kwargs.get('random_state', None))
211 
212 
213  def __check_parameters(self):
214  """!
215  @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
216 
217  """
218  if (self.__amount <= 0) or (self.__amount > len(self.__data)):
219  raise ValueError("Amount of cluster centers '" + str(self.__amount) + "' should be at least 1 and "
220  "should be less or equal to amount of points in data.")
221 
222  if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
223  if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
224  raise ValueError("Amount of center candidates '" + str(self.__candidates) + "' should be at least 1 "
225  "and should be less or equal to amount of points in data.")
226 
227  if len(self.__data) == 0:
228  raise ValueError("Data is empty.")
229 
230 
231  def __calculate_shortest_distances(self, data, centers):
232  """!
233  @brief Calculates distance from each data point to nearest center.
234 
235  @param[in] data (numpy.array): Array of points for that initialization is performed.
236  @param[in] centers (numpy.array): Array of indexes that represents centers.
237 
238  @return (numpy.array) List of distances to closest center for each data point.
239 
240  """
241 
242  dataset_differences = numpy.zeros((len(centers), len(data)))
243  for index_center in range(len(centers)):
244  center = data[centers[index_center]]
245 
246  dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
247 
248  with warnings.catch_warnings():
249  numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
250  shortest_distances = numpy.nanmin(dataset_differences, axis=0)
251 
252  return shortest_distances
253 
254 
255  def __get_next_center(self, centers):
256  """!
257  @brief Calculates the next center for the data.
258 
259  @param[in] centers (array_like): Current initialized centers represented by indexes.
260 
261  @return (array_like) Next initialized center.<br>
262  (uint) Index of next initialized center if return_index is True.
263 
264  """
265 
266  distances = self.__calculate_shortest_distances(self.__data, centers)
267 
268  if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
269  for index_point in centers:
270  distances[index_point] = numpy.nan
271  center_index = numpy.nanargmax(distances)
272  else:
273  probabilities = self.__calculate_probabilities(distances)
274  center_index = self.__get_probable_center(distances, probabilities)
275 
276  return center_index
277 
278 
279  def __get_initial_center(self, return_index):
280  """!
281  @brief Choose randomly first center.
282 
283  @param[in] return_index (bool): If True then return center's index instead of point.
284 
285  @return (array_like) First center.<br>
286  (uint) Index of first center.
287 
288  """
289 
290  index_center = random.randint(0, len(self.__data) - 1)
291  if return_index:
292  return index_center
293 
294  return self.__data[index_center]
295 
296 
297  def __calculate_probabilities(self, distances):
298  """!
299  @brief Calculates cumulative probabilities of being center of each point.
300 
301  @param[in] distances (array_like): Distances from each point to closest center.
302 
303  @return (array_like) Cumulative probabilities of being center of each point.
304 
305  """
306 
307  total_distance = numpy.sum(distances)
308  if total_distance != 0.0:
309  probabilities = distances / total_distance
310  return numpy.cumsum(probabilities)
311  else:
312  return numpy.zeros(len(distances))
313 
314 
315  def __get_probable_center(self, distances, probabilities):
316  """!
317  @brief Calculates the next probable center considering amount candidates.
318 
319  @param[in] distances (array_like): Distances from each point to closest center.
320  @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
321 
322  @return (uint) Index point that is next initialized center.
323 
324  """
325 
326  index_best_candidate = 0
327  for i in range(self.__candidates):
328  candidate_probability = random.random()
329  index_candidate = -1
330 
331  for index_object in range(len(probabilities)):
332  if candidate_probability < probabilities[index_object]:
333  index_candidate = index_object
334  break
335 
336  if index_candidate == -1:
337  index_best_candidate = next(iter(self.__free_indexes))
338  elif distances[index_best_candidate] < distances[index_candidate]:
339  index_best_candidate = index_candidate
340 
341  return index_best_candidate
342 
343 
344  def initialize(self, **kwargs):
345  """!
346  @brief Calculates initial centers using K-Means++ method.
347 
348  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
349 
350  <b>Keyword Args:</b><br>
351  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
352 
353  @return (list) List of initialized initial centers.
354  If argument 'return_index' is False then returns list of points.
355  If argument 'return_index' is True then returns list of indexes.
356 
357  """
358 
359  return_index = kwargs.get('return_index', False)
360 
361  index_point = self.__get_initial_center(True)
362  centers = [index_point]
363  self.__free_indexes.remove(index_point)
364 
365  # For each next center
366  for _ in range(1, self.__amount):
367  index_point = self.__get_next_center(centers)
368  centers.append(index_point)
369  self.__free_indexes.remove(index_point)
370 
371  if not return_index:
372  centers = [self.__data[index] for index in centers]
373 
374  return centers
def __calculate_probabilities(self, distances)
Calculates cumulative probabilities of being center of each point.
def initialize(self, kwargs)
Calculates initial centers using K-Means++ method.
def __create_center(self, return_index)
Generates and returns random center.
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def initialize(self, kwargs)
Generates random centers in line with input parameters.
def __get_probable_center(self, distances, probabilities)
Calculates the next probable center considering amount candidates.
Random center initializer is for generation specified amount of random of centers for specified data...
def __check_parameters(self)
Checks input parameters of the algorithm and if something wrong then corresponding exception is throw...
def __get_next_center(self, centers)
Calculates the next center for the data.
def __get_initial_center(self, return_index)
Choose randomly first center.
def __calculate_shortest_distances(self, data, centers)
Calculates distance from each data point to nearest center.
def __init__(self, data, amount_centers, amount_candidates=None, kwargs)
Creates K-Means++ center initializer instance.
def __init__(self, data, amount_centers, kwargs)
Creates instance of random center initializer.