center_initializer.py
1 """!
2 
3 @brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4 @details Implementation based on paper @cite article::kmeans++::1.
5 
6 @authors Andrei Novikov, Aleksey Kukushkin (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @see pyclustering.cluster.kmeans
11 @see puclustering.cluster.xmeans
12 
13 @cond GNU_PUBLIC_LICENSE
14  PyClustering is free software: you can redistribute it and/or modify
15  it under the terms of the GNU General Public License as published by
16  the Free Software Foundation, either version 3 of the License, or
17  (at your option) any later version.
18 
19  PyClustering is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  GNU General Public License for more details.
23 
24  You should have received a copy of the GNU General Public License
25  along with this program. If not, see <http://www.gnu.org/licenses/>.
26 @endcond
27 
28 """
29 
30 
31 import numpy
32 import random
33 
34 
36  """!
37  @brief Random center initializer is for generation specified amount of random of centers for specified data.
38 
39  """
40 
41  def __init__(self, data, amount_centers):
42  """!
43  @brief Creates instance of random center initializer.
44 
45  @param[in] data (list): List of points where each point is represented by list of coordinates.
46  @param[in] amount_centers (unit): Amount of centers that should be initialized.
47 
48  """
49 
50  self.__data = data
51  self.__amount = amount_centers
52  self.__available_indexes = set(list(range(len(self.__data))))
53 
54  if self.__amount <= 0:
55  raise ValueError("Amount of cluster centers should be at least 1.")
56 
57  if self.__amount > len(self.__data):
58  raise ValueError("Amount of cluster centers '%d' should be less than data size." % self.__amount)
59 
60 
61  def initialize(self, **kwargs):
62  """!
63  @brief Generates random centers in line with input parameters.
64 
65  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
66 
67  <b>Keyword Args:</b><br>
68  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
69 
70  @return (list) List of initialized initial centers.
71  If argument 'return_index' is False then returns list of points.
72  If argument 'return_index' is True then returns list of indexes.
73 
74  """
75  return_index = kwargs.get('return_index', False)
76  if self.__amount == len(self.__data):
77  if return_index:
78  return list(range(len(self.__data)))
79  return self.__data[:]
80 
81  return [self.__create_center(return_index) for _ in range(self.__amount)]
82 
83 
84  def __create_center(self, return_index):
85  """!
86  @brief Generates and returns random center.
87 
88  @param[in] return_index (bool): If True then returns index of point from input data instead of point itself.
89 
90  """
91  random_index_point = random.randint(0, len(self.__data[0]))
92  if random_index_point not in self.__available_indexes:
93  random_index_point = self.__available_indexes.pop()
94  else:
95  self.__available_indexes.remove(random_index_point)
96 
97  if return_index:
98  return random_index_point
99  return self.__data[random_index_point]
100 
101 
103  """!
104  @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
105  @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
106  initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
107  out optimal initial centers.
108 
109  Algorithm can be divided into three steps. The first center is chosen from input data randomly with
110  uniform distribution at the first step. At the second, probability to being center is calculated for each point:
111  \f[p_{i}=\frac{D(x_{i})}{\sum_{j=0}^{N}D(x_{j})}\f]
112  where \f$D(x_{i})\f$ is a distance from point \f$i\f$ to the closest center. Using this probabilities next center
113  is chosen. The last step is repeated until required amount of centers is initialized.
114 
115  Pyclustering implementation of the algorithm provides feature to consider several candidates on the second
116  step, for example:
117 
118  @code
119  amount_centers = 4;
120  amount_candidates = 3;
121  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
122  @endcode
123 
124  If the farthest points should be used as centers then special constant 'FARTHEST_CENTER_CANDIDATE' should be used
125  for that purpose, for example:
126  @code
127  amount_centers = 4;
128  amount_candidates = kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE;
129  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
130  @endcode
131 
132  There is an example of initial centers that were calculated by the K-Means++ method:
133 
134  @image html kmeans_plusplus_initializer_results.png
135 
136  Code example where initial centers are prepared for K-Means algorithm:
137  @code
138  # Read data 'SampleSimple3' from Simple Sample collection.
139  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
140 
141  # Calculate initial centers using K-Means++ method.
142  centers = kmeans_plusplus_initializer(sample, 4).initialize();
143 
144  # Display initial centers.
145  visualizer = cluster_visualizer();
146  visualizer.append_cluster(sample);
147  visualizer.append_cluster(centers, marker = '*', markersize = 10);
148  visualizer.show();
149 
150  # Perform cluster analysis using K-Means algorithm with initial centers.
151  kmeans_instance = kmeans(sample, centers);
152 
153  # Run clustering process and obtain result.
154  kmeans_instance.process();
155  clusters = kmeans_instance.get_clusters();
156  @endcode
157 
158  """
159 
160 
161 
162  FARTHEST_CENTER_CANDIDATE = "farthest"
163 
164 
165  def __init__(self, data, amount_centers, amount_candidates = 1):
166  """!
167  @brief Creates K-Means++ center initializer instance.
168 
169  @param[in] data (array_like): List of points where each point is represented by list of coordinates.
170  @param[in] amount_centers (uint): Amount of centers that should be initialized.
171  @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
172  be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.
173 
174  @see FARTHEST_CENTER_CANDIDATE
175 
176  """
177 
178  self.__data = numpy.array(data)
179  self.__amount = amount_centers
180  self.__candidates = amount_candidates
181 
182  self.__check_parameters()
183 
184 
185  def __check_parameters(self):
186  """!
187  @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
188 
189  """
190  if (self.__amount <= 0) or (self.__amount > len(self.__data)):
191  raise AttributeError("Amount of cluster centers '" + str(self.__amount) + "' should be at least 1 and "
192  "should be less or equal to amount of points in data.")
193 
194  if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
195  if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
196  raise AttributeError("Amount of center candidates '" + str(self.__candidates) + "' should be at least 1 "
197  "and should be less or equal to amount of points in data.")
198 
199  if len(self.__data) == 0:
200  raise AttributeError("Data is empty.")
201 
202 
203  def __calculate_shortest_distances(self, data, centers):
204  """!
205  @brief Calculates distance from each data point to nearest center.
206 
207  @param[in] data (numpy.array): Array of points for that initialization is performed.
208  @param[in] centers (numpy.array): Array of points that represents centers.
209 
210  @return (numpy.array) List of distances to closest center for each data point.
211 
212  """
213 
214  dataset_differences = numpy.zeros((len(centers), len(data)))
215  for index_center in range(len(centers)):
216  dataset_differences[index_center] = numpy.sum(
217  numpy.square(data - centers[index_center]), axis=1).T
218 
219  shortest_distances = numpy.min(dataset_differences, axis=0)
220  return shortest_distances
221 
222 
223  def __get_next_center(self, centers, return_index):
224  """!
225  @brief Calculates the next center for the data.
226 
227  @param[in] centers (array_like): Current initialized centers.
228  @param[in] return_index (bool): If True then return center's index instead of point.
229 
230  @return (array_like) Next initialized center.<br>
231  (uint) Index of next initialized center if return_index is True.
232 
233  """
234 
235  distances = self.__calculate_shortest_distances(data=self.__data, centers=centers)
236 
237  if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
238  center_index = numpy.argmax(distances)
239  else:
240  probabilities = self.__calculate_probabilities(distances)
241  center_index = self.__get_probable_center(distances, probabilities)
242 
243  if return_index:
244  return center_index
245 
246  return self.__data[center_index]
247 
248 
249  def __get_initial_center(self, return_index):
250  """!
251  @brief Choose randomly first center.
252 
253  @param[in] return_index (bool): If True then return center's index instead of point.
254 
255  @return (array_like) First center.<br>
256  (uint) Index of first center.
257 
258  """
259 
260  index_center = random.randint(0, len(self.__data) - 1)
261  if return_index:
262  return index_center
263 
264  return self.__data[index_center]
265 
266 
267  def __calculate_probabilities(self, distances):
268  """!
269  @brief Calculates cumulative probabilities of being center of each point.
270 
271  @param[in] distances (array_like): Distances from each point to closest center.
272 
273  @return (array_like) Cumulative probabilities of being center of each point.
274 
275  """
276 
277  total_distance = numpy.sum(distances)
278  if total_distance != 0.0:
279  probabilities = distances / total_distance
280  return numpy.cumsum(probabilities)
281  else:
282  return numpy.zeros(len(distances))
283 
284 
285  def __get_probable_center(self, distances, probabilities):
286  """!
287  @brief Calculates the next probable center considering amount candidates.
288 
289  @param[in] distances (array_like): Distances from each point to closest center.
290  @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
291 
292  @return (uint) Index point that is next initialized center.
293 
294  """
295 
296  index_best_candidate = -1
297  for _ in range(self.__candidates):
298  candidate_probability = random.random()
299  index_candidate = 0
300 
301  for index_object in range(len(probabilities)):
302  if candidate_probability < probabilities[index_object]:
303  index_candidate = index_object
304  break
305 
306  if index_best_candidate == -1:
307  index_best_candidate = index_candidate
308  elif distances[index_best_candidate] < distances[index_candidate]:
309  index_best_candidate = index_candidate
310 
311  return index_best_candidate
312 
313 
314  def initialize(self, **kwargs):
315  """!
316  @brief Calculates initial centers using K-Means++ method.
317 
318  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
319 
320  <b>Keyword Args:</b><br>
321  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
322 
323  @return (list) List of initialized initial centers.
324  If argument 'return_index' is False then returns list of points.
325  If argument 'return_index' is True then returns list of indexes.
326 
327  """
328 
329  return_index = kwargs.get('return_index', False)
330  centers = [self.__get_initial_center(return_index)]
331 
332  # For each next center
333  for _ in range(1, self.__amount):
334  next_center = self.__get_next_center(centers, return_index)
335  centers.append(next_center)
336 
337  return centers
def __calculate_probabilities(self, distances)
Calculates cumulative probabilities of being center of each point.
def initialize(self, kwargs)
Calculates initial centers using K-Means++ method.
def __create_center(self, return_index)
Generates and returns random center.
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def initialize(self, kwargs)
Generates random centers in line with input parameters.
def __get_probable_center(self, distances, probabilities)
Calculates the next probable center considering amount candidates.
Random center initializer is for generation specified amount of random of centers for specified data...
def __check_parameters(self)
Checks input parameters of the algorithm and if something wrong then corresponding exception is throw...
def __init__(self, data, amount_centers)
Creates instance of random center initializer.
def __init__(self, data, amount_centers, amount_candidates=1)
Creates K-Means++ center initializer instance.
def __get_initial_center(self, return_index)
Choose randomly first center.
def __calculate_shortest_distances(self, data, centers)
Calculates distance from each data point to nearest center.
def __get_next_center(self, centers, return_index)
Calculates the next center for the data.