pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
center_initializer.py
1 """!
2 
3 @brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4 @details Implementation based on paper @cite article::kmeans++::1.
5 
6 @authors Andrei Novikov, Aleksey Kukushkin (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 @see pyclustering.cluster.kmeans
11 @see puclustering.cluster.xmeans
12 
13 """
14 
15 
16 import numpy
17 import random
18 import warnings
19 
20 
22  """!
23  @brief Random center initializer is for generation specified amount of random of centers for specified data.
24 
25  """
26 
27  def __init__(self, data, amount_centers, **kwargs):
28  """!
29  @brief Creates instance of random center initializer.
30 
31  @param[in] data (list): List of points where each point is represented by list of coordinates.
32  @param[in] amount_centers (unit): Amount of centers that should be initialized.
33  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'random_state').
34 
35  <b>Keyword Args:</b><br>
36  - random_state (int): Seed for random state (by default is `None`, current system time is used).
37 
38  """
39 
40  self.__data = data
41  self.__amount = amount_centers
42  self.__available_indexes = set(list(range(len(self.__data))))
43 
44  random.seed(kwargs.get('random_state', None))
45 
46  if self.__amount <= 0:
47  raise ValueError("Amount of cluster centers should be at least 1.")
48 
49  if self.__amount > len(self.__data):
50  raise ValueError("Amount of cluster centers '%d' should be less than data size." % self.__amount)
51 
52 
53  def initialize(self, **kwargs):
54  """!
55  @brief Generates random centers in line with input parameters.
56 
57  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
58 
59  <b>Keyword Args:</b><br>
60  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
61 
62  @return (list) List of initialized initial centers.
63  If argument 'return_index' is False then returns list of points.
64  If argument 'return_index' is True then returns list of indexes.
65 
66  """
67  return_index = kwargs.get('return_index', False)
68  if self.__amount == len(self.__data):
69  if return_index:
70  return list(range(len(self.__data)))
71  return self.__data[:]
72 
73  return [self.__create_center(return_index) for _ in range(self.__amount)]
74 
75 
76  def __create_center(self, return_index):
77  """!
78  @brief Generates and returns random center.
79 
80  @param[in] return_index (bool): If True then returns index of point from input data instead of point itself.
81 
82  """
83  random_index_point = random.randint(0, len(self.__data))
84  if random_index_point not in self.__available_indexes:
85  random_index_point = self.__available_indexes.pop()
86  else:
87  self.__available_indexes.remove(random_index_point)
88 
89  if return_index:
90  return random_index_point
91  return self.__data[random_index_point]
92 
93 
94 
96  """!
97  @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
98  @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
99  initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
100  out optimal initial centers.
101 
102  Algorithm can be divided into three steps. The first center is chosen from input data randomly with
103  uniform distribution at the first step. At the second, probability to being center is calculated for each point:
104  \f[p_{i}=\frac{D(x_{i})}{\sum_{j=0}^{N}D(x_{j})}\f]
105  where \f$D(x_{i})\f$ is a distance from point \f$i\f$ to the closest center. Using this probabilities next center
106  is chosen. The last step is repeated until required amount of centers is initialized.
107 
108  Pyclustering implementation of the algorithm provides feature to consider several candidates on the second
109  step, for example:
110 
111  @code
112  amount_centers = 4;
113  amount_candidates = 3;
114  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
115  @endcode
116 
117  If the farthest points should be used as centers then special constant 'FARTHEST_CENTER_CANDIDATE' should be used
118  for that purpose, for example:
119  @code
120  amount_centers = 4;
121  amount_candidates = kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE;
122  initializer = kmeans_plusplus_initializer(sample, amount_centers, amount_candidates);
123  @endcode
124 
125  There is an example of initial centers that were calculated by the K-Means++ method:
126 
127  @image html kmeans_plusplus_initializer_results.png
128 
129  Code example where initial centers are prepared for K-Means algorithm:
130  @code
131  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
132  from pyclustering.cluster.kmeans import kmeans
133  from pyclustering.cluster import cluster_visualizer
134  from pyclustering.utils import read_sample
135  from pyclustering.samples.definitions import SIMPLE_SAMPLES
136 
137  # Read data 'SampleSimple3' from Simple Sample collection.
138  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
139 
140  # Calculate initial centers using K-Means++ method.
141  centers = kmeans_plusplus_initializer(sample, 4, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
142 
143  # Display initial centers.
144  visualizer = cluster_visualizer()
145  visualizer.append_cluster(sample)
146  visualizer.append_cluster(centers, marker='*', markersize=10)
147  visualizer.show()
148 
149  # Perform cluster analysis using K-Means algorithm with initial centers.
150  kmeans_instance = kmeans(sample, centers)
151 
152  # Run clustering process and obtain result.
153  kmeans_instance.process()
154  clusters = kmeans_instance.get_clusters()
155  @endcode
156 
157  """
158 
159 
160 
161  FARTHEST_CENTER_CANDIDATE = "farthest"
162 
163 
164  def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
165  """!
166  @brief Creates K-Means++ center initializer instance.
167 
168  @param[in] data (array_like): List of points where each point is represented by list of coordinates.
169  @param[in] amount_centers (uint): Amount of centers that should be initialized.
170  @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points
171  (with the highest probability) should be considered as centers then special constant should be used
172  'FARTHEST_CENTER_CANDIDATE'. By default the amount of candidates is 3.
173  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'random_state').
174 
175  <b>Keyword Args:</b><br>
176  - random_state (int): Seed for random state (by default is `None`, current system time is used).
177 
178  @see FARTHEST_CENTER_CANDIDATE
179 
180  """
181 
182  self.__data = numpy.array(data)
183  self.__amount = amount_centers
184  self.__free_indexes = set(range(len(self.__data)))
185 
186  if amount_candidates is None:
187  self.__candidates = 3
188  if self.__candidates > len(self.__data):
189  self.__candidates = len(self.__data)
190  else:
191  self.__candidates = amount_candidates
192 
193  self.__check_parameters()
194 
195  random.seed(kwargs.get('random_state', None))
196 
197 
198  def __check_parameters(self):
199  """!
200  @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
201 
202  """
203  if (self.__amount <= 0) or (self.__amount > len(self.__data)):
204  raise ValueError("Amount of cluster centers '" + str(self.__amount) + "' should be at least 1 and "
205  "should be less or equal to amount of points in data.")
206 
207  if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
208  if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
209  raise ValueError("Amount of center candidates '" + str(self.__candidates) + "' should be at least 1 "
210  "and should be less or equal to amount of points in data.")
211 
212  if len(self.__data) == 0:
213  raise ValueError("Data is empty.")
214 
215 
216  def __calculate_shortest_distances(self, data, centers):
217  """!
218  @brief Calculates distance from each data point to nearest center.
219 
220  @param[in] data (numpy.array): Array of points for that initialization is performed.
221  @param[in] centers (numpy.array): Array of indexes that represents centers.
222 
223  @return (numpy.array) List of distances to closest center for each data point.
224 
225  """
226 
227  dataset_differences = numpy.zeros((len(centers), len(data)))
228  for index_center in range(len(centers)):
229  center = data[centers[index_center]]
230 
231  dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
232 
233  with warnings.catch_warnings():
234  numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
235  shortest_distances = numpy.nanmin(dataset_differences, axis=0)
236 
237  return shortest_distances
238 
239 
240  def __get_next_center(self, centers):
241  """!
242  @brief Calculates the next center for the data.
243 
244  @param[in] centers (array_like): Current initialized centers represented by indexes.
245 
246  @return (array_like) Next initialized center.<br>
247  (uint) Index of next initialized center if return_index is True.
248 
249  """
250 
251  distances = self.__calculate_shortest_distances(self.__data, centers)
252 
253  if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
254  for index_point in centers:
255  distances[index_point] = numpy.nan
256  center_index = numpy.nanargmax(distances)
257  else:
258  probabilities = self.__calculate_probabilities(distances)
259  center_index = self.__get_probable_center(distances, probabilities)
260 
261  return center_index
262 
263 
264  def __get_initial_center(self, return_index):
265  """!
266  @brief Choose randomly first center.
267 
268  @param[in] return_index (bool): If True then return center's index instead of point.
269 
270  @return (array_like) First center.<br>
271  (uint) Index of first center.
272 
273  """
274 
275  index_center = random.randint(0, len(self.__data) - 1)
276  if return_index:
277  return index_center
278 
279  return self.__data[index_center]
280 
281 
282  def __calculate_probabilities(self, distances):
283  """!
284  @brief Calculates cumulative probabilities of being center of each point.
285 
286  @param[in] distances (array_like): Distances from each point to closest center.
287 
288  @return (array_like) Cumulative probabilities of being center of each point.
289 
290  """
291 
292  total_distance = numpy.sum(distances)
293  if total_distance != 0.0:
294  probabilities = distances / total_distance
295  return numpy.cumsum(probabilities)
296  else:
297  return numpy.zeros(len(distances))
298 
299 
300  def __get_probable_center(self, distances, probabilities):
301  """!
302  @brief Calculates the next probable center considering amount candidates.
303 
304  @param[in] distances (array_like): Distances from each point to closest center.
305  @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
306 
307  @return (uint) Index point that is next initialized center.
308 
309  """
310 
311  index_best_candidate = 0
312  for i in range(self.__candidates):
313  candidate_probability = random.random()
314  index_candidate = -1
315 
316  for index_object in range(len(probabilities)):
317  if candidate_probability < probabilities[index_object]:
318  index_candidate = index_object
319  break
320 
321  if index_candidate == -1:
322  index_best_candidate = next(iter(self.__free_indexes))
323  elif distances[index_best_candidate] < distances[index_candidate]:
324  index_best_candidate = index_candidate
325 
326  return index_best_candidate
327 
328 
329  def initialize(self, **kwargs):
330  """!
331  @brief Calculates initial centers using K-Means++ method.
332 
333  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'return_index').
334 
335  <b>Keyword Args:</b><br>
336  - return_index (bool): If True then returns indexes of points from input data instead of points itself.
337 
338  @return (list) List of initialized initial centers.
339  If argument 'return_index' is False then returns list of points.
340  If argument 'return_index' is True then returns list of indexes.
341 
342  """
343 
344  return_index = kwargs.get('return_index', False)
345 
346  index_point = self.__get_initial_center(True)
347  centers = [index_point]
348  self.__free_indexes.remove(index_point)
349 
350  # For each next center
351  for _ in range(1, self.__amount):
352  index_point = self.__get_next_center(centers)
353  centers.append(index_point)
354  self.__free_indexes.remove(index_point)
355 
356  if not return_index:
357  centers = [self.__data[index] for index in centers]
358 
359  return centers
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
Definition: center_initializer.py:95
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__candidates
__candidates
Definition: center_initializer.py:187
pyclustering.cluster.center_initializer.random_center_initializer.__init__
def __init__(self, data, amount_centers, **kwargs)
Creates instance of random center initializer.
Definition: center_initializer.py:27
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__amount
__amount
Definition: center_initializer.py:183
pyclustering.cluster.center_initializer.random_center_initializer.__amount
__amount
Definition: center_initializer.py:41
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__get_initial_center
def __get_initial_center(self, return_index)
Choose randomly first center.
Definition: center_initializer.py:264
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__init__
def __init__(self, data, amount_centers, amount_candidates=None, **kwargs)
Creates K-Means++ center initializer instance.
Definition: center_initializer.py:164
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__check_parameters
def __check_parameters(self)
Checks input parameters of the algorithm and if something wrong then corresponding exception is throw...
Definition: center_initializer.py:198
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__data
__data
Definition: center_initializer.py:182
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__calculate_probabilities
def __calculate_probabilities(self, distances)
Calculates cumulative probabilities of being center of each point.
Definition: center_initializer.py:282
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__calculate_shortest_distances
def __calculate_shortest_distances(self, data, centers)
Calculates distance from each data point to nearest center.
Definition: center_initializer.py:216
pyclustering.cluster.center_initializer.random_center_initializer.__available_indexes
__available_indexes
Definition: center_initializer.py:42
pyclustering.cluster.center_initializer.random_center_initializer
Random center initializer is for generation specified amount of random of centers for specified data.
Definition: center_initializer.py:21
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__get_next_center
def __get_next_center(self, centers)
Calculates the next center for the data.
Definition: center_initializer.py:240
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__free_indexes
__free_indexes
Definition: center_initializer.py:184
pyclustering.cluster.center_initializer.random_center_initializer.__data
__data
Definition: center_initializer.py:40
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.initialize
def initialize(self, **kwargs)
Calculates initial centers using K-Means++ method.
Definition: center_initializer.py:329
pyclustering.cluster.center_initializer.kmeans_plusplus_initializer.__get_probable_center
def __get_probable_center(self, distances, probabilities)
Calculates the next probable center considering amount candidates.
Definition: center_initializer.py:300
pyclustering.cluster.center_initializer.random_center_initializer.initialize
def initialize(self, **kwargs)
Generates random centers in line with input parameters.
Definition: center_initializer.py:53
pyclustering.cluster.center_initializer.random_center_initializer.__create_center
def __create_center(self, return_index)
Generates and returns random center.
Definition: center_initializer.py:76