"""!

@brief The module contains G-Means algorithm and other related services.
@details Implementation based on paper @cite inproceedings::cluster::gmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import scipy.stats

from pyclustering.core.gmeans_wrapper import gmeans as gmeans_wrapper
from pyclustering.core.wrapper import ccore_library

from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils import distance_metric, type_metric
39 
class gmeans:
    """!
    @brief Class implements G-Means clustering algorithm.
    @details The G-means algorithm starts with a small number of centers, and grows the number of centers.
              Each iteration of the G-Means algorithm splits into two those centers whose data appear not to come from a
              Gaussian distribution. G-means repeatedly makes decisions based on a statistical test for the data
              assigned to each center.

    Implementation based on the paper @cite inproceedings::cluster::gmeans::1.

    @image html gmeans_example_clustering.png "G-Means clustering results on most common data-sets."

    Example #1. In this example, G-Means starts analysis from single cluster.
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.gmeans import gmeans
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read sample 'Lsun' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create instance of G-Means algorithm. By default the algorithm starts search from a single cluster.
        gmeans_instance = gmeans(sample).process()

        # Extract clustering results: clusters and their centers
        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()

        # Print total sum of metric errors
        print("Total WCE:", gmeans_instance.get_total_wce())

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    Example #2. Sometimes G-Means might find local optimum. `repeat` value can be used to increase probability to
    find global optimum. Argument `repeat` defines how many times K-Means clustering with K-Means++
    initialization should be run in order to find optimal clusters.
    @code
        # Read sample 'Tetra' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_TETRA)

        # Create instance of G-Means algorithm. By default algorithm start search from single cluster.
        gmeans_instance = gmeans(sample, repeat=10).process()

        # Extract clustering results: clusters and their centers
        clusters = gmeans_instance.get_clusters()

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    In case of requirement to have labels instead of default representation of clustering results
    `CLUSTER_INDEX_LIST_SEPARATION`:
    @code
        from pyclustering.cluster.gmeans import gmeans
        from pyclustering.cluster.encoder import type_encoding, cluster_encoder
        from pyclustering.samples.definitions import SIMPLE_SAMPLES
        from pyclustering.utils import read_sample

        data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

        gmeans_instance = gmeans(data).process()
        clusters = gmeans_instance.get_clusters()

        # Change cluster representation from default to labeling.
        encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, clusters, data)
        encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
        labels = encoder.get_clusters()

        print(labels)   # Display labels
    @endcode

    There is an output of the code above:
    @code
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    @endcode

    """

    def __init__(self, data, k_init=1, ccore=True, **kwargs):
        """!
        @brief Initializes G-Means algorithm.

        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be
                    represented by array_like data structure.
        @param[in] k_init (uint): Initial amount of centers (by default started search from 1).
        @param[in] ccore (bool): Defines whether CCORE library (C/C++ part of the library) should be used instead of
                    Python code.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `tolerance`, `repeat`, `k_max`, `random_state`).

        <b>Keyword Args:</b><br>
            - tolerance (double): Stop condition for each K-Means iteration: if maximum value of
               change of centers of clusters is less than tolerance then algorithm will stop processing.
            - repeat (uint): How many times K-Means should be run to improve parameters (by default is 3).
               With larger 'repeat' values suggesting higher probability of finding global optimum.
            - k_max (uint): Maximum amount of cluster that might be allocated. The argument is considered as a stop
               condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
               amount of clusters is not restricted (`k_max` is -1).
            - random_state (int): Seed for random state (by default is `None`, current system time is used).

        """
        self.__data = data
        self.__k_init = k_init

        self.__clusters = []
        self.__centers = []
        self.__total_wce = 0.0
        self.__ccore = ccore

        self.__tolerance = kwargs.get('tolerance', 0.001)
        self.__repeat = kwargs.get('repeat', 3)
        self.__k_max = kwargs.get('k_max', -1)
        self.__random_state = kwargs.get('random_state', None)

        if self.__ccore is True:
            # Fall back to the Python implementation when the C/C++ extension is not workable.
            self.__ccore = ccore_library.workable()

        self._verify_arguments()


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of G-Means algorithm.

        @return (gmeans) Returns itself (G-Means instance).

        @see get_clusters()
        @see get_centers()

        """
        if self.__ccore is True:
            return self._process_by_ccore()

        return self._process_by_python()


    def _process_by_ccore(self):
        """!
        @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).

        """
        self.__clusters, self.__centers, self.__total_wce = gmeans_wrapper(self.__data, self.__k_init, self.__tolerance, self.__repeat, self.__k_max, self.__random_state)
        return self


    def _process_by_python(self):
        """!
        @brief Performs cluster analysis using Python.

        """
        self.__clusters, self.__centers, _ = self._search_optimal_parameters(self.__data, self.__k_init)

        while self._run_condition():
            current_amount_clusters = len(self.__clusters)
            # Try to split each cluster whose data does not look Gaussian; this may grow `self.__centers`.
            self._statistical_optimization()

            if current_amount_clusters == len(self.__centers):   # amount of centers the same - no need to continue.
                break

            self._perform_clustering()

        return self


    def predict(self, points):
        """!
        @brief Calculates the closest cluster to each point.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
                 collection if 'process()' method was not called.

        """
        nppoints = numpy.array(points)
        if len(self.__clusters) == 0:
            return []

        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)

        differences = numpy.zeros((len(nppoints), len(self.__centers)))
        for index_point in range(len(nppoints)):
            differences[index_point] = metric(nppoints[index_point], self.__centers)

        return numpy.argmin(differences, axis=1)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @return (array_like) Allocated clusters.

        @see process()
        @see get_centers()

        """
        return self.__clusters


    def get_centers(self):
        """!
        @brief Returns list of centers of allocated clusters.

        @return (array_like) Allocated centers.

        @see process()
        @see get_clusters()

        """
        return self.__centers


    def get_total_wce(self):
        """!
        @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
        @details Sum of metric errors is calculated using distance between point and its center:
                 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]

        @see process()
        @see get_clusters()

        """

        return self.__total_wce


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def _statistical_optimization(self):
        """!
        @brief Try to split cluster into two to find optimal amount of clusters.

        """
        centers = []
        potential_amount_clusters = len(self.__clusters)
        for index in range(len(self.__clusters)):
            new_centers = self._split_and_search_optimal(self.__clusters[index])
            # Keep the original center when splitting is not beneficial or `k_max` is already reached.
            if (new_centers is None) or ((self.__k_max != -1) and (potential_amount_clusters >= self.__k_max)):
                centers.append(self.__centers[index])
            else:
                centers += new_centers
                potential_amount_clusters += 1

        self.__centers = centers


    def _split_and_search_optimal(self, cluster):
        """!
        @brief Split specified cluster into two by performing K-Means clustering and check correctness by
                Anderson-Darling test.

        @param[in] cluster (array_like) Cluster that should be analysed and optimized by splitting if it is required.

        @return (array_like) Two new centers if two new clusters are considered as more suitable.
                (None) If current cluster is more suitable.
        """
        if len(cluster) == 1:
            return None

        points = [self.__data[index_point] for index_point in cluster]
        new_clusters, new_centers, _ = self._search_optimal_parameters(points, 2)

        if len(new_centers) > 1:
            accept_null_hypothesis = self._is_null_hypothesis(points, new_centers)
            if not accept_null_hypothesis:
                return new_centers  # If null hypothesis is rejected then use two new clusters

        return None


    def _is_null_hypothesis(self, data, centers):
        """!
        @brief Returns whether H0 hypothesis is accepted using Anderson-Darling test statistic.

        @param[in] data (array_like): N-dimensional data for statistical test.
        @param[in] centers (array_like): Two new allocated centers.

        @return (bool) True is null hypothesis is acceptable.

        """
        v = numpy.subtract(centers[0], centers[1])
        points = self._project_data(data, v)

        estimation, critical, _ = scipy.stats.anderson(points, dist='norm')  # the Anderson-Darling test statistic

        # If the returned statistic is larger than these critical values then for the corresponding significance level,
        # the null hypothesis that the data come from the chosen distribution can be rejected.
        return estimation < critical[-1]  # False - not a gaussian distribution (reject H0)


    @staticmethod
    def _project_data(data, vector):
        """!
        @brief Transform input data by project it onto input vector using formula:

        \f[
        x_{i}^{*}=\frac{\left \langle x_{i}, v \right \rangle}{\left \| v \right \|^{2}}.
        \f]

        @param[in] data (array_like): Input data that is represented by points.
        @param[in] vector (array_like): Input vector that is used for projection.

        @return (array_like) Transformed 1-dimensional data.

        """
        square_norm = numpy.sum(numpy.multiply(vector, vector))
        return numpy.divide(numpy.sum(numpy.multiply(data, vector), axis=1), square_norm)


    def _search_optimal_parameters(self, data, amount):
        """!
        @brief Performs cluster analysis for specified data several times to find optimal clustering result in line
                with WCE.

        @param[in] data (array_like): Input data that should be clustered.
        @param[in] amount (uint): Amount of clusters that should be allocated.

        @return (tuple) Optimal clustering result: (clusters, centers, wce).

        """
        best_wce, best_clusters, best_centers = float('+inf'), [], []
        for _ in range(self.__repeat):
            initial_centers = kmeans_plusplus_initializer(data, amount, random_state=self.__random_state).initialize()
            solver = kmeans(data, initial_centers, tolerance=self.__tolerance, ccore=False).process()

            candidate_wce = solver.get_total_wce()
            if candidate_wce < best_wce:
                best_wce = candidate_wce
                best_clusters = solver.get_clusters()
                best_centers = solver.get_centers()

            if len(initial_centers) == 1:
                break   # No need to rerun clustering for one initial center.

        return best_clusters, best_centers, best_wce


    def _perform_clustering(self):
        """!
        @brief Performs cluster analysis of `self.__data` using K-Means algorithm with current centers as initial.
        @details Updates allocated clusters, centers and total WCE.

        """
        solver = kmeans(self.__data, self.__centers, tolerance=self.__tolerance, ccore=False).process()
        self.__clusters = solver.get_clusters()
        self.__centers = solver.get_centers()
        self.__total_wce = solver.get_total_wce()


    def _run_condition(self):
        """!
        @brief Defines whether the algorithm should continue processing or should stop.

        @return `True` if the algorithm should continue processing, otherwise returns `False`

        """
        if (self.__k_max > 0) and (len(self.__clusters) >= self.__k_max):
            return False

        return True


    def _verify_arguments(self):
        """!
        @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.

        """
        if len(self.__data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self.__data))

        if self.__k_init <= 0:
            raise ValueError("Initial amount of centers should be greater than 0 "
                             "(current value: '%d')." % self.__k_init)

        if self.__tolerance <= 0.0:
            raise ValueError("Tolerance should be greater than 0 (current value: '%f')." % self.__tolerance)

        if self.__repeat <= 0:
            raise ValueError("Amount of attempt to find optimal parameters should be greater than 0 "
                             "(current value: '%d')." % self.__repeat)

        if (self.__k_max != -1) and (self.__k_max <= 0):
            raise ValueError("Maximum amount of cluster that might be allocated should be greater than 0 or -1 if "
                             "the algorithm should be restricted in searching optimal number of clusters.")

        if (self.__k_max != -1) and (self.__k_max < self.__k_init):
            raise ValueError("Initial amount of clusters should be less than the maximum amount 'k_max'.")