gmeans.py
1 """!
2 
3 @brief The module contains G-Means algorithm and other related services.
4 @details Implementation based on paper @cite inproceedings::cluster::gmeans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import scipy.stats
30 
31 from pyclustering.core.gmeans_wrapper import gmeans as gmeans_wrapper
32 from pyclustering.core.wrapper import ccore_library
33 
34 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
35 from pyclustering.cluster.kmeans import kmeans
36 from pyclustering.utils import distance_metric, type_metric
37 
38 
class gmeans:
    """!
    @brief Class implements G-Means clustering algorithm.
    @details The G-means algorithm starts with a small number of centers, and grows the number of centers.
              Each iteration of the G-Means algorithm splits into two those centers whose data appear not to come from a
              Gaussian distribution. G-means repeatedly makes decisions based on a statistical test for the data
              assigned to each center.

    Implementation based on the paper @cite inproceedings::cluster::gmeans::1.

    @image html gmeans_example_clustering.png "G-Means clustering results on most common data-sets."

    Example #1. In this example, G-Means starts analysis from a single cluster.
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.gmeans import gmeans
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read sample 'Lsun' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create instance of G-Means algorithm. By default the algorithm starts search from a single cluster.
        gmeans_instance = gmeans(sample).process()

        # Extract clustering results: clusters and their centers
        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()

        # Print total sum of metric errors
        print("Total WCE:", gmeans_instance.get_total_wce())

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    Example #2. Sometimes G-Means may find a local optimum. The 'repeat' value can be used to increase the probability
    of finding the global optimum. Argument 'repeat' defines how many times K-Means clustering with K-Means++
    initialization should be run to find optimal clusters.
    @code
        # Read sample 'Tetra' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_TETRA)

        # Create instance of G-Means algorithm. By default the algorithm starts search from a single cluster.
        gmeans_instance = gmeans(sample, repeat=10).process()

        # Extract clustering results: clusters and their centers
        clusters = gmeans_instance.get_clusters()

        # Visualize clustering results
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    """
97 
    def __init__(self, data, k_init=1, ccore=True, **kwargs):
        """!
        @brief Initializes G-Means algorithm.

        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be
                    represented by array_like data structure.
        @param[in] k_init (uint): Initial amount of centers (by default search is started from 1).
        @param[in] ccore (bool): Defines whether CCORE library (C/C++ part of the library) should be used instead of
                    Python code.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'tolerance', 'repeat').

        <b>Keyword Args:</b><br>
            - tolerance (double): Stop condition for each K-Means iteration: if maximum value of
               change of centers of clusters is less than tolerance then the algorithm stops processing.
            - repeat (uint): How many times K-Means should be run to improve parameters (by default is 3).
               Larger 'repeat' values increase the probability of finding the global optimum.

        """
        self.__data = data
        self.__k_init = k_init

        self.__clusters = []
        self.__centers = []
        self.__total_wce = 0.0
        self.__ccore = ccore

        self.__tolerance = kwargs.get('tolerance', 0.025)
        self.__repeat = kwargs.get('repeat', 3)

        if self.__ccore is True:
            # Fall back to the Python implementation when the native library is not workable on this platform.
            self.__ccore = ccore_library.workable()

        self._verify_arguments()
132 
133  def process(self):
134  """!
135  @brief Performs cluster analysis in line with rules of G-Means algorithm.
136 
137  @return (gmeans) Returns itself (G-Means instance).
138 
139  @see get_clusters()
140  @see get_centers()
141 
142  """
143  if self.__ccore is True:
144  return self._process_by_ccore()
145 
146  return self._process_by_python()
147 
148 
    def _process_by_ccore(self):
        """!
        @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).

        @return (gmeans) Returns itself (G-Means instance).

        """
        # The native implementation performs the whole G-Means analysis and returns the final results.
        self.__clusters, self.__centers, self.__total_wce = gmeans_wrapper(self.__data, self.__k_init, self.__tolerance, self.__repeat)
        return self
156 
157 
158  def _process_by_python(self):
159  """!
160  @brief Performs cluster analysis using Python.
161 
162  """
163  self.__clusters, self.__centers, _ = self._search_optimal_parameters(self.__data, self.__k_init)
164  while True:
165  current_amount_clusters = len(self.__clusters)
167 
168  if current_amount_clusters == len(self.__centers):
169  break
170 
171  self._perform_clustering()
172 
173  return self
174 
175 
176  def predict(self, points):
177  """!
178  @brief Calculates the closest cluster to each point.
179 
180  @param[in] points (array_like): Points for which closest clusters are calculated.
181 
182  @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
183  collection if 'process()' method was not called.
184 
185  """
186  nppoints = numpy.array(points)
187  if len(self.__clusters) == 0:
188  return []
189 
190  metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
191 
192  differences = numpy.zeros((len(nppoints), len(self.__centers)))
193  for index_point in range(len(nppoints)):
194  differences[index_point] = metric(nppoints[index_point], self.__centers)
195 
196  return numpy.argmin(differences, axis=1)
197 
198 
199  def get_clusters(self):
200  """!
201  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
202 
203  @return (array_like) Allocated clusters.
204 
205  @see process()
206  @see get_centers()
207 
208  """
209  return self.__clusters
210 
211 
212  def get_centers(self):
213  """!
214  @brief Returns list of centers of allocated clusters.
215 
216  @return (array_like) Allocated centers.
217 
218  @see process()
219  @see get_clusters()
220 
221  """
222  return self.__centers
223 
224 
225  def get_total_wce(self):
226  """!
227  @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
228  @details Sum of metric errors is calculated using distance between point and its center:
229  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]
230 
231  @see process()
232  @see get_clusters()
233 
234  """
235 
236  return self.__total_wce
237 
238 
239  def _statistical_optimization(self):
240  """!
241  @brief Try to split cluster into two to find optimal amount of clusters.
242 
243  """
244  centers = []
245  for index in range(len(self.__clusters)):
246  new_centers = self._split_and_search_optimal(self.__clusters[index])
247  if new_centers is None:
248  centers.append(self.__centers[index])
249  else:
250  centers += new_centers
251 
252  self.__centers = centers
253 
254 
255  def _split_and_search_optimal(self, cluster):
256  """!
257  @brief Split specified cluster into two by performing K-Means clustering and check correctness by
258  Anderson-Darling test.
259 
260  @param[in] cluster (array_like) Cluster that should be analysed and optimized by splitting if it is required.
261 
262  @return (array_like) Two new centers if two new clusters are considered as more suitable.
263  (None) If current cluster is more suitable.
264  """
265  if len(cluster) == 1:
266  return None
267 
268  points = [self.__data[index_point] for index_point in cluster]
269  new_clusters, new_centers, _ = self._search_optimal_parameters(points, 2)
270 
271  if len(new_centers) > 1:
272  accept_null_hypothesis = self._is_null_hypothesis(points, new_centers)
273  if not accept_null_hypothesis:
274  return new_centers # If null hypothesis is rejected then use two new clusters
275 
276  return None
277 
278 
279  def _is_null_hypothesis(self, data, centers):
280  """!
281  @brief Returns whether H0 hypothesis is accepted using Anderson-Darling test statistic.
282 
283  @param[in] data (array_like): N-dimensional data for statistical test.
284  @param[in] centers (array_like): Two new allocated centers.
285 
286  @return (bool) True is null hypothesis is acceptable.
287 
288  """
289  v = numpy.subtract(centers[0], centers[1])
290  points = self._project_data(data, v)
291 
292  estimation, critical, _ = scipy.stats.anderson(points, dist='norm') # the Anderson-Darling test statistic
293 
294  # If the returned statistic is larger than these critical values then for the corresponding significance level,
295  # the null hypothesis that the data come from the chosen distribution can be rejected.
296  return estimation < critical[-1] # False - not a gaussian distribution (reject H0)
297 
298 
299  @staticmethod
300  def _project_data(data, vector):
301  """!
302  @brief Transform input data by project it onto input vector using formula:
303 
304  \f[
305  x_{i}^{*}=\frac{\left \langle x_{i}, v \right \rangle}{\left \| v \right \|^{2}}.
306  \f]
307 
308  @param[in] data (array_like): Input data that is represented by points.
309  @param[in] vector (array_like): Input vector that is used for projection.
310 
311  @return (array_like) Transformed 1-dimensional data.
312 
313  """
314  square_norm = numpy.sum(numpy.multiply(vector, vector))
315  return numpy.divide(numpy.sum(numpy.multiply(data, vector), axis=1), square_norm)
316 
317 
318  def _search_optimal_parameters(self, data, amount):
319  """!
320  @brief Performs cluster analysis for specified data several times to find optimal clustering result in line
321  with WCE.
322 
323  @param[in] data (array_like): Input data that should be clustered.
324  @param[in] amount (unit): Amount of clusters that should be allocated.
325 
326  @return (tuple) Optimal clustering result: (clusters, centers, wce).
327 
328  """
329  best_wce, best_clusters, best_centers = float('+inf'), [], []
330  for _ in range(self.__repeat):
331  initial_centers = kmeans_plusplus_initializer(data, amount).initialize()
332  solver = kmeans(data, initial_centers, tolerance=self.__tolerance, ccore=False).process()
333 
334  candidate_wce = solver.get_total_wce()
335  if candidate_wce < best_wce:
336  best_wce = candidate_wce
337  best_clusters = solver.get_clusters()
338  best_centers = solver.get_centers()
339 
340  if len(initial_centers) == 1:
341  break # No need to rerun clustering for one initial center.
342 
343  return best_clusters, best_centers, best_wce
344 
345 
    def _perform_clustering(self):
        """!
        @brief Performs cluster analysis using K-Means algorithm using current centers as initial.
        @details Updates allocated clusters ('__clusters'), their centers ('__centers') and total WCE
                  ('__total_wce') of the clustering result.

        """
        solver = kmeans(self.__data, self.__centers, tolerance=self.__tolerance, ccore=False).process()
        self.__clusters = solver.get_clusters()
        self.__centers = solver.get_centers()
        self.__total_wce = solver.get_total_wce()
357 
358 
359  def _verify_arguments(self):
360  """!
361  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
362 
363  """
364  if len(self.__data) == 0:
365  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
366 
367  if self.__k_init <= 0:
368  raise ValueError("Initial amount of centers should be greater than 0 "
369  "(current value: '%d')." % self.__k_init)
370 
371  if self.__tolerance <= 0.0:
372  raise ValueError("Tolerance should be greater than 0 (current value: '%f')." % self.__tolerance)
373 
374  if self.__repeat <= 0:
375  raise ValueError("Amount of attempt to find optimal parameters should be greater than 0 "
376  "(current value: '%d')." % self.__repeat)
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: gmeans.py:199
def process(self)
Performs cluster analysis in line with rules of G-Means algorithm.
Definition: gmeans.py:133
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: gmeans.py:212
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def _search_optimal_parameters(self, data, amount)
Performs cluster analysis for specified data several times to find optimal clustering result in line ...
Definition: gmeans.py:318
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def __init__(self, data, k_init=1, ccore=True, kwargs)
Initializes G-Means algorithm.
Definition: gmeans.py:98
def _verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: gmeans.py:359
def _statistical_optimization(self)
Try to split cluster into two to find optimal amount of clusters.
Definition: gmeans.py:239
Class implements K-Means clustering algorithm.
Definition: kmeans.py:272
def _split_and_search_optimal(self, cluster)
Split specified cluster into two by performing K-Means clustering and check correctness by Anderson-D...
Definition: gmeans.py:255
def _process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: gmeans.py:149
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: gmeans.py:225
def _process_by_python(self)
Performs cluster analysis using Python.
Definition: gmeans.py:158
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def _project_data(data, vector)
Transform input data by project it onto input vector using formula:
Definition: gmeans.py:300
Class implements G-Means clustering algorithm.
Definition: gmeans.py:39
def _perform_clustering(self)
Performs cluster analysis using K-Means algorithm using current centers are initial.
Definition: gmeans.py:346
def _is_null_hypothesis(self, data, centers)
Returns whether H0 hypothesis is accepted using Anderson-Darling test statistic.
Definition: gmeans.py:279
def predict(self, points)
Calculates the closest cluster to each point.
Definition: gmeans.py:176