@brief The module contains the G-Means algorithm and other related services.
@details Implementation is based on the paper @cite inproceedings::cluster::gmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@copyright BSD-3-Clause
from pyclustering.core.gmeans_wrapper import gmeans as gmeans_wrapper
from pyclustering.core.wrapper import ccore_library
@brief Class implements the G-Means clustering algorithm.
@details The G-Means algorithm starts with a small number of centers and grows the number of centers.
Each iteration of the G-Means algorithm splits into two those centers whose data appear not to come from a
Gaussian distribution. G-Means repeatedly makes decisions based on a statistical test for the data
assigned to each center.

Implementation is based on the paper @cite inproceedings::cluster::gmeans::1.

@image html gmeans_example_clustering.png "G-Means clustering results on most common data-sets."

Example #1. In this example, G-Means starts the analysis from a single cluster.
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.gmeans import gmeans
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

# Read sample 'Lsun' from file.
sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

# Create instance of G-Means algorithm. By default the algorithm starts the search from a single cluster.
gmeans_instance = gmeans(sample).process()

# Extract clustering results: clusters and their centers.
clusters = gmeans_instance.get_clusters()
centers = gmeans_instance.get_centers()

# Print the total sum of metric errors.
print("Total WCE:", gmeans_instance.get_total_wce())

# Visualize clustering results.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.show()
Example #2. Sometimes G-Means may converge to a local optimum. The `repeat` value can be used to increase
the probability of finding the global optimum. The `repeat` argument defines how many times K-Means
clustering with K-Means++ initialization should be run in order to find the optimal clusters.
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.gmeans import gmeans
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

# Read sample 'Tetra' from file.
sample = read_sample(FCPS_SAMPLES.SAMPLE_TETRA)

# Create instance of G-Means algorithm; `repeat` reruns K-Means to reduce the chance of a local optimum.
gmeans_instance = gmeans(sample, repeat=10).process()

# Extract clustering results: clusters and their centers.
clusters = gmeans_instance.get_clusters()

# Visualize clustering results.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.show()
Example #3. If labels are required instead of the default clustering result representation `CLUSTER_INDEX_LIST_SEPARATION`:
from pyclustering.cluster.gmeans import gmeans
from pyclustering.cluster.encoder import type_encoding, cluster_encoder
from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.utils import read_sample

data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

gmeans_instance = gmeans(data).process()
clusters = gmeans_instance.get_clusters()

# Change cluster representation from default to labeling.
encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, clusters, data)
encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
labels = encoder.get_clusters()

print(labels)    # Display labels.
The output of the code above:

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
def __init__(self, data, k_init=1, ccore=True, **kwargs):
@brief Initializes the G-Means algorithm.

@param[in] data (array_like): Input data that is presented as an array of points (objects); each point should be
            represented by an array_like data structure.
@param[in] k_init (uint): Initial amount of centers (by default the search starts from 1).
@param[in] ccore (bool): Defines whether the CCORE library (C/C++ part of the library) should be used instead of
            the Python implementation.
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: `tolerance`, `repeat`, `k_max`, `random_state`).
<b>Keyword Args:</b><br>
    - tolerance (double): Stop condition for each K-Means iteration: if the maximum value of
       change of the cluster centers is less than the tolerance, then the algorithm stops processing.
    - repeat (uint): How many times K-Means should be run to improve parameters (by default 3).
       Larger `repeat` values increase the probability of finding the global optimum.
    - k_max (uint): Maximum amount of clusters that might be allocated. The argument is considered a stop
       condition: when the maximum amount is reached, the algorithm stops processing. By default the maximum
       amount of clusters is not restricted (`k_max` is -1).
    - random_state (int): Seed for random state (by default `None`, the current system time is used).
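For instance, a hypothetical construction that sets several of these keyword arguments (the values are illustrative only):

    gmeans_instance = gmeans(sample, k_init=1, repeat=5, k_max=10, random_state=1000)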
self.__repeat = kwargs.get('repeat', 3)
self.__k_max = kwargs.get('k_max', -1)

self.__ccore = ccore_library.workable()
@brief Performs cluster analysis in line with the rules of the G-Means algorithm.

@return (gmeans) Returns itself (G-Means instance).
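Because `process()` returns the instance itself, calls can be chained, for example:

    clusters = gmeans(sample).process().get_clusters()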
def _process_by_ccore(self):
    @brief Performs cluster analysis using CCORE (C/C++ part of the pyclustering library).
def _process_by_python(self):
    @brief Performs cluster analysis using Python.
current_amount_clusters = len(self.__clusters)

if current_amount_clusters == len(self.__centers):
@brief Calculates the closest cluster for each point.

@param[in] points (array_like): Points for which the closest clusters are calculated.

@return (list) List of closest clusters for each point. Each cluster is denoted by an index. Returns an empty
        collection if the 'process()' method was not called.
nppoints = numpy.array(points)

metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)

differences = numpy.zeros((len(nppoints), len(npcenters)))
for index_point in range(len(nppoints)):
    differences[index_point] = metric(nppoints[index_point], npcenters)

return numpy.argmin(differences, axis=1)
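A short usage sketch of `predict` (the query points below are illustrative; `process()` must be called first):

    gmeans_instance = gmeans(sample).process()
    closest_clusters = gmeans_instance.predict([[0.25, 0.2], [2.5, 4.0]])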
@brief Returns list of allocated clusters; each cluster contains indexes of objects from the input data.

@return (array_like) Allocated clusters.
@brief Returns list of centers of allocated clusters.

@return (array_like) Allocated centers.
@brief Returns the sum of metric errors that depends on the metric that was used for clustering (by default SSE - Sum of Squared Errors).
@details The sum of metric errors is calculated using the distance between a point and its center:
\f[error=\sum_{i=0}^{N}distance(x_{i}, center(x_{i}))\f]
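A minimal sketch of the same sum computed manually with numpy (assuming `clusters`, `centers` and `sample` from the examples above; squared Euclidean distance is the default metric):

    import numpy
    wce = sum(numpy.sum((numpy.array(sample)[cluster] - center) ** 2)
              for cluster, center in zip(clusters, centers))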
@brief Returns clustering result representation type that indicates how clusters are encoded.

@return (type_encoding) Clustering result representation.

return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
def _statistical_optimization(self):
    @brief Tries to split each cluster into two in order to find the optimal amount of clusters.
potential_amount_clusters = len(self.__clusters)

if (new_centers is None) or ((self.__k_max != -1) and (potential_amount_clusters >= self.__k_max)):

centers += new_centers
potential_amount_clusters += 1
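A condensed free-standing sketch of this growth step (the `split` callable stands in for `_split_and_search_optimal`; this is an illustration, not the verbatim implementation):

    def grow_centers(clusters, centers, split, k_max=-1):
        # Try to split every cluster; stop growing once `k_max` is reached.
        grown, potential_amount_clusters = [], len(clusters)
        for cluster, center in zip(clusters, centers):
            new_centers = split(cluster)    # two new centers, or None when the split is rejected
            if (new_centers is None) or ((k_max != -1) and (potential_amount_clusters >= k_max)):
                grown.append(center)        # keep the original center
            else:
                grown += new_centers        # replace one center with two
                potential_amount_clusters += 1
        return grown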
def _split_and_search_optimal(self, cluster):
    @brief Splits the specified cluster into two by performing K-Means clustering and checks correctness using
    the Anderson-Darling test.

    @param[in] cluster (array_like): Cluster that should be analysed and optimized by splitting if required.

    @return (array_like) Two new centers if the two new clusters are considered more suitable.
    (None) If the current cluster is more suitable.
if len(cluster) == 1:

points = [self.__data[index_point] for index_point in cluster]

if len(new_centers) > 1:

if not accept_null_hypothesis:
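Putting the pieces together, a hedged sketch of the whole split decision (a hypothetical free-standing form; `accepts_gaussian` is the test helper sketched after `_is_null_hypothesis` below):

    from pyclustering.cluster.kmeans import kmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

    def try_split(points, random_state=None):
        # Split the points into two clusters with K-Means (K-Means++ initialization)
        # and keep the split only when the sample fails the normality test.
        initial_centers = kmeans_plusplus_initializer(points, 2, random_state=random_state).initialize()
        solver = kmeans(points, initial_centers).process()
        new_centers = solver.get_centers()
        if len(new_centers) > 1 and not accepts_gaussian(points, new_centers[0], new_centers[1]):
            return new_centers
        return None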
def _is_null_hypothesis(self, data, centers):
    @brief Returns whether the H0 hypothesis is accepted using the Anderson-Darling test statistic.

    @param[in] data (array_like): N-dimensional data for the statistical test.
    @param[in] centers (array_like): Two new allocated centers.

    @return (bool) True if the null hypothesis is accepted.
v = numpy.subtract(centers[0], centers[1])

estimation, critical, _ = scipy.stats.anderson(points, dist='norm')

return estimation < critical[-1]
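The same test as a compact standalone sketch (illustrative; it mirrors the projection and the comparison above using scipy directly):

    import numpy
    import scipy.stats

    def accepts_gaussian(points, center_one, center_two):
        # Project the points onto the vector connecting the two candidate centers.
        v = numpy.subtract(center_one, center_two)
        projected = numpy.divide(numpy.sum(numpy.multiply(points, v), axis=1),
                                 numpy.sum(numpy.multiply(v, v)))
        # H0 (the projected sample is Gaussian) is accepted when the statistic
        # stays below the strictest critical value returned by the test.
        estimation, critical, _ = scipy.stats.anderson(projected, dist='norm')
        return estimation < critical[-1]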
def _project_data(data, vector):
    @brief Transforms input data by projecting it onto the input vector using the formula:

    \f[x_{i}^{*}=\frac{\left \langle x_{i}, v \right \rangle}{\left \| v \right \|^{2}}.\f]

    @param[in] data (array_like): Input data that is represented by points.
    @param[in] vector (array_like): Input vector that is used for projection.

    @return (array_like) Transformed 1-dimensional data.
square_norm = numpy.sum(numpy.multiply(vector, vector))
return numpy.divide(numpy.sum(numpy.multiply(data, vector), axis=1), square_norm)
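For instance (illustrative values, assuming `_project_data` is a static helper as its signature suggests):

    projected = gmeans._project_data([[1.0, 0.0], [0.0, 1.0]], [2.0, 0.0])
    # -> array([0.5, 0.0]): <x, v> / ||v||^2 for each point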
def _search_optimal_parameters(self, data, amount):
    @brief Performs cluster analysis for the specified data several times to find the optimal clustering result
    in line with the `repeat` argument.

    @param[in] data (array_like): Input data that should be clustered.
    @param[in] amount (uint): Amount of clusters that should be allocated.

    @return (tuple) Optimal clustering result: (clusters, centers, wce).
best_wce, best_clusters, best_centers = float('+inf'), [], []
candidate_wce = solver.get_total_wce()
if candidate_wce < best_wce:
    best_wce = candidate_wce
    best_clusters = solver.get_clusters()
    best_centers = solver.get_centers()

if len(initial_centers) == 1:

return best_clusters, best_centers, best_wce
def _perform_clustering(self):
    @brief Performs cluster analysis using the K-Means algorithm with the current centers as initial centers.
def _run_condition(self):
    @brief Defines whether the algorithm should continue processing or should stop.

    @return `True` if the algorithm should continue processing, otherwise `False`.
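A plausible one-line form of this condition under the `k_max` semantics documented above (a sketch, not necessarily the verbatim code):

    def run_condition(clusters, k_max):
        # Continue while `k_max` is unrestricted (-1) or has not been reached yet.
        return (k_max == -1) or (len(clusters) < k_max)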
def _verify_arguments(self):
    @brief Verifies input parameters for the algorithm and throws an exception in case of incorrectness.

raise ValueError("Input data is empty (size: '%d')." % len(self.__data))

raise ValueError("Initial amount of centers should be greater than 0 "
                 "(current value: '%d')." % self.__k_init)

raise ValueError("Tolerance should be greater than 0 (current value: '%f')." % self.__tolerance)

raise ValueError("Amount of attempts to find optimal parameters should be greater than 0 "
                 "(current value: '%d')." % self.__repeat)

raise ValueError("Maximum amount of clusters that might be allocated should be greater than 0, or -1 if "
                 "the amount of clusters should not be restricted.")

raise ValueError("Initial amount of clusters should be less than the maximum amount 'k_max'.")