"""!

@brief The module contains G-Means algorithm and other related services.
@details Implementation based on paper @cite inproceedings::cluster::gmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import scipy.stats

from pyclustering.core.gmeans_wrapper import gmeans as gmeans_wrapper
from pyclustering.core.wrapper import ccore_library

from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils import distance_metric, type_metric
class gmeans:
    """!
    @brief Class implements G-Means clustering algorithm.
    @details The G-Means algorithm starts with a small number of centers and grows the number of centers.
              Each iteration of the algorithm splits into two those centers whose data appear not to come from a
              Gaussian distribution. G-Means repeatedly makes decisions based on a statistical test for the data
              assigned to each center.

    Implementation based on the paper @cite inproceedings::cluster::gmeans::1.

    @image html gmeans_example_clustering.png "G-Means clustering results on most common data-sets."

    Example #1. In this example, G-Means starts analysis from a single cluster.
    @code
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.cluster.gmeans import gmeans
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read sample 'Lsun' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create instance of G-Means algorithm. By default the algorithm starts search from a single cluster.
        gmeans_instance = gmeans(sample).process()

        # Extract clustering results: clusters and their centers.
        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()

        # Print total sum of metric errors.
        print("Total WCE:", gmeans_instance.get_total_wce())

        # Visualize clustering results.
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    Example #2. Sometimes G-Means may converge to a local optimum. The `repeat` value can be used to increase
    the probability of finding the global optimum. The argument `repeat` defines how many times K-Means
    clustering with K-Means++ initialization should be run in order to find the optimal clusters.
    @code
        # Read sample 'Tetra' from file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_TETRA)

        # Create instance of G-Means algorithm. By default the algorithm starts search from a single cluster.
        gmeans_instance = gmeans(sample, repeat=10).process()

        # Extract clustering results: clusters and their centers.
        clusters = gmeans_instance.get_clusters()

        # Visualize clustering results.
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    If labels are required instead of the default clustering result representation
    `CLUSTER_INDEX_LIST_SEPARATION`, the cluster encoder can be used:
    @code
        from pyclustering.cluster.gmeans import gmeans
        from pyclustering.cluster.encoder import type_encoding, cluster_encoder
        from pyclustering.samples.definitions import SIMPLE_SAMPLES
        from pyclustering.utils import read_sample

        data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

        gmeans_instance = gmeans(data).process()
        clusters = gmeans_instance.get_clusters()

        # Change cluster representation from default to labeling.
        encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, clusters, data)
        encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
        labels = encoder.get_clusters()

        print(labels)    # Display labels.
    @endcode

    The output of the code above:
    @code
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    @endcode

    """

    def __init__(self, data, k_init=1, ccore=True, **kwargs):
        """!
        @brief Initializes G-Means algorithm.

        @param[in] data (array_like): Input data that is presented as an array of points (objects); each point
                    should be represented by an array_like data structure.
        @param[in] k_init (uint): Initial amount of centers (by default, the search starts from 1).
        @param[in] ccore (bool): Defines whether the CCORE library (C/C++ part of the library) should be used
                    instead of the Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `tolerance`, `repeat`, `k_max`, `random_state`).

        <b>Keyword Args:</b><br>
            - tolerance (double): Stop condition for each K-Means iteration: if the maximum change of the cluster
               centers is less than the tolerance, then the algorithm stops processing.
            - repeat (uint): How many times K-Means should be run to improve parameters (by default 3). Larger
               `repeat` values increase the probability of finding the global optimum (see the usage sketch below).
            - k_max (uint): Maximum amount of clusters that might be allocated. The argument is considered as a stop
               condition: when the maximum amount is reached, the algorithm stops processing. By default the maximum
               amount of clusters is not restricted (`k_max` is -1).
            - random_state (int): Seed for random state (by default `None`; the current system time is used).

        """
        self.__repeat = kwargs.get('repeat', 3)
        self.__k_max = kwargs.get('k_max', -1)
        self.__ccore = ccore_library.workable()
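
    # A minimal usage sketch of the keyword arguments documented above; `my_data` is a placeholder for
    # any array_like input, and the argument values are purely illustrative:
    #
    #     from pyclustering.cluster.gmeans import gmeans
    #
    #     my_data = [[1.0, 1.2], [1.1, 0.9], [4.8, 5.1], [5.2, 4.9]]
    #     instance = gmeans(my_data, k_init=1, tolerance=0.001, repeat=5, k_max=10, random_state=1234)
    #     instance.process()
    #     print(instance.get_clusters(), instance.get_centers())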
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of G-Means algorithm.

        @return (gmeans) Returns itself (G-Means instance).

        """

    def _process_by_ccore(self):
        """!
        @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).

        """

    def _process_by_python(self):
        """!
        @brief Performs cluster analysis using Python.

        """
        current_amount_clusters = len(self.__clusters)
        if current_amount_clusters == len(self.__centers):
    def predict(self, points):
        """!
        @brief Calculates the closest cluster to each point.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (list) List of closest clusters for each point. Each cluster is denoted by its index. Returns an
                 empty collection if the 'process()' method was not called.

        """
        nppoints = numpy.array(points)
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
        differences = numpy.zeros((len(nppoints), len(self.__centers)))
        for index_point in range(len(nppoints)):
            differences[index_point] = metric(nppoints[index_point], self.__centers)
        return numpy.argmin(differences, axis=1)
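
    # A minimal sketch of using predict() after processing; `sample` and `new_points` are placeholders for
    # array_like collections of points with the same dimension as the clustered data:
    #
    #     gmeans_instance = gmeans(sample).process()
    #     new_points = [[0.25, 0.45], [2.5, 0.9]]
    #     closest_clusters = gmeans_instance.predict(new_points)    # index of the closest cluster per point
    #     print(closest_clusters)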
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters; each cluster contains indexes of objects from the input data.

        @return (array_like) Allocated clusters.

        """

    def get_centers(self):
        """!
        @brief Returns list of centers of allocated clusters.

        @return (array_like) Allocated centers.

        """

    def get_total_wce(self):
        """!
        @brief Returns the sum of metric errors that depends on the metric that was used for clustering
                (by default SSE - Sum of Squared Errors).
        @details The sum of metric errors is calculated using the distance between each point and the center of
                  its cluster:

        \f[error=\sum_{i=0}^{N}distance(x_{i}, center(x_{i}))\f]

        @return (double) Sum of metric errors.

        """

    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicates how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
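
    # A small sketch reproducing the WCE formula above with numpy for an already computed result, assuming
    # `data`, `clusters` and `centers` hold the input points and the get_clusters()/get_centers() values:
    #
    #     import numpy
    #     wce = sum(numpy.sum(numpy.sum((numpy.array(data)[cluster] - numpy.array(center)) ** 2, axis=1))
    #               for cluster, center in zip(clusters, centers))
    #     # With the default EUCLIDEAN_SQUARE metric this matches get_total_wce().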
    def _statistical_optimization(self):
        """!
        @brief Tries to split each cluster into two in order to find the optimal amount of clusters.

        """
        potential_amount_clusters = len(self.__clusters)
            if (new_centers is None) or ((self.__k_max != -1) and (potential_amount_clusters >= self.__k_max)):
                centers += new_centers
                potential_amount_clusters += 1
    def _split_and_search_optimal(self, cluster):
        """!
        @brief Splits the specified cluster into two by performing K-Means clustering and checks the split for
                correctness by the Anderson-Darling test.

        @param[in] cluster (array_like): Cluster that should be analysed and optimized by splitting if it is required.

        @return (array_like) Two new centers if the two new clusters are considered more suitable.
                (None) If the current cluster is more suitable.

        """
        if len(cluster) == 1:
        points = [self.__data[index_point] for index_point in cluster]
        if len(new_centers) > 1:

            if not accept_null_hypothesis:
    def _is_null_hypothesis(self, data, centers):
        """!
        @brief Returns whether the H0 hypothesis is accepted using the Anderson-Darling test statistic.

        @param[in] data (array_like): N-dimensional data for the statistical test.
        @param[in] centers (array_like): Two new allocated centers.

        @return (bool) True if the null hypothesis is acceptable.

        """
        v = numpy.subtract(centers[0], centers[1])
        estimation, critical, _ = scipy.stats.anderson(points, dist='norm')
        # Accept H0 when the statistic is below the critical value for the smallest significance level
        # (1% for the normal distribution in scipy.stats.anderson).
        return estimation < critical[-1]
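
    # A standalone sketch of the statistical decision above (the names `blob_a`/`blob_b` are illustrative):
    # one-dimensional samples from a single Gaussian blob should pass the Anderson-Darling test, while the
    # union of two well-separated blobs should fail it, which is what triggers a cluster split.
    #
    #     import numpy
    #     import scipy.stats
    #
    #     rng = numpy.random.default_rng(5)
    #     blob_a = rng.normal(0.0, 1.0, 500)
    #     blob_b = rng.normal(8.0, 1.0, 500)
    #
    #     for sample in (blob_a, numpy.concatenate([blob_a, blob_b])):
    #         statistic, critical, _ = scipy.stats.anderson(sample, dist='norm')
    #         print(statistic < critical[-1])    # expected: True for one blob, False for two blobs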
    @staticmethod
    def _project_data(data, vector):
        """!
        @brief Transforms input data by projecting it onto the input vector using the formula:

        \f[
        x_{i}^{*}=\frac{\left \langle x_{i}, v \right \rangle}{\left \| v \right \|^{2}}
        \f]

        @param[in] data (array_like): Input data that is represented by points.
        @param[in] vector (array_like): Input vector that is used for projection.

        @return (array_like) Transformed 1-dimensional data.

        """
        square_norm = numpy.sum(numpy.multiply(vector, vector))
        return numpy.divide(numpy.sum(numpy.multiply(data, vector), axis=1), square_norm)
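
    # A tiny worked example of the projection formula (values are illustrative): for v = (1, 1) the squared
    # norm is 2, so the point (3, 5) maps to <(3, 5), (1, 1)> / 2 = 8 / 2 = 4.
    #
    #     import numpy
    #     data = numpy.array([[3.0, 5.0], [1.0, 1.0]])
    #     vector = numpy.array([1.0, 1.0])
    #     print(numpy.sum(data * vector, axis=1) / numpy.sum(vector * vector))    # [4. 1.]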
    def _search_optimal_parameters(self, data, amount):
        """!
        @brief Performs cluster analysis for the specified data several times in order to find the optimal
                clustering result in line with the total WCE value.

        @param[in] data (array_like): Input data that should be clustered.
        @param[in] amount (uint): Amount of clusters that should be allocated.

        @return (tuple) Optimal clustering result: (clusters, centers, wce).

        """
        best_wce, best_clusters, best_centers = float('+inf'), [], []
            candidate_wce = solver.get_total_wce()
            if candidate_wce < best_wce:
                best_wce = candidate_wce
                best_clusters = solver.get_clusters()
                best_centers = solver.get_centers()
            if len(initial_centers) == 1:
        return best_clusters, best_centers, best_wce
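
    # A condensed sketch of what the repeated search does, using the public K-Means++ and K-Means APIs
    # (`sample` and `amount` are illustrative placeholders):
    #
    #     from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    #     from pyclustering.cluster.kmeans import kmeans
    #
    #     best_wce, best_clusters = float('+inf'), []
    #     for _ in range(3):    # 'repeat' attempts
    #         initial_centers = kmeans_plusplus_initializer(sample, amount).initialize()
    #         solver = kmeans(sample, initial_centers).process()
    #         if solver.get_total_wce() < best_wce:
    #             best_wce, best_clusters = solver.get_total_wce(), solver.get_clusters()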
    def _perform_clustering(self):
        """!
        @brief Performs cluster analysis using the K-Means algorithm with the current centers as the initial centers.

        """

    def _run_condition(self):
        """!
        @brief Defines whether the algorithm should continue processing or should stop.

        @return `True` if the algorithm should continue processing, otherwise returns `False`.

        """

    def _verify_arguments(self):
        """!
        @brief Verifies input parameters for the algorithm and throws an exception in case of incorrectness.

        """
            raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
            raise ValueError("Initial amount of centers should be greater than 0 "
                             "(current value: '%d')." % self.__k_init)
            raise ValueError("Tolerance should be greater than 0 (current value: '%f')." % self.__tolerance)
            raise ValueError("Amount of attempts to find optimal parameters should be greater than 0 "
                             "(current value: '%d')." % self.__repeat)
            raise ValueError("Maximum amount of clusters that might be allocated should be greater than 0 or -1 if "
                             "the algorithm should not be restricted in searching the optimal number of clusters.")
            raise ValueError("Initial amount of clusters should be less than the maximum amount 'k_max'.")