dc/d6a/kmeans_8py_source.html

 """!

 @brief Cluster analysis algorithm: K-Means
 @details Implementation based on paper @cite inproceedings::kmeans::1.

 @authors Andrei Novikov (pyclustering@yandex.ru)
 @date 2014-2018
 @copyright GNU Public License

 @cond GNU_PUBLIC_LICENSE
     PyClustering is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     PyClustering is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 @endcond

 """


 import numpy
 import warnings

 try:
     import matplotlib.pyplot as plt
     import matplotlib.animation as animation
 except Exception as error_instance:
     warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
                   "functionality is not available (details: '%s')." % str(error_instance))

 import pyclustering.core.kmeans_wrapper as wrapper

 from pyclustering.core.wrapper import ccore_library
 from pyclustering.core.metric_wrapper import metric_wrapper

 from pyclustering.cluster.encoder import type_encoding
 from pyclustering.cluster import cluster_visualizer

 from pyclustering.utils.metric import distance_metric, type_metric


 class kmeans_observer:
     """!
     @brief Recorder of the K-Means clustering process: keeps clusters and centers reported on every step.
     @details The algorithm either pushes one step at a time via 'notify()' or installs the complete
               history at once via 'set_evolution_clusters()' and 'set_evolution_centers()'.

     @see kmeans

     """

     def __init__(self):
         """!
         @brief Creates an observer whose history is empty.

         """
         # Created here but not read by any method of this class.
         self.__initial_centers      = []
         self.__evolution_centers    = []
         self.__evolution_clusters   = []


     def __len__(self):
         """!
         @brief Returns amount of steps that were observed, measured by the stored clusters history.

         """
         clusters_history = self.__evolution_clusters
         return len(clusters_history)


     def notify(self, clusters, centers):
         """!
         @brief Appends one step of the clustering process to the history.

         @param[in] clusters (array_like): Clusters allocated on the reported step.
         @param[in] centers (array_like): Centers allocated on the reported step.

         """
         self.__evolution_clusters.append(clusters)
         self.__evolution_centers.append(centers)


     def set_evolution_clusters(self, evolution_clusters):
         """!
         @brief Replaces the whole stored history of clusters.

         @param[in] evolution_clusters (array_like): Clusters for each step of the clustering process.

         """
         self.__evolution_clusters = evolution_clusters


     def set_evolution_centers(self, evolution_centers):
         """!
         @brief Replaces the whole stored history of centers.

         @param[in] evolution_centers (array_like): Centers for each step of the clustering process.

         """
         self.__evolution_centers = evolution_centers


     def get_clusters(self, iteration):
         """!
         @brief Returns clusters that were stored for the specified step.

         @param[in] iteration (uint): Index of the step in the stored history.

         @return (array_like) Clusters at the specified step.

         """
         clusters_history = self.__evolution_clusters
         return clusters_history[iteration]


     def get_centers(self, iteration):
         """!
         @brief Returns centers that were stored for the specified step.

         @param[in] iteration (uint): Index of the step in the stored history.

         @return (array_like) Centers at the specified step.

         """
         centers_history = self.__evolution_centers
         return centers_history[iteration]


 class kmeans_visualizer:
     """!
     @brief Visualizer of K-Means algorithm's results.
     @details Draws clusters together with their centers and with rays that connect every point to the
               center of its cluster; it can also animate the steps collected by an observer.

     """

     __default_2d_marker_size = 15
     __default_3d_marker_size = 70


     @staticmethod
     def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
         """!
         @brief Display K-Means clustering results.

         @param[in] sample (list): Dataset that was used for clustering.
         @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
         @param[in] centers (array_like): Centers that were allocated by the algorithm.
         @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displayed.
         @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').

         <b>Keyword Args:</b><br>
             - figure (figure): If 'None' then a new figure is created, otherwise the specified figure is used for visualization.
             - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
             - offset (uint): Index of the axes on the figure where centers and rays are drawn (by default 0).

         @return (figure) Figure where clusters were drawn.

         """

         offset = kwargs.get('offset', 0)
         figure = kwargs.get('figure', None)
         display = kwargs.get('display', True)

         canvas = cluster_visualizer()
         canvas.append_clusters(clusters, sample)

         # The common visualizer never shows the figure itself: centers and rays are added first.
         if figure is not None:
             canvas.show(figure = figure, display = False)
         else:
             figure = canvas.show(display = False)

         kmeans_visualizer.__draw_centers(figure, offset, canvas, centers, initial_centers)
         kmeans_visualizer.__draw_rays(figure, offset, canvas, sample, clusters, centers)

         if display is True:
             plt.show()

         return figure


     @staticmethod
     def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
         """!
         @brief Draws rays from points to centers for every cluster, using the color of the cluster.

         """
         ax = figure.get_axes()[offset]

         for index_cluster, cluster in enumerate(clusters):
             color = visualizer.get_cluster_color(index_cluster, 0)
             kmeans_visualizer.__draw_cluster_rays(ax, color, sample, cluster, centers[index_cluster])


     @staticmethod
     def __draw_cluster_rays(ax, color, sample, cluster, center):
         """!
         @brief Draws a thin line from each point of the cluster to the cluster center (1D, 2D and 3D data only).

         """
         dimension = len(sample[0])

         for index_point in cluster:
             point = sample[index_point]

             if dimension == 1:
                 # One-dimensional data is placed on the horizontal axis.
                 coordinates = ([point[0], center[0]], [0.0, 0.0])
             elif dimension == 2:
                 coordinates = ([point[0], center[0]], [point[1], center[1]])
             elif dimension == 3:
                 coordinates = ([point[0], center[0]], [point[1], center[1]], [point[2], center[2]])
             else:
                 continue    # other dimensions are not drawn

             ax.plot(*coordinates, '-', color=color, linewidth=0.5)


     @staticmethod
     def __draw_center(ax, center, color, marker, alpha):
         """!
         @brief Draws a single center (1D, 2D and 3D centers only).

         """
         dimension = len(center)

         if dimension == 3:
             ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)
         elif dimension in (1, 2):
             # One-dimensional center is placed on the horizontal axis.
             vertical = 0.0 if dimension == 1 else center[1]
             ax.plot(center[0], vertical, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)


     @staticmethod
     def __draw_centers(figure, offset, visualizer, centers, initial_centers):
         """!
         @brief Draws final centers and, if they are specified, semi-transparent initial centers.

         """
         ax = figure.get_axes()[offset]

         for index_center, center in enumerate(centers):
             color = visualizer.get_cluster_color(index_center, 0)
             kmeans_visualizer.__draw_center(ax, center, color, '*', 1.0)

             if initial_centers is None:
                 continue

             kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)


     @staticmethod
     def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
         """!
         @brief Animates clustering process that is performed by K-Means algorithm.

         @param[in] data (list): Dataset that is used for clustering.
         @param[in] observer (kmeans_observer): K-Means observer that was used to collect information about clustering process.
         @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
         @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
         @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.

         """
         figure = plt.figure()

         def render_frame(index_iteration):
             # Each frame is drawn from scratch on the same figure.
             figure.clf()

             title = "K-Means algorithm (iteration: " + str(index_iteration) + ")"
             figure.suptitle(title, fontsize=18, fontweight='bold')

             kmeans_visualizer.show_clusters(data,
                                             observer.get_clusters(index_iteration),
                                             observer.get_centers(index_iteration),
                                             None, figure=figure, display=False)

             figure.subplots_adjust(top=0.85)

             return [figure.gca()]

         def render_first_frame():
             return render_frame(0)

         cluster_animation = animation.FuncAnimation(figure, render_frame, len(observer), interval=animation_velocity,
                                                     init_func=render_first_frame, repeat_delay=5000)

         if save_movie is None:
             plt.show()
         else:
             cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)


 class kmeans:
     """!
     @brief Class represents K-Means clustering algorithm.
     @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.

     CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.

     K-Means clustering results depend on initial centers. Algorithm K-Means++ can be used for initialization
     of initial centers from module 'pyclustering.cluster.center_initializer'.

     @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."

     Example #1 - Trivial clustering:
     @code
         # load list of points for cluster analysis
         sample = read_sample(path)

         # create instance of K-Means algorithm
         kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ])

         # run cluster analysis and obtain results
         kmeans_instance.process()
         clusters = kmeans_instance.get_clusters()
     @endcode

     Example #2 - Clustering using K-Means++ for center initialization:
     @code
         # load list of points for cluster analysis
         sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2)

         # initialize initial centers using K-Means++ method
         initial_centers = kmeans_plusplus_initializer(sample, 3).initialize()

         # create instance of K-Means algorithm with prepared centers
         kmeans_instance = kmeans(sample, initial_centers)

         # run cluster analysis and obtain results
         kmeans_instance.process()
         clusters = kmeans_instance.get_clusters()
         final_centers = kmeans_instance.get_centers()
     @endcode

     @see center_initializer

     """

     def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
         """!
         @brief Constructor of clustering algorithm K-Means.
         @details Center initializer can be used for creating initial centers, for example, K-Means++ method.

         @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
         @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
         @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing
                     (the Python implementation compares the change with the squared tolerance).
         @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
         @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').

         <b>Keyword Args:</b><br>
             - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
             - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
             - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200);
                the name 'maxiter' is accepted as an alias for backward compatibility.

         @see center_initializer

         """
         self.__pointer_data = numpy.array(data)
         self.__clusters = []
         self.__centers = numpy.array(initial_centers)
         self.__tolerance = tolerance
         self.__total_wce = 0

         self.__observer = kwargs.get('observer', None)
         self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
         self.__maxiter = kmeans.__resolve_itermax(kwargs)

         # Built-in metrics are applied to whole arrays, a user-defined metric is called point by point.
         if self.__metric.get_type() != type_metric.USER_DEFINED:
             self.__metric.enable_numpy_usage()
         else:
             self.__metric.disable_numpy_usage()

         # A user-defined metric is never handed to CCORE, the Python implementation is used for it.
         self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
         if self.__ccore is True:
             self.__ccore = ccore_library.workable()


     @staticmethod
     def __resolve_itermax(kwargs, default=200):
         """!
         @brief Extracts maximum number of iterations from constructor keyword arguments.
         @details The documented name 'itermax' has priority; 'maxiter' is the name that was historically
                   read by the constructor and it is still accepted.

         @param[in] kwargs (dict): Keyword arguments of the constructor.
         @param[in] default (uint): Value that is returned when neither name is specified.

         @return (uint) Maximum number of iterations.

         """
         if 'itermax' in kwargs:
             return kwargs['itermax']

         return kwargs.get('maxiter', default)


     def process(self):
         """!
         @brief Performs cluster analysis in line with rules of K-Means algorithm.

         @return (kmeans) Returns itself (K-Means instance).

         @throw ValueError if dimension of the first point differs from dimension of the first center.

         @remark Results of clustering can be obtained using corresponding get methods.

         @see get_clusters()
         @see get_centers()

         """

         if len(self.__pointer_data[0]) != len(self.__centers[0]):
             raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")

         if self.__ccore is True:
             self.__process_by_ccore()
         else:
             self.__process_by_python()

         return self


     def __process_by_ccore(self):
         """!
         @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
         @details The iteration limit is not among the arguments that are passed to the wrapper here.

         """
         ccore_metric = metric_wrapper.create_instance(self.__metric)

         results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, (self.__observer is not None), ccore_metric.get_pointer())
         self.__clusters = results[0]
         self.__centers = results[1]

         if self.__observer is not None:
             self.__observer.set_evolution_clusters(results[2])
             self.__observer.set_evolution_centers(results[3])

         self.__total_wce = results[4][0]


     def __process_by_python(self):
         """!
         @brief Performs cluster analysis using python code.
         @details Iterations are repeated while the largest change of centers exceeds the squared tolerance
                   and the iteration limit is not reached.

         """

         maximum_change = float('inf')
         stop_condition = self.__tolerance * self.__tolerance
         iteration = 0

         if self.__observer is not None:
             # The first observed step corresponds to the initial centers.
             initial_clusters = self.__update_clusters()
             self.__observer.notify(initial_clusters, self.__centers.tolist())

         while maximum_change > stop_condition and iteration < self.__maxiter:
             self.__clusters = self.__update_clusters()
             updated_centers = self.__update_centers()  # changes should be calculated before assignment

             if self.__observer is not None:
                 self.__observer.notify(self.__clusters, updated_centers.tolist())

             maximum_change = self.__calculate_changes(updated_centers)

             self.__centers = updated_centers    # assign center after change calculation
             iteration += 1

         self.__calculate_total_wce()


     def get_clusters(self):
         """!
         @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

         @see process()
         @see get_centers()

         """

         return self.__clusters


     def get_centers(self):
         """!
         @brief Returns list of centers of allocated clusters.

         @see process()
         @see get_clusters()

         """

         # Centers are already stored as a list when they were assigned in that form.
         if isinstance(self.__centers, list):
             return self.__centers

         return self.__centers.tolist()


     def get_total_wce(self):
         """!
         @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
         @details Sum of metric errors is calculated using distance between point and its center:
                  \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]

         @see process()
         @see get_clusters()

         """

         return self.__total_wce


     def get_cluster_encoding(self):
         """!
         @brief Returns clustering result representation type that indicate how clusters are encoded.

         @return (type_encoding) Clustering result representation.

         @see get_clusters()

         """

         return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


     def __update_clusters(self):
         """!
         @brief Assigns every point to the center with the smallest distance in terms of the chosen metric.
         @details Clusters that did not capture any point are removed from the result.

         @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.

         """

         clusters = [[] for _ in range(len(self.__centers))]

         dataset_differences = self.__calculate_dataset_difference(len(clusters))

         # Rows are centers and columns are points, so the minimum is searched along the centers.
         optimum_indexes = numpy.argmin(dataset_differences, axis=0)
         for index_point, index_cluster in enumerate(optimum_indexes):
             clusters[index_cluster].append(index_point)

         return [cluster for cluster in clusters if len(cluster) > 0]


     def __update_centers(self):
         """!
         @brief Calculate centers of clusters in line with contained objects.

         @return (numpy.ndarray) Updated centers, one row per cluster: mean of the points of the cluster.

         """

         dimension = self.__pointer_data.shape[1]
         centers = numpy.zeros((len(self.__clusters), dimension))

         for index, cluster in enumerate(self.__clusters):
             cluster_points = self.__pointer_data[cluster, :]
             centers[index] = cluster_points.mean(axis=0)

         return centers


     def __calculate_total_wce(self):
         """!
         @brief Calculate total within cluster errors that depends on metric that was chosen for K-Means algorithm.
         @details Result is stored in the instance and is available via 'get_total_wce()'.

         """

         dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))

         self.__total_wce = 0
         for index_cluster, cluster in enumerate(self.__clusters):
             for index_point in cluster:
                 self.__total_wce += dataset_differences[index_cluster][index_point]


     def __calculate_dataset_difference(self, amount_clusters):
         """!
         @brief Calculate distance from each point to each cluster center.

         @param[in] amount_clusters (uint): Amount of leading centers for which distances are calculated.

         @return (numpy.ndarray) Distances where row is an index of center and column is an index of point.

         """
         dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))

         # Type of the metric does not change inside the loop.
         is_user_metric = self.__metric.get_type() == type_metric.USER_DEFINED

         for index_center in range(amount_clusters):
             center = self.__centers[index_center]
             if not is_user_metric:
                 # Built-in metric is called once with the whole dataset.
                 dataset_differences[index_center] = self.__metric(self.__pointer_data, center)
             else:
                 dataset_differences[index_center] = [ self.__metric(point, center)
                                                       for point in self.__pointer_data ]

         return dataset_differences


     def __calculate_changes(self, updated_centers):
         """!
         @brief Calculates changes estimation between previous and current iteration using centers for that purpose.

         @param[in] updated_centers (array_like): New cluster centers.

         @return (float) Maximum changes between centers; infinity if amount of centers has been changed.

         """
         # Centers cannot be compared pairwise when some clusters disappeared.
         if len(self.__centers) != len(updated_centers):
             return float('inf')

         changes = self.__metric(self.__centers, updated_centers)
         return numpy.max(changes)
pyclustering.cluster.kmeans.kmeans.__metric
__metric
Definition: kmeans.py:344

pyclustering.cluster.cluster_visualizer
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359

pyclustering.cluster
pyclustering module for cluster analysis.
Definition: __init__.py:1

pyclustering.cluster.kmeans.kmeans_observer.notify
def notify(self, clusters, centers)
This method is called by K-Means algorithm to notify about changes.
Definition: kmeans.py:75

pyclustering.cluster.kmeans.kmeans.get_centers
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: kmeans.py:440

pyclustering.cluster.kmeans.kmeans.__calculate_dataset_difference
def __calculate_dataset_difference(self, amount_clusters)
Calculate distance from each point to each cluster center.
Definition: kmeans.py:536

pyclustering.cluster.kmeans.kmeans.process
def process(self)
Performs cluster analysis in line with rules of K-Means algorithm.
Definition: kmeans.py:357

pyclustering.cluster.kmeans.kmeans_observer.__initial_centers
__initial_centers
Definition: kmeans.py:64

pyclustering.utils.metric
Module provides various distance metrics - abstraction of the notion of distance in a metric space...
Definition: metric.py:1

pyclustering.cluster.kmeans.kmeans_observer.get_clusters
def get_clusters(self, iteration)
Get method to return allocated clusters at specific iteration of clustering process.
Definition: kmeans.py:119

pyclustering.cluster.kmeans.kmeans.__calculate_changes
def __calculate_changes(self, updated_centers)
Calculates changes estimation between previous and current iteration using centers for that purpose...
Definition: kmeans.py:552

pyclustering.cluster.kmeans.kmeans.__total_wce
__total_wce
Definition: kmeans.py:341

pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1

pyclustering.cluster.kmeans.kmeans_observer.__evolution_centers
__evolution_centers
Definition: kmeans.py:63

pyclustering.utils.metric.distance_metric
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:58

pyclustering.cluster.kmeans.kmeans_observer
Observer of K-Means algorithm that is used to collect information about clustering process on each it...
Definition: kmeans.py:49

pyclustering.cluster.kmeans.kmeans.__clusters
__clusters
Definition: kmeans.py:338

pyclustering.cluster.kmeans.kmeans.__update_centers
def __update_centers(self)
Calculate centers of clusters in line with contained objects.
Definition: kmeans.py:504

pyclustering.cluster.kmeans.kmeans.get_total_wce
def get_total_wce(self)
Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Su...
Definition: kmeans.py:455

pyclustering.cluster.kmeans.kmeans
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272

pyclustering.cluster.kmeans.kmeans_observer.__init__
def __init__(self)
Initializer of observer of K-Means algorithm.
Definition: kmeans.py:57

pyclustering.cluster.kmeans.kmeans.__ccore
__ccore
Definition: kmeans.py:352

pyclustering.cluster.kmeans.kmeans_visualizer
Visualizer of K-Means algorithm's results.
Definition: kmeans.py:132

pyclustering.cluster.kmeans.kmeans.__maxiter
__maxiter
Definition: kmeans.py:345

pyclustering.cluster.kmeans.kmeans_observer.get_centers
def get_centers(self, iteration)
Get method to return centers at specific iteration of clustering process.
Definition: kmeans.py:97

pyclustering.cluster.kmeans.kmeans.__centers
__centers
Definition: kmeans.py:339

pyclustering.cluster.kmeans.kmeans.__calculate_total_wce
def __calculate_total_wce(self)
Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm...
Definition: kmeans.py:522

pyclustering.cluster.kmeans.kmeans_observer.set_evolution_centers
def set_evolution_centers(self, evolution_centers)
Set evolution of changes of centers during clustering process.
Definition: kmeans.py:87

pyclustering.cluster.kmeans.kmeans.__init__
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs)
Constructor of clustering algorithm K-Means.
Definition: kmeans.py:318

pyclustering.cluster.kmeans.kmeans.__pointer_data
__pointer_data
Definition: kmeans.py:337

pyclustering.cluster.kmeans.kmeans.__observer
__observer
Definition: kmeans.py:343

pyclustering.cluster.kmeans.kmeans_observer.__len__
def __len__(self)
Returns amount of steps that were observed during clustering process in K-Means algorithm.
Definition: kmeans.py:67

pyclustering.cluster.kmeans.kmeans_observer.set_evolution_clusters
def set_evolution_clusters(self, evolution_clusters)
Set evolution of changes of clusters during clustering process.
Definition: kmeans.py:109

pyclustering.cluster.kmeans.kmeans_observer.__evolution_clusters
__evolution_clusters
Definition: kmeans.py:62

pyclustering.cluster.kmeans.kmeans_visualizer.show_clusters
def show_clusters(sample, clusters, centers, initial_centers=None, **kwargs)
Display K-Means clustering results.
Definition: kmeans.py:144

pyclustering.cluster.kmeans.kmeans.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: kmeans.py:428

pyclustering.cluster.kmeans.kmeans.__process_by_ccore
def __process_by_ccore(self)
Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
Definition: kmeans.py:381

pyclustering.cluster.kmeans.kmeans.__tolerance
__tolerance
Definition: kmeans.py:340

pyclustering.cluster.kmeans.kmeans.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: kmeans.py:469

pyclustering.cluster.kmeans.kmeans.__update_clusters
def __update_clusters(self)
Calculate Euclidean distance to each point from the each cluster.
Definition: kmeans.py:482

pyclustering.cluster.kmeans.kmeans.__process_by_python
def __process_by_python(self)
Performs cluster analysis using python code.
Definition: kmeans.py:399

pyclustering.cluster.kmeans.kmeans_visualizer.animate_cluster_allocation
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None)
Animates clustering process that is performed by K-Means algorithm.
Definition: kmeans.py:232