3 @brief Elbow method to determine the optimal number of clusters for k-means clustering.
4 @details Implementation based on paper @cite article::cluster::elbow::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
17 from pyclustering.core.wrapper
import ccore_library
19 import pyclustering.core.elbow_wrapper
as wrapper
24 @brief Class represents Elbow method that is used to find out appropriate amount of clusters in a dataset.
25 @details The elbow is a heuristic method of interpretation and validation of consistency within cluster analysis
26 designed to help find the appropriate number of clusters in a dataset.Elbow method performs clustering
27 using K-Means algorithm for each K and estimate clustering results using sum of square erros. By default
28 K-Means++ algorithm is used to calculate initial centers that are used by K-Means algorithm.
30 The Elbow is determined by max distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1),
31 where 'x' is K (amount of clusters), and 'y' is within-cluster error. Following expression is used to calculate Elbow
33 \f[Elbow_{k} = \frac{\left ( y_{0} - y_{1} \right )x_{k} + \left ( x_{1} - x_{0} \right )y_{k} + \left ( x_{0}y_{1} - x_{1}y_{0} \right )}{\sqrt{\left ( x_{1} - x_{0} \right )^{2} + \left ( y_{1} - y_{0} \right )^{2}}}\f]
35 Usage example of Elbow method for cluster analysis:
37 from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
38 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
39 from pyclustering.cluster.elbow import elbow
40 from pyclustering.utils import read_sample
41 from pyclustering.samples.definitions import SIMPLE_SAMPLES
43 # read sample 'Simple3' from file (sample contains four clusters)
44 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
46 # create instance of Elbow method using K value from 1 to 10.
48 elbow_instance = elbow(sample, kmin, kmax)
50 # process input data and obtain results of analysis
51 elbow_instance.process()
52 amount_clusters = elbow_instance.get_amount() # most probable amount of clusters
53 wce = elbow_instance.get_wce() # total within-cluster errors for each K
55 # perform cluster analysis using K-Means algorithm
56 centers = kmeans_plusplus_initializer(sample, amount_clusters,
57 amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
58 kmeans_instance = kmeans(sample, centers)
59 kmeans_instance.process()
61 # obtain clustering results and visualize them
62 clusters = kmeans_instance.get_clusters()
63 centers = kmeans_instance.get_centers()
64 kmeans_visualizer.show_clusters(sample, clusters, centers)
67 By default Elbow uses K-Means++ initializer to calculate initial centers for K-Means algorithm, it can be changed
68 using argument 'initializer':
70 # perform analysis using Elbow method with random center initializer for K-Means algorithm inside of the method.
72 elbow_instance = elbow(sample, kmin, kmax, initializer=random_center_initializer)
73 elbow_instance.process()
76 @image html elbow_example_simple_03.png "Elbows analysis with further K-Means clustering."
80 def __init__(self, data, kmin, kmax, **kwargs):
82 @brief Construct Elbow method.
84 @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
85 @param[in] kmin (int): Minimum amount of clusters that should be considered.
86 @param[in] kmax (int): Maximum amount of clusters that should be considered.
87 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `ccore`, `initializer`, `random_state`, `kstep`).
89 <b>Keyword Args:</b><br>
90 - ccore (bool): If `True` then C++ implementation of pyclustering library is used (by default `True`).
91 - initializer (callable): Center initializer that is used by K-Means algorithm (by default K-Means++).
92 - random_state (int): Seed for random state (by default is `None`, current system time is used).
93 - kstep (int): Search step in the interval [kmin, kmax] (by default is `1`).
97 self.
__initializer = kwargs.get(
'initializer', kmeans_plusplus_initializer)
99 self.
__kstep = kwargs.get(
'kstep', 1)
101 self.
__ccore = kwargs.get(
'ccore',
True)
or \
102 isinstance(self.
__initializer, kmeans_plusplus_initializer)
or \
106 self.
__ccore = ccore_library.workable()
121 @brief Performs analysis to find out appropriate amount of clusters.
123 @return (elbow) Returns itself (Elbow instance).
136 def __process_by_ccore(self):
138 @brief Performs processing using C++ implementation.
141 if isinstance(self.
__initializer, kmeans_plusplus_initializer):
142 initializer = wrapper.elbow_center_initializer.KMEANS_PLUS_PLUS
144 initializer = wrapper.elbow_center_initializer.RANDOM
149 self.
__wce = result[1]
152 def __process_by_python(self):
154 @brief Performs processing using python implementation.
162 self.
__wce.append(instance.get_total_wce())
170 @brief Returns appropriate amount of clusters.
178 @brief Returns list of total within cluster errors for each K-value, for example, in case of `kstep = 1`:
179 (kmin, kmin + 1, ..., kmax).
186 def __calculate_elbows(self):
188 @brief Calculates potential elbows.
189 @details Elbow is calculated as a distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1).
193 x0, y0 = 0.0, self.
__wce[0]
194 x1, y1 = float(len(self.
__wce)), self.
__wce[-1]
196 for index_elbow
in range(1, len(self.
__wce) - 1):
197 x, y = float(index_elbow), self.
__wce[index_elbow]
199 segment = abs((y0 - y1) * x + (x1 - x0) * y + (x0 * y1 - x1 * y0))
200 norm = math.sqrt((x1 - x0) ** 2 + (y1 - y0) ** 2)
201 distance = segment / norm
206 def __find_optimal_kvalue(self):
208 @brief Finds elbow and returns corresponding K-value.
211 optimal_elbow_value = max(self.
__elbows)
215 def __verify_arguments(self):
217 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
221 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__data))
224 raise ValueError(
"K min value (current value '%d') should be greater or equal to 1." % self.
__kmin)
227 raise ValueError(
"K step value (current value '%d') should be greater or equal to 1." % self.
__kstep)
230 raise ValueError(
"Amount of K (" + str(self.
__kmax - self.
__kmin) +
") is too small for analysis. "
231 "It is require to have at least three K to build elbow.")
234 if steps_to_process < 3:
235 raise ValueError(
"The search step is too high '%d' for analysis (amount of K for analysis is '%d'). "
236 "It is require to have at least three K to build elbow." % (self.
__kstep, steps_to_process))
239 raise ValueError(
"K max value '%d' is greater than amount of points in data '%d'." %