elbow.py
1 """!
2 
3 @brief Elbow method to determine the optimal number of clusters for k-means clustering.
4 @details Implementation based on paper @cite article::cluster::elbow::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import math
29 
30 from pyclustering.cluster.kmeans import kmeans
31 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer, random_center_initializer
32 from pyclustering.core.wrapper import ccore_library
33 
34 import pyclustering.core.elbow_wrapper as wrapper
35 
36 
37 class elbow:
38  """!
39  @brief Class represents Elbow method that is used to find out appropriate amount of clusters in a dataset.
40  @details The elbow is a heuristic method of interpretation and validation of consistency within cluster analysis
41  designed to help find the appropriate number of clusters in a dataset.Elbow method performs clustering
42  using K-Means algorithm for each K and estimate clustering results using sum of square erros. By default
43  K-Means++ algorithm is used to calculate initial centers that are used by K-Means algorithm.
44 
45  The Elbow is determined by max distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1),
46  where 'x' is K (amount of clusters), and 'y' is within-cluster error. Following expression is used to calculate Elbow
47  length:
48  \f[Elbow_{k} = \frac{\left ( y_{0} - y_{1} \right )x_{k} + \left ( x_{1} - x_{0} \right )y_{k} + \left ( x_{0}y_{1} - x_{1}y_{0} \right )}{\sqrt{\left ( x_{1} - x_{0} \right )^{2} + \left ( y_{1} - y_{0} \right )^{2}}}\f]
49 
50  Usage example of Elbow method for cluster analysis:
51  @code
52  from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
53  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
54  from pyclustering.cluster.elbow import elbow
55  from pyclustering.utils import read_sample
56  from pyclustering.samples.definitions import SIMPLE_SAMPLES
57 
58  # read sample 'Simple3' from file (sample contains four clusters)
59  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
60 
61  # create instance of Elbow method using K value from 1 to 10.
62  kmin, kmax = 1, 10
63  elbow_instance = elbow(sample, kmin, kmax)
64 
65  # process input data and obtain results of analysis
66  elbow_instance.process()
67  amount_clusters = elbow_instance.get_amount() # most probable amount of clusters
68  wce = elbow_instance.get_wce() # total within-cluster errors for each K
69 
70  # perform cluster analysis using K-Means algorithm
71  centers = kmeans_plusplus_initializer(sample, amount_clusters,
72  amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
73  kmeans_instance = kmeans(sample, centers)
74  kmeans_instance.process()
75 
76  # obtain clustering results and visualize them
77  clusters = kmeans_instance.get_clusters()
78  centers = kmeans_instance.get_centers()
79  kmeans_visualizer.show_clusters(sample, clusters, centers)
80  @endcode
81 
82  By default Elbow uses K-Means++ initializer to calculate initial centers for K-Means algorithm, it can be changed
83  using argument 'initializer':
84  @code
85  # perform analysis using Elbow method with random center initializer for K-Means algorithm inside of the method.
86  kmin, kmax = 1, 10
87  elbow_instance = elbow(sample, kmin, kmax, initializer=random_center_initializer)
88  elbow_instance.process()
89  @endcode
90 
91  @image html elbow_example_simple_03.png "Elbows analysis with further K-Means clustering."
92 
93  """
94 
95  def __init__(self, data, kmin, kmax, **kwargs):
96  """!
97  @brief Construct Elbow method.
98 
99  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
100  @param[in] kmin (int): Minimum amount of clusters that should be considered.
101  @param[in] kmax (int): Maximum amount of clusters that should be considered.
102  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `ccore`, `initializer`, `random_state`).
103 
104  <b>Keyword Args:</b><br>
105  - ccore (bool): If `True` then C++ implementation of pyclustering library is used (by default `True`).
106  - initializer (callable): Center initializer that is used by K-Means algorithm (by default K-Means++).
107  - random_state (int): Seed for random state (by default is `None`, current system time is used).
108 
109  """
110 
111  self.__initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
112  self.__random_state = kwargs.get('random_state', None)
113 
114  self.__ccore = kwargs.get('ccore', True) or \
115  isinstance(self.__initializer, kmeans_plusplus_initializer) or \
116  isinstance(self.__initializer, random_center_initializer)
117 
118  if self.__ccore:
119  self.__ccore = ccore_library.workable()
120 
121  self.__data = data
122  self.__kmin = kmin
123  self.__kmax = kmax
124 
125  self.__wce = []
126  self.__elbows = []
127  self.__kvalue = -1
128 
129  self.__verify_arguments()
130 
131 
132  def process(self):
133  """!
134  @brief Performs analysis to find out appropriate amount of clusters.
135 
136  @return (elbow) Returns itself (Elbow instance).
137 
138  @return
139 
140  """
141  if self.__ccore:
142  self.__process_by_ccore()
143  else:
144  self.__process_by_python()
145 
146  return self
147 
148 
149  def __process_by_ccore(self):
150  """!
151  @brief Performs processing using C++ implementation.
152 
153  """
154  if isinstance(self.__initializer, kmeans_plusplus_initializer):
155  initializer = wrapper.elbow_center_initializer.KMEANS_PLUS_PLUS
156  else:
157  initializer = wrapper.elbow_center_initializer.RANDOM
158 
159  result = wrapper.elbow(self.__data, self.__kmin, self.__kmax, initializer, self.__random_state)
160 
161  self.__kvalue = result[0]
162  self.__wce = result[1]
163 
164 
165  def __process_by_python(self):
166  """!
167  @brief Performs processing using python implementation.
168 
169  """
170  for amount in range(self.__kmin, self.__kmax):
171  centers = self.__initializer(self.__data, amount, random_state=self.__random_state).initialize()
172  instance = kmeans(self.__data, centers, ccore=False)
173  instance.process()
174 
175  self.__wce.append(instance.get_total_wce())
176 
177  self.__calculate_elbows()
178  self.__find_optimal_kvalue()
179 
180 
181  def get_amount(self):
182  """!
183  @brief Returns appropriate amount of clusters.
184 
185  """
186  return self.__kvalue
187 
188 
189  def get_wce(self):
190  """!
191  @brief Returns list of total within cluster errors for each K-value (kmin, kmin + 1, ..., kmax - 1).
192 
193  """
194 
195  return self.__wce
196 
197 
198  def __calculate_elbows(self):
199  """!
200  @brief Calculates potential elbows.
201  @details Elbow is calculated as a distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1).
202 
203  """
204 
205  x0, y0 = 0.0, self.__wce[0]
206  x1, y1 = float(len(self.__wce)), self.__wce[-1]
207 
208  for index_elbow in range(1, len(self.__wce) - 1):
209  x, y = float(index_elbow), self.__wce[index_elbow]
210 
211  segment = abs((y0 - y1) * x + (x1 - x0) * y + (x0 * y1 - x1 * y0))
212  norm = math.sqrt((x1 - x0) ** 2 + (y1 - y0) ** 2)
213  distance = segment / norm
214 
215  self.__elbows.append(distance)
216 
217 
218  def __find_optimal_kvalue(self):
219  """!
220  @brief Finds elbow and returns corresponding K-value.
221 
222  """
223  optimal_elbow_value = max(self.__elbows)
224  self.__kvalue = self.__elbows.index(optimal_elbow_value) + 1 + self.__kmin
225 
226 
227  def __verify_arguments(self):
228  """!
229  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
230 
231  """
232  if len(self.__data) == 0:
233  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
234 
235  if self.__kmin < 1:
236  raise ValueError("K min value (current value '%d') should be greater or equal to 1." % self.__kmin)
237 
238  if self.__kmax - self.__kmin < 3:
239  raise ValueError("Amount of K (" + str(self.__kmax - self.__kmin) + ") is too small for analysis. "
240  "It is require to have at least three K to build elbow.")
241 
242  if len(self.__data) < self.__kmax:
243  raise ValueError("K max value '%d' is greater than amount of points in data '%d'." %
244  (self.__kmax, len(self.__data)))
def __calculate_elbows(self)
Calculates potential elbows.
Definition: elbow.py:198
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
def __process_by_python(self)
Performs processing using python implementation.
Definition: elbow.py:165
def __find_optimal_kvalue(self)
Finds elbow and returns corresponding K-value.
Definition: elbow.py:218
Class implements K-Means clustering algorithm.
Definition: kmeans.py:272
def __init__(self, data, kmin, kmax, kwargs)
Construct Elbow method.
Definition: elbow.py:95
def __process_by_ccore(self)
Performs processing using C++ implementation.
Definition: elbow.py:149
def process(self)
Performs analysis to find out appropriate amount of clusters.
Definition: elbow.py:132
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: elbow.py:227
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
Class represents Elbow method that is used to find out appropriate amount of clusters in a dataset...
Definition: elbow.py:37
def get_amount(self)
Returns appropriate amount of clusters.
Definition: elbow.py:181
def get_wce(self)
Returns list of total within cluster errors for each K-value (kmin, kmin + 1, ..., kmax - 1).
Definition: elbow.py:189