pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
elbow.py
1 """!
2 
3 @brief Elbow method to determine the optimal number of clusters for k-means clustering.
4 @details Implementation based on paper @cite article::cluster::elbow::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 import math
14 
15 from pyclustering.cluster.kmeans import kmeans
16 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer, random_center_initializer
17 from pyclustering.core.wrapper import ccore_library
18 
19 import pyclustering.core.elbow_wrapper as wrapper
20 
21 
22 class elbow:
23  """!
24  @brief Class represents Elbow method that is used to find out appropriate amount of clusters in a dataset.
25  @details The elbow is a heuristic method of interpretation and validation of consistency within cluster analysis
26  designed to help find the appropriate number of clusters in a dataset.Elbow method performs clustering
27  using K-Means algorithm for each K and estimate clustering results using sum of square erros. By default
28  K-Means++ algorithm is used to calculate initial centers that are used by K-Means algorithm.
29 
30  The Elbow is determined by max distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1),
31  where 'x' is K (amount of clusters), and 'y' is within-cluster error. Following expression is used to calculate Elbow
32  length:
33  \f[Elbow_{k} = \frac{\left ( y_{0} - y_{1} \right )x_{k} + \left ( x_{1} - x_{0} \right )y_{k} + \left ( x_{0}y_{1} - x_{1}y_{0} \right )}{\sqrt{\left ( x_{1} - x_{0} \right )^{2} + \left ( y_{1} - y_{0} \right )^{2}}}\f]
34 
35  Usage example of Elbow method for cluster analysis:
36  @code
37  from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
38  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
39  from pyclustering.cluster.elbow import elbow
40  from pyclustering.utils import read_sample
41  from pyclustering.samples.definitions import SIMPLE_SAMPLES
42 
43  # read sample 'Simple3' from file (sample contains four clusters)
44  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
45 
46  # create instance of Elbow method using K value from 1 to 10.
47  kmin, kmax = 1, 10
48  elbow_instance = elbow(sample, kmin, kmax)
49 
50  # process input data and obtain results of analysis
51  elbow_instance.process()
52  amount_clusters = elbow_instance.get_amount() # most probable amount of clusters
53  wce = elbow_instance.get_wce() # total within-cluster errors for each K
54 
55  # perform cluster analysis using K-Means algorithm
56  centers = kmeans_plusplus_initializer(sample, amount_clusters,
57  amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
58  kmeans_instance = kmeans(sample, centers)
59  kmeans_instance.process()
60 
61  # obtain clustering results and visualize them
62  clusters = kmeans_instance.get_clusters()
63  centers = kmeans_instance.get_centers()
64  kmeans_visualizer.show_clusters(sample, clusters, centers)
65  @endcode
66 
67  By default Elbow uses K-Means++ initializer to calculate initial centers for K-Means algorithm, it can be changed
68  using argument 'initializer':
69  @code
70  # perform analysis using Elbow method with random center initializer for K-Means algorithm inside of the method.
71  kmin, kmax = 1, 10
72  elbow_instance = elbow(sample, kmin, kmax, initializer=random_center_initializer)
73  elbow_instance.process()
74  @endcode
75 
76  @image html elbow_example_simple_03.png "Elbows analysis with further K-Means clustering."
77 
78  """
79 
80  def __init__(self, data, kmin, kmax, **kwargs):
81  """!
82  @brief Construct Elbow method.
83 
84  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
85  @param[in] kmin (int): Minimum amount of clusters that should be considered.
86  @param[in] kmax (int): Maximum amount of clusters that should be considered.
87  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `ccore`, `initializer`, `random_state`, `kstep`).
88 
89  <b>Keyword Args:</b><br>
90  - ccore (bool): If `True` then C++ implementation of pyclustering library is used (by default `True`).
91  - initializer (callable): Center initializer that is used by K-Means algorithm (by default K-Means++).
92  - random_state (int): Seed for random state (by default is `None`, current system time is used).
93  - kstep (int): Search step in the interval [kmin, kmax] (by default is `1`).
94 
95  """
96 
97  self.__initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
98  self.__random_state = kwargs.get('random_state', None)
99  self.__kstep = kwargs.get('kstep', 1)
100 
101  self.__ccore = kwargs.get('ccore', True) or \
102  isinstance(self.__initializer, kmeans_plusplus_initializer) or \
103  isinstance(self.__initializer, random_center_initializer)
104 
105  if self.__ccore:
106  self.__ccore = ccore_library.workable()
107 
108  self.__data = data
109  self.__kmin = kmin
110  self.__kmax = kmax
111 
112  self.__wce = []
113  self.__elbows = []
114  self.__kvalue = -1
115 
116  self.__verify_arguments()
117 
118 
119  def process(self):
120  """!
121  @brief Performs analysis to find out appropriate amount of clusters.
122 
123  @return (elbow) Returns itself (Elbow instance).
124 
125  @return
126 
127  """
128  if self.__ccore:
129  self.__process_by_ccore()
130  else:
131  self.__process_by_python()
132 
133  return self
134 
135 
136  def __process_by_ccore(self):
137  """!
138  @brief Performs processing using C++ implementation.
139 
140  """
141  if isinstance(self.__initializer, kmeans_plusplus_initializer):
142  initializer = wrapper.elbow_center_initializer.KMEANS_PLUS_PLUS
143  else:
144  initializer = wrapper.elbow_center_initializer.RANDOM
145 
146  result = wrapper.elbow(self.__data, self.__kmin, self.__kmax, self.__kstep, initializer, self.__random_state)
147 
148  self.__kvalue = result[0]
149  self.__wce = result[1]
150 
151 
152  def __process_by_python(self):
153  """!
154  @brief Performs processing using python implementation.
155 
156  """
157  for amount in range(self.__kmin, self.__kmax + 1, self.__kstep):
158  centers = self.__initializer(self.__data, amount, random_state=self.__random_state).initialize()
159  instance = kmeans(self.__data, centers, ccore=False)
160  instance.process()
161 
162  self.__wce.append(instance.get_total_wce())
163 
164  self.__calculate_elbows()
165  self.__find_optimal_kvalue()
166 
167 
168  def get_amount(self):
169  """!
170  @brief Returns appropriate amount of clusters.
171 
172  """
173  return self.__kvalue
174 
175 
176  def get_wce(self):
177  """!
178  @brief Returns list of total within cluster errors for each K-value, for example, in case of `kstep = 1`:
179  (kmin, kmin + 1, ..., kmax).
180 
181  """
182 
183  return self.__wce
184 
185 
186  def __calculate_elbows(self):
187  """!
188  @brief Calculates potential elbows.
189  @details Elbow is calculated as a distance from each point (x, y) to segment from kmin-point (x0, y0) to kmax-point (x1, y1).
190 
191  """
192 
193  x0, y0 = 0.0, self.__wce[0]
194  x1, y1 = float(len(self.__wce)), self.__wce[-1]
195 
196  for index_elbow in range(1, len(self.__wce) - 1):
197  x, y = float(index_elbow), self.__wce[index_elbow]
198 
199  segment = abs((y0 - y1) * x + (x1 - x0) * y + (x0 * y1 - x1 * y0))
200  norm = math.sqrt((x1 - x0) ** 2 + (y1 - y0) ** 2)
201  distance = segment / norm
202 
203  self.__elbows.append(distance)
204 
205 
206  def __find_optimal_kvalue(self):
207  """!
208  @brief Finds elbow and returns corresponding K-value.
209 
210  """
211  optimal_elbow_value = max(self.__elbows)
212  self.__kvalue = (self.__elbows.index(optimal_elbow_value) + 1) * self.__kstep + self.__kmin
213 
214 
215  def __verify_arguments(self):
216  """!
217  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
218 
219  """
220  if len(self.__data) == 0:
221  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
222 
223  if self.__kmin < 1:
224  raise ValueError("K min value (current value '%d') should be greater or equal to 1." % self.__kmin)
225 
226  if self.__kstep < 1:
227  raise ValueError("K step value (current value '%d') should be greater or equal to 1." % self.__kstep)
228 
229  if self.__kmax - self.__kmin + 1 < 3:
230  raise ValueError("Amount of K (" + str(self.__kmax - self.__kmin) + ") is too small for analysis. "
231  "It is require to have at least three K to build elbow.")
232 
233  steps_to_process = math.floor((self.__kmax - self.__kmin) / self.__kstep) + 1
234  if steps_to_process < 3:
235  raise ValueError("The search step is too high '%d' for analysis (amount of K for analysis is '%d'). "
236  "It is require to have at least three K to build elbow." % (self.__kstep, steps_to_process))
237 
238  if len(self.__data) < self.__kmax:
239  raise ValueError("K max value '%d' is greater than amount of points in data '%d'." %
240  (self.__kmax, len(self.__data)))
pyclustering.cluster.elbow.elbow.__data
__data
Definition: elbow.py:108
pyclustering.cluster.elbow.elbow.__ccore
__ccore
Definition: elbow.py:101
pyclustering.cluster.kmeans.kmeans
Class implements K-Means clustering algorithm.
Definition: kmeans.py:253
pyclustering.cluster.center_initializer
Collection of center initializers for algorithm that uses initial centers, for example,...
Definition: center_initializer.py:1
pyclustering.cluster.elbow.elbow.__kvalue
__kvalue
Definition: elbow.py:114
pyclustering.cluster.elbow.elbow.__calculate_elbows
def __calculate_elbows(self)
Calculates potential elbows.
Definition: elbow.py:186
pyclustering.cluster.elbow.elbow.__kstep
__kstep
Definition: elbow.py:99
pyclustering.cluster.elbow.elbow.__process_by_python
def __process_by_python(self)
Performs processing using python implementation.
Definition: elbow.py:152
pyclustering.cluster.elbow.elbow.__init__
def __init__(self, data, kmin, kmax, **kwargs)
Construct Elbow method.
Definition: elbow.py:80
pyclustering.cluster.elbow.elbow.process
def process(self)
Performs analysis to find out appropriate amount of clusters.
Definition: elbow.py:119
pyclustering.cluster.elbow.elbow.__find_optimal_kvalue
def __find_optimal_kvalue(self)
Finds elbow and returns corresponding K-value.
Definition: elbow.py:206
pyclustering.cluster.elbow.elbow.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: elbow.py:215
pyclustering.cluster.elbow.elbow.__process_by_ccore
def __process_by_ccore(self)
Performs processing using C++ implementation.
Definition: elbow.py:136
pyclustering.cluster.elbow.elbow.__initializer
__initializer
Definition: elbow.py:97
pyclustering.cluster.elbow.elbow
Class represents Elbow method that is used to find out appropriate amount of clusters in a dataset.
Definition: elbow.py:22
pyclustering.cluster.elbow.elbow.__elbows
__elbows
Definition: elbow.py:113
pyclustering.cluster.kmeans
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
pyclustering.cluster.elbow.elbow.get_amount
def get_amount(self)
Returns appropriate amount of clusters.
Definition: elbow.py:168
pyclustering.cluster.elbow.elbow.__kmin
__kmin
Definition: elbow.py:109
pyclustering.cluster.elbow.elbow.__wce
__wce
Definition: elbow.py:112
pyclustering.cluster.elbow.elbow.get_wce
def get_wce(self)
Returns list of total within cluster errors for each K-value, for example, in case of kstep = 1: (kmi...
Definition: elbow.py:176
pyclustering.cluster.elbow.elbow.__kmax
__kmax
Definition: elbow.py:110
pyclustering.cluster.elbow.elbow.__random_state
__random_state
Definition: elbow.py:98