clarans.py
1 """!
2 
3 @brief Cluster analysis algorithm: CLARANS.
4 @details Implementation based on paper @cite article::clarans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import random
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import euclidean_distance_square
33 
34 
35 class clarans:
36  """!
37  @brief Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data mining).
38 
39  """
40 
41  def __init__(self, data, number_clusters, numlocal, maxneighbor):
42  """!
43  @brief Constructor of clustering algorithm CLARANS.
44  @details The higher the value of maxneighbor, the closer is CLARANS to K-Medoids, and the longer is each search of a local minima.
45 
46  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
47  @param[in] number_clusters (uint): Amount of clusters that should be allocated.
48  @param[in] numlocal (uint): The number of local minima obtained (amount of iterations for solving the problem).
49  @param[in] maxneighbor (uint): The maximum number of neighbors examined.
50 
51  """
52 
53  self.__pointer_data = data
54  self.__numlocal = numlocal
55  self.__maxneighbor = maxneighbor
56  self.__number_clusters = number_clusters
57 
58  self.__clusters = []
59  self.__current = []
60  self.__belong = []
61 
62  self.__optimal_medoids = []
63  self.__optimal_estimation = float('inf')
64 
65  self.__verify_arguments()
66 
67 
68  def __verify_arguments(self):
69  """!
70  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
71 
72  """
73  if len(self.__pointer_data) == 0:
74  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
75 
76  if self.__number_clusters <= 0:
77  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
78  self.__number_clusters)
79 
80  if self.__numlocal < 0:
81  raise ValueError("Local minima (current value: '%d') should be greater or equal to 0." % self.__numlocal)
82 
83  if self.__maxneighbor < 0:
84  raise ValueError("Maximum number of neighbors (current value: '%d') should be greater or "
85  "equal to 0." % self.__maxneighbor)
86 
87 
88  def process(self):
89  """!
90  @brief Performs cluster analysis in line with rules of CLARANS algorithm.
91 
92  @return (clarans) Returns itself (CLARANS instance).
93 
94  @see get_clusters()
95  @see get_medoids()
96 
97  """
98 
99  random.seed()
100 
101  for _ in range(0, self.__numlocal):
102  # set (current) random medoids
103  self.__current = random.sample(range(0, len(self.__pointer_data)), self.__number_clusters)
104 
105  # update clusters in line with random allocated medoids
106  self.__update_clusters(self.__current)
107 
108  # optimize configuration
110 
111  # obtain cost of current cluster configuration and compare it with the best obtained
112  estimation = self.__calculate_estimation()
113  if estimation < self.__optimal_estimation:
114  self.__optimal_medoids = self.__current[:]
115  self.__optimal_estimation = estimation
116 
118  return self
119 
120 
121  def get_clusters(self):
122  """!
123  @brief Returns allocated clusters by the algorithm.
124 
125  @remark Allocated clusters can be returned only after data processing (use method process()), otherwise empty list is returned.
126 
127  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
128 
129  @see process()
130  @see get_medoids()
131 
132  """
133 
134  return self.__clusters
135 
136 
137  def get_medoids(self):
138  """!
139  @brief Returns list of medoids of allocated clusters.
140 
141  @see process()
142  @see get_clusters()
143 
144  """
145 
146  return self.__optimal_medoids
147 
148 
150  """!
151  @brief Returns clustering result representation type that indicate how clusters are encoded.
152 
153  @return (type_encoding) Clustering result representation.
154 
155  @see get_clusters()
156 
157  """
158 
159  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
160 
161 
162  def __update_clusters(self, medoids):
163  """!
164  @brief Forms cluster in line with specified medoids by calculation distance from each point to medoids.
165 
166  """
167 
168  self.__belong = [0] * len(self.__pointer_data)
169  self.__clusters = [[] for i in range(len(medoids))]
170  for index_point in range(len(self.__pointer_data)):
171  index_optim = -1
172  dist_optim = 0.0
173 
174  for index in range(len(medoids)):
175  dist = euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[medoids[index]])
176 
177  if (dist < dist_optim) or (index is 0):
178  index_optim = index
179  dist_optim = dist
180 
181  self.__clusters[index_optim].append(index_point)
182  self.__belong[index_point] = index_optim
183 
184  # If cluster is not able to capture object it should be removed
185  self.__clusters = [cluster for cluster in self.__clusters if len(cluster) > 0]
186 
187 
188  def __optimize_configuration(self):
189  """!
190  @brief Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm's rules.
191 
192  """
193  index_neighbor = 0
194  while (index_neighbor < self.__maxneighbor):
195  # get random current medoid that is to be replaced
196  current_medoid_index = self.__current[random.randint(0, self.__number_clusters - 1)]
197  current_medoid_cluster_index = self.__belong[current_medoid_index]
198 
199  # get new candidate to be medoid
200  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
201 
202  while candidate_medoid_index in self.__current:
203  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
204 
205  candidate_cost = 0.0
206  for point_index in range(0, len(self.__pointer_data)):
207  if point_index not in self.__current:
208  # get non-medoid point and its medoid
209  point_cluster_index = self.__belong[point_index]
210  point_medoid_index = self.__current[point_cluster_index]
211 
212  # get other medoid that is nearest to the point (except current and candidate)
213  other_medoid_index = self.__find_another_nearest_medoid(point_index, current_medoid_index)
214  other_medoid_cluster_index = self.__belong[other_medoid_index]
215 
216  # for optimization calculate all required distances
217  # from the point to current medoid
218  distance_current = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
219 
220  # from the point to candidate median
221  distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[candidate_medoid_index])
222 
223  # from the point to nearest (own) medoid
224  distance_nearest = float('inf')
225  if ( (point_medoid_index != candidate_medoid_index) and (point_medoid_index != current_medoid_cluster_index) ):
226  distance_nearest = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[point_medoid_index])
227 
228  # apply rules for cost calculation
229  if (point_cluster_index == current_medoid_cluster_index):
230  # case 1:
231  if (distance_candidate >= distance_nearest):
232  candidate_cost += distance_nearest - distance_current
233 
234  # case 2:
235  else:
236  candidate_cost += distance_candidate - distance_current
237 
238  elif (point_cluster_index == other_medoid_cluster_index):
239  # case 3 ('nearest medoid' is the representative object of that cluster and object is more similar to 'nearest' than to 'candidate'):
240  if (distance_candidate > distance_nearest):
241  pass;
242 
243  # case 4:
244  else:
245  candidate_cost += distance_candidate - distance_nearest
246 
247  if (candidate_cost < 0):
248  # set candidate that has won
249  self.__current[current_medoid_cluster_index] = candidate_medoid_index
250 
251  # recalculate clusters
252  self.__update_clusters(self.__current)
253 
254  # reset iterations and starts investigation from the begining
255  index_neighbor = 0
256 
257  else:
258  index_neighbor += 1
259 
260 
261  def __find_another_nearest_medoid(self, point_index, current_medoid_index):
262  """!
263  @brief Finds the another nearest medoid for the specified point that is differ from the specified medoid.
264 
265  @param[in] point_index: index of point in dataspace for that searching of medoid in current list of medoids is perfomed.
266  @param[in] current_medoid_index: index of medoid that shouldn't be considered as a nearest.
267 
268  @return (uint) index of the another nearest medoid for the point.
269 
270  """
271  other_medoid_index = -1
272  other_distance_nearest = float('inf')
273  for index_medoid in self.__current:
274  if (index_medoid != current_medoid_index):
275  other_distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
276 
277  if other_distance_candidate < other_distance_nearest:
278  other_distance_nearest = other_distance_candidate
279  other_medoid_index = index_medoid
280 
281  return other_medoid_index
282 
283 
284  def __calculate_estimation(self):
285  """!
286  @brief Calculates estimation (cost) of the current clusters. The lower the estimation,
287  the more optimally configuration of clusters.
288 
289  @return (double) estimation of current clusters.
290 
291  """
292  estimation = 0.0
293  for index_cluster in range(0, len(self.__clusters)):
294  cluster = self.__clusters[index_cluster]
295  index_medoid = self.__current[index_cluster]
296  for index_point in cluster:
297  estimation += euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[index_medoid])
298 
299  return estimation
def get_clusters(self)
Returns allocated clusters by the algorithm.
Definition: clarans.py:121
def __update_clusters(self, medoids)
Forms cluster in line with specified medoids by calculation distance from each point to medoids...
Definition: clarans.py:162
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: clarans.py:68
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __init__(self, data, number_clusters, numlocal, maxneighbor)
Constructor of clustering algorithm CLARANS.
Definition: clarans.py:41
Module for representing clustering results.
Definition: encoder.py:1
def __calculate_estimation(self)
Calculates estimation (cost) of the current clusters.
Definition: clarans.py:284
def __optimize_configuration(self)
Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm&#39;s rules...
Definition: clarans.py:188
def get_medoids(self)
Returns list of medoids of allocated clusters.
Definition: clarans.py:137
def __find_another_nearest_medoid(self, point_index, current_medoid_index)
Finds the another nearest medoid for the specified point that is differ from the specified medoid...
Definition: clarans.py:261
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: clarans.py:149
def process(self)
Performs cluster analysis in line with rules of CLARANS algorithm.
Definition: clarans.py:88
Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data minin...
Definition: clarans.py:35