clarans.py
1 """!
2 
3 @brief Cluster analysis algorithm: CLARANS.
4 @details Implementation based on paper @cite article::clarans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import random
29 
30 from pyclustering.cluster.encoder import type_encoding
31 
32 from pyclustering.utils import euclidean_distance_square
33 
34 
35 class clarans:
36  """!
37  @brief Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data mining).
38 
39  """
40 
41  def __init__(self, data, number_clusters, numlocal, maxneighbor):
42  """!
43  @brief Constructor of clustering algorithm CLARANS.
44  @details The higher the value of maxneighbor, the closer is CLARANS to K-Medoids, and the longer is each search of a local minima.
45 
46  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
47  @param[in] number_clusters (uint): amount of clusters that should be allocated.
48  @param[in] numlocal (uint): the number of local minima obtained (amount of iterations for solving the problem).
49  @param[in] maxneighbor (uint): the maximum number of neighbors examined.
50 
51  """
52 
53  self.__pointer_data = data
54  self.__numlocal = numlocal
55  self.__maxneighbor = maxneighbor
56  self.__number_clusters = number_clusters
57 
58  self.__clusters = []
59  self.__current = []
60  self.__belong = []
61 
62  self.__optimal_medoids = []
63  self.__optimal_estimation = float('inf')
64 
65 
66  def process(self):
67  """!
68  @brief Performs cluster analysis in line with rules of CLARANS algorithm.
69 
70  @see get_clusters()
71  @see get_medoids()
72 
73  """
74 
75  random.seed()
76 
77  for _ in range(0, self.__numlocal):
78  # set (current) random medoids
79  self.__current = random.sample(range(0, len(self.__pointer_data)), self.__number_clusters)
80 
81  # update clusters in line with random allocated medoids
82  self.__update_clusters(self.__current)
83 
84  # optimize configuration
86 
87  # obtain cost of current cluster configuration and compare it with the best obtained
88  estimation = self.__calculate_estimation()
89  if estimation < self.__optimal_estimation:
90  self.__optimal_medoids = self.__current[:]
91  self.__optimal_estimation = estimation
92 
94 
95 
96  def get_clusters(self):
97  """!
98  @brief Returns allocated clusters by the algorithm.
99 
100  @remark Allocated clusters can be returned only after data processing (use method process()), otherwise empty list is returned.
101 
102  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
103 
104  @see process()
105  @see get_medoids()
106 
107  """
108 
109  return self.__clusters
110 
111 
112  def get_medoids(self):
113  """!
114  @brief Returns list of medoids of allocated clusters.
115 
116  @see process()
117  @see get_clusters()
118 
119  """
120 
121  return self.__optimal_medoids
122 
123 
125  """!
126  @brief Returns clustering result representation type that indicate how clusters are encoded.
127 
128  @return (type_encoding) Clustering result representation.
129 
130  @see get_clusters()
131 
132  """
133 
134  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
135 
136 
137  def __update_clusters(self, medoids):
138  """!
139  @brief Forms cluster in line with specified medoids by calculation distance from each point to medoids.
140 
141  """
142 
143  self.__belong = [0] * len(self.__pointer_data)
144  self.__clusters = [[] for i in range(len(medoids))]
145  for index_point in range(len(self.__pointer_data)):
146  index_optim = -1
147  dist_optim = 0.0
148 
149  for index in range(len(medoids)):
150  dist = euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[medoids[index]])
151 
152  if (dist < dist_optim) or (index is 0):
153  index_optim = index
154  dist_optim = dist
155 
156  self.__clusters[index_optim].append(index_point)
157  self.__belong[index_point] = index_optim
158 
159  # If cluster is not able to capture object it should be removed
160  self.__clusters = [cluster for cluster in self.__clusters if len(cluster) > 0]
161 
162 
163  def __optimize_configuration(self):
164  """!
165  @brief Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm's rules.
166 
167  """
168  index_neighbor = 0
169  while (index_neighbor < self.__maxneighbor):
170  # get random current medoid that is to be replaced
171  current_medoid_index = self.__current[random.randint(0, self.__number_clusters - 1)]
172  current_medoid_cluster_index = self.__belong[current_medoid_index]
173 
174  # get new candidate to be medoid
175  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
176 
177  while candidate_medoid_index in self.__current:
178  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
179 
180  candidate_cost = 0.0
181  for point_index in range(0, len(self.__pointer_data)):
182  if point_index not in self.__current:
183  # get non-medoid point and its medoid
184  point_cluster_index = self.__belong[point_index]
185  point_medoid_index = self.__current[point_cluster_index]
186 
187  # get other medoid that is nearest to the point (except current and candidate)
188  other_medoid_index = self.__find_another_nearest_medoid(point_index, current_medoid_index)
189  other_medoid_cluster_index = self.__belong[other_medoid_index]
190 
191  # for optimization calculate all required distances
192  # from the point to current medoid
193  distance_current = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
194 
195  # from the point to candidate median
196  distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[candidate_medoid_index])
197 
198  # from the point to nearest (own) medoid
199  distance_nearest = float('inf')
200  if ( (point_medoid_index != candidate_medoid_index) and (point_medoid_index != current_medoid_cluster_index) ):
201  distance_nearest = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[point_medoid_index])
202 
203  # apply rules for cost calculation
204  if (point_cluster_index == current_medoid_cluster_index):
205  # case 1:
206  if (distance_candidate >= distance_nearest):
207  candidate_cost += distance_nearest - distance_current
208 
209  # case 2:
210  else:
211  candidate_cost += distance_candidate - distance_current
212 
213  elif (point_cluster_index == other_medoid_cluster_index):
214  # case 3 ('nearest medoid' is the representative object of that cluster and object is more similar to 'nearest' than to 'candidate'):
215  if (distance_candidate > distance_nearest):
216  pass;
217 
218  # case 4:
219  else:
220  candidate_cost += distance_candidate - distance_nearest
221 
222  if (candidate_cost < 0):
223  # set candidate that has won
224  self.__current[current_medoid_cluster_index] = candidate_medoid_index
225 
226  # recalculate clusters
227  self.__update_clusters(self.__current)
228 
229  # reset iterations and starts investigation from the begining
230  index_neighbor = 0
231 
232  else:
233  index_neighbor += 1
234 
235 
236  def __find_another_nearest_medoid(self, point_index, current_medoid_index):
237  """!
238  @brief Finds the another nearest medoid for the specified point that is differ from the specified medoid.
239 
240  @param[in] point_index: index of point in dataspace for that searching of medoid in current list of medoids is perfomed.
241  @param[in] current_medoid_index: index of medoid that shouldn't be considered as a nearest.
242 
243  @return (uint) index of the another nearest medoid for the point.
244 
245  """
246  other_medoid_index = -1
247  other_distance_nearest = float('inf')
248  for index_medoid in self.__current:
249  if (index_medoid != current_medoid_index):
250  other_distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
251 
252  if other_distance_candidate < other_distance_nearest:
253  other_distance_nearest = other_distance_candidate
254  other_medoid_index = index_medoid
255 
256  return other_medoid_index
257 
258 
259  def __calculate_estimation(self):
260  """!
261  @brief Calculates estimation (cost) of the current clusters. The lower the estimation,
262  the more optimally configuration of clusters.
263 
264  @return (double) estimation of current clusters.
265 
266  """
267  estimation = 0.0
268  for index_cluster in range(0, len(self.__clusters)):
269  cluster = self.__clusters[index_cluster]
270  index_medoid = self.__current[index_cluster]
271  for index_point in cluster:
272  estimation += euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[index_medoid])
273 
274  return estimation
def get_clusters(self)
Returns allocated clusters by the algorithm.
Definition: clarans.py:96
def __update_clusters(self, medoids)
Forms cluster in line with specified medoids by calculation distance from each point to medoids...
Definition: clarans.py:137
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def __init__(self, data, number_clusters, numlocal, maxneighbor)
Constructor of clustering algorithm CLARANS.
Definition: clarans.py:41
Module for representing clustering results.
Definition: encoder.py:1
def __calculate_estimation(self)
Calculates estimation (cost) of the current clusters.
Definition: clarans.py:259
def __optimize_configuration(self)
Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm&#39;s rules...
Definition: clarans.py:163
def get_medoids(self)
Returns list of medoids of allocated clusters.
Definition: clarans.py:112
def __find_another_nearest_medoid(self, point_index, current_medoid_index)
Finds the another nearest medoid for the specified point that is differ from the specified medoid...
Definition: clarans.py:236
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: clarans.py:124
def process(self)
Performs cluster analysis in line with rules of CLARANS algorithm.
Definition: clarans.py:66
Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data minin...
Definition: clarans.py:35