pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
clarans.py
1 """!
2 
3 @brief Cluster analysis algorithm: CLARANS.
4 @details Implementation based on paper @cite article::clarans::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 import random
14 
15 from pyclustering.cluster.encoder import type_encoding
16 
17 from pyclustering.utils import euclidean_distance_square
18 
19 
20 class clarans:
21  """!
22  @brief Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data mining).
23 
24  """
25 
26  def __init__(self, data, number_clusters, numlocal, maxneighbor):
27  """!
28  @brief Constructor of clustering algorithm CLARANS.
29  @details The higher the value of maxneighbor, the closer is CLARANS to K-Medoids, and the longer is each search of a local minima.
30 
31  @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
32  @param[in] number_clusters (uint): Amount of clusters that should be allocated.
33  @param[in] numlocal (uint): The number of local minima obtained (amount of iterations for solving the problem).
34  @param[in] maxneighbor (uint): The maximum number of neighbors examined.
35 
36  """
37 
38  self.__pointer_data = data
39  self.__numlocal = numlocal
40  self.__maxneighbor = maxneighbor
41  self.__number_clusters = number_clusters
42 
43  self.__clusters = []
44  self.__current = []
45  self.__belong = []
46 
47  self.__optimal_medoids = []
48  self.__optimal_estimation = float('inf')
49 
50  self.__verify_arguments()
51 
52 
53  def __verify_arguments(self):
54  """!
55  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
56 
57  """
58  if len(self.__pointer_data) == 0:
59  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
60 
61  if self.__number_clusters <= 0:
62  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
63  self.__number_clusters)
64 
65  if self.__numlocal < 0:
66  raise ValueError("Local minima (current value: '%d') should be greater or equal to 0." % self.__numlocal)
67 
68  if self.__maxneighbor < 0:
69  raise ValueError("Maximum number of neighbors (current value: '%d') should be greater or "
70  "equal to 0." % self.__maxneighbor)
71 
72 
73  def process(self):
74  """!
75  @brief Performs cluster analysis in line with rules of CLARANS algorithm.
76 
77  @return (clarans) Returns itself (CLARANS instance).
78 
79  @see get_clusters()
80  @see get_medoids()
81 
82  """
83 
84  random.seed()
85 
86  for _ in range(0, self.__numlocal):
87  # set (current) random medoids
88  self.__current = random.sample(range(0, len(self.__pointer_data)), self.__number_clusters)
89 
90  # update clusters in line with random allocated medoids
91  self.__update_clusters(self.__current)
92 
93  # optimize configuration
95 
96  # obtain cost of current cluster configuration and compare it with the best obtained
97  estimation = self.__calculate_estimation()
98  if estimation < self.__optimal_estimation:
99  self.__optimal_medoids = self.__current[:]
100  self.__optimal_estimation = estimation
101 
103  return self
104 
105 
106  def get_clusters(self):
107  """!
108  @brief Returns allocated clusters by the algorithm.
109 
110  @remark Allocated clusters can be returned only after data processing (use method process()), otherwise empty list is returned.
111 
112  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
113 
114  @see process()
115  @see get_medoids()
116 
117  """
118 
119  return self.__clusters
120 
121 
122  def get_medoids(self):
123  """!
124  @brief Returns list of medoids of allocated clusters.
125 
126  @see process()
127  @see get_clusters()
128 
129  """
130 
131  return self.__optimal_medoids
132 
133 
135  """!
136  @brief Returns clustering result representation type that indicate how clusters are encoded.
137 
138  @return (type_encoding) Clustering result representation.
139 
140  @see get_clusters()
141 
142  """
143 
144  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
145 
146 
147  def __update_clusters(self, medoids):
148  """!
149  @brief Forms cluster in line with specified medoids by calculation distance from each point to medoids.
150 
151  """
152 
153  self.__belong = [0] * len(self.__pointer_data)
154  self.__clusters = [[] for i in range(len(medoids))]
155  for index_point in range(len(self.__pointer_data)):
156  index_optim = -1
157  dist_optim = 0.0
158 
159  for index in range(len(medoids)):
160  dist = euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[medoids[index]])
161 
162  if (dist < dist_optim) or (index == 0):
163  index_optim = index
164  dist_optim = dist
165 
166  self.__clusters[index_optim].append(index_point)
167  self.__belong[index_point] = index_optim
168 
169  # If cluster is not able to capture object it should be removed
170  self.__clusters = [cluster for cluster in self.__clusters if len(cluster) > 0]
171 
172 
173  def __optimize_configuration(self):
174  """!
175  @brief Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm's rules.
176 
177  """
178  index_neighbor = 0
179  while (index_neighbor < self.__maxneighbor):
180  # get random current medoid that is to be replaced
181  current_medoid_index = self.__current[random.randint(0, self.__number_clusters - 1)]
182  current_medoid_cluster_index = self.__belong[current_medoid_index]
183 
184  # get new candidate to be medoid
185  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
186 
187  while candidate_medoid_index in self.__current:
188  candidate_medoid_index = random.randint(0, len(self.__pointer_data) - 1)
189 
190  candidate_cost = 0.0
191  for point_index in range(0, len(self.__pointer_data)):
192  if point_index not in self.__current:
193  # get non-medoid point and its medoid
194  point_cluster_index = self.__belong[point_index]
195  point_medoid_index = self.__current[point_cluster_index]
196 
197  # get other medoid that is nearest to the point (except current and candidate)
198  other_medoid_index = self.__find_another_nearest_medoid(point_index, current_medoid_index)
199  other_medoid_cluster_index = self.__belong[other_medoid_index]
200 
201  # for optimization calculate all required distances
202  # from the point to current medoid
203  distance_current = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
204 
205  # from the point to candidate median
206  distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[candidate_medoid_index])
207 
208  # from the point to nearest (own) medoid
209  distance_nearest = float('inf')
210  if ( (point_medoid_index != candidate_medoid_index) and (point_medoid_index != current_medoid_cluster_index) ):
211  distance_nearest = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[point_medoid_index])
212 
213  # apply rules for cost calculation
214  if (point_cluster_index == current_medoid_cluster_index):
215  # case 1:
216  if (distance_candidate >= distance_nearest):
217  candidate_cost += distance_nearest - distance_current
218 
219  # case 2:
220  else:
221  candidate_cost += distance_candidate - distance_current
222 
223  elif (point_cluster_index == other_medoid_cluster_index):
224  # case 3 ('nearest medoid' is the representative object of that cluster and object is more similar to 'nearest' than to 'candidate'):
225  if (distance_candidate > distance_nearest):
226  pass;
227 
228  # case 4:
229  else:
230  candidate_cost += distance_candidate - distance_nearest
231 
232  if (candidate_cost < 0):
233  # set candidate that has won
234  self.__current[current_medoid_cluster_index] = candidate_medoid_index
235 
236  # recalculate clusters
237  self.__update_clusters(self.__current)
238 
239  # reset iterations and starts investigation from the begining
240  index_neighbor = 0
241 
242  else:
243  index_neighbor += 1
244 
245 
246  def __find_another_nearest_medoid(self, point_index, current_medoid_index):
247  """!
248  @brief Finds the another nearest medoid for the specified point that is differ from the specified medoid.
249 
250  @param[in] point_index: index of point in dataspace for that searching of medoid in current list of medoids is perfomed.
251  @param[in] current_medoid_index: index of medoid that shouldn't be considered as a nearest.
252 
253  @return (uint) index of the another nearest medoid for the point.
254 
255  """
256  other_medoid_index = -1
257  other_distance_nearest = float('inf')
258  for index_medoid in self.__current:
259  if (index_medoid != current_medoid_index):
260  other_distance_candidate = euclidean_distance_square(self.__pointer_data[point_index], self.__pointer_data[current_medoid_index])
261 
262  if other_distance_candidate < other_distance_nearest:
263  other_distance_nearest = other_distance_candidate
264  other_medoid_index = index_medoid
265 
266  return other_medoid_index
267 
268 
269  def __calculate_estimation(self):
270  """!
271  @brief Calculates estimation (cost) of the current clusters. The lower the estimation,
272  the more optimally configuration of clusters.
273 
274  @return (double) estimation of current clusters.
275 
276  """
277  estimation = 0.0
278  for index_cluster in range(0, len(self.__clusters)):
279  cluster = self.__clusters[index_cluster]
280  index_medoid = self.__current[index_cluster]
281  for index_point in cluster:
282  estimation += euclidean_distance_square(self.__pointer_data[index_point], self.__pointer_data[index_medoid])
283 
284  return estimation
pyclustering.cluster.clarans.clarans.__optimal_medoids
__optimal_medoids
Definition: clarans.py:47
pyclustering.cluster.clarans.clarans.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: clarans.py:53
pyclustering.cluster.clarans.clarans.__clusters
__clusters
Definition: clarans.py:43
pyclustering.cluster.clarans.clarans.__pointer_data
__pointer_data
Definition: clarans.py:38
pyclustering.cluster.clarans.clarans.__maxneighbor
__maxneighbor
Definition: clarans.py:40
pyclustering.cluster.clarans.clarans.get_medoids
def get_medoids(self)
Returns list of medoids of allocated clusters.
Definition: clarans.py:122
pyclustering.cluster.clarans.clarans.__current
__current
Definition: clarans.py:44
pyclustering.cluster.clarans.clarans.__number_clusters
__number_clusters
Definition: clarans.py:41
pyclustering.cluster.clarans.clarans.get_clusters
def get_clusters(self)
Returns allocated clusters by the algorithm.
Definition: clarans.py:106
pyclustering.cluster.clarans.clarans.__optimal_estimation
__optimal_estimation
Definition: clarans.py:48
pyclustering.cluster.clarans.clarans.__update_clusters
def __update_clusters(self, medoids)
Forms cluster in line with specified medoids by calculation distance from each point to medoids.
Definition: clarans.py:147
pyclustering.cluster.clarans.clarans.__calculate_estimation
def __calculate_estimation(self)
Calculates estimation (cost) of the current clusters.
Definition: clarans.py:269
pyclustering.cluster.clarans.clarans.process
def process(self)
Performs cluster analysis in line with rules of CLARANS algorithm.
Definition: clarans.py:73
pyclustering.cluster.clarans.clarans.__numlocal
__numlocal
Definition: clarans.py:39
pyclustering.cluster.clarans.clarans.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: clarans.py:134
pyclustering.utils
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
pyclustering.cluster.clarans.clarans
Class represents clustering algorithm CLARANS (a method for clustering objects for spatial data minin...
Definition: clarans.py:20
pyclustering.cluster.clarans.clarans.__find_another_nearest_medoid
def __find_another_nearest_medoid(self, point_index, current_medoid_index)
Finds the another nearest medoid for the specified point that is differ from the specified medoid.
Definition: clarans.py:246
pyclustering.cluster.clarans.clarans.__belong
__belong
Definition: clarans.py:45
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.cluster.clarans.clarans.__init__
def __init__(self, data, number_clusters, numlocal, maxneighbor)
Constructor of clustering algorithm CLARANS.
Definition: clarans.py:26
pyclustering.cluster.clarans.clarans.__optimize_configuration
def __optimize_configuration(self)
Finds quasi-optimal medoids and updates in line with them clusters in line with algorithm's rules.
Definition: clarans.py:173