agglomerative.py
1 """!
2 
3 @brief Cluster analysis algorithm: agglomerative algorithm.
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum;
29 
30 from pyclustering.cluster.encoder import type_encoding;
31 
32 from pyclustering.utils import euclidean_distance_square;
33 
34 from pyclustering.core.wrapper import ccore_library
35 
36 import pyclustering.core.agglomerative_wrapper as wrapper;
37 
38 
39 class type_link(IntEnum):
40  """!
41  @brief Enumerator of types of link between clusters.
42 
43  """
44 
45 
46  SINGLE_LINK = 0;
47 
48 
49  COMPLETE_LINK = 1;
50 
51 
52  AVERAGE_LINK = 2;
53 
54 
55  CENTROID_LINK = 3;
56 
57 
59  """!
60  @brief Class represents agglomerative algorithm for cluster analysis.
61  @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and
62  step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
63 
64  Example of agglomerative algorithm where centroid link is used:
65  @code
66  from pyclustering.cluster.agglomerative import agglomerative, type_link
67  from pyclustering.cluster import cluster_visualizer
68  from pyclustering.samples.definitions import FCPS_SAMPLES
69  from pyclustering.utils import read_sample
70 
71  # Sample for cluster analysis (represented by list)
72  sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET)
73 
74  # Create object that uses python code only
75  agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True)
76 
77  # Cluster analysis
78  agglomerative_instance.process()
79 
80  # Obtain results of clustering
81  clusters = agglomerative_instance.get_clusters()
82 
83  # Visualize clustering results
84  visualizer = cluster_visualizer()
85  visualizer.append_clusters(clusters, sample)
86  visualizer.show()
87  @endcode
88 
89  There is example of clustering 'LSUN' sample:
90  @code
91  from pyclustering.cluster.agglomerative import agglomerative, type_link
92  from pyclustering.cluster import cluster_visualizer
93  from pyclustering.samples.definitions import FCPS_SAMPLES
94  from pyclustering.utils import read_sample
95 
96  # sample Lsun for cluster analysis
97  lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
98 
99  # create instance of the algorithm that will use ccore library (the last argument)
100  agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True)
101 
102  # start processing
103  agglomerative_instance.process()
104 
105  # get result and visualize it
106  lsun_clusters = agglomerative_instance.get_clusters()
107  visualizer = cluster_visualizer()
108  visualizer.append_clusters(lsun_clusters, lsun_sample)
109  visualizer.show()
110  @endcode
111 
112  Example of agglomerative clustering using different links:
113  @image html agglomerative_lsun_clustering_single_link.png
114 
115  """
116 
117  def __init__(self, data, number_clusters, link = None, ccore = True):
118  """!
119  @brief Constructor of agglomerative hierarchical algorithm.
120 
121  @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
122  [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
123  @param[in] number_clusters (uint): Number of clusters that should be allocated.
124  @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
125  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
126 
127  """
128 
129  self.__pointer_data = data
130  self.__number_clusters = number_clusters
131  self.__similarity = link
132 
133  self.__verify_arguments()
134 
135  if self.__similarity is None:
136  self.__similarity = type_link.CENTROID_LINK
137 
138  self.__clusters = []
139  self.__ccore = ccore
140  if self.__ccore:
141  self.__ccore = ccore_library.workable()
142 
143  if self.__similarity == type_link.CENTROID_LINK:
144  self.__centers = self.__pointer_data.copy() # used in case of usage of centroid links
145 
146 
147  def process(self):
148  """!
149  @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
150 
151  @return (agglomerative) Returns itself (Agglomerative instance).
152 
153  @see get_clusters()
154 
155  """
156 
157  if self.__ccore is True:
158  self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity)
159 
160  else:
161  self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]
162 
163  current_number_clusters = len(self.__clusters)
164 
165  while current_number_clusters > self.__number_clusters:
167  current_number_clusters = len(self.__clusters)
168 
169  return self
170 
171 
172  def get_clusters(self):
173  """!
174  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
175 
176  @remark Results of clustering can be obtained using corresponding gets methods.
177 
178  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
179 
180  @see process()
181 
182  """
183 
184  return self.__clusters
185 
186 
188  """!
189  @brief Returns clustering result representation type that indicate how clusters are encoded.
190 
191  @return (type_encoding) Clustering result representation.
192 
193  @see get_clusters()
194 
195  """
196 
197  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
198 
199 
200  def __merge_similar_clusters(self):
201  """!
202  @brief Merges the most similar clusters in line with link type.
203 
204  """
205 
206  if self.__similarity == type_link.AVERAGE_LINK:
208 
209  elif self.__similarity == type_link.CENTROID_LINK:
211 
212  elif self.__similarity == type_link.COMPLETE_LINK:
214 
215  elif self.__similarity == type_link.SINGLE_LINK:
217 
218  else:
219  raise NameError('Not supported similarity is used')
220 
221 
222  def __merge_by_average_link(self):
223  """!
224  @brief Merges the most similar clusters in line with average link type.
225 
226  """
227 
228  minimum_average_distance = float('Inf')
229 
230  for index_cluster1 in range(0, len(self.__clusters)):
231  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
232 
233  # Find farthest objects
234  candidate_average_distance = 0.0
235  for index_object1 in self.__clusters[index_cluster1]:
236  for index_object2 in self.__clusters[index_cluster2]:
237  candidate_average_distance += euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
238 
239  candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]))
240 
241  if candidate_average_distance < minimum_average_distance:
242  minimum_average_distance = candidate_average_distance
243  indexes = [index_cluster1, index_cluster2]
244 
245  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
246  self.__clusters.pop(indexes[1]) # remove merged cluster.
247 
248 
249  def __merge_by_centroid_link(self):
250  """!
251  @brief Merges the most similar clusters in line with centroid link type.
252 
253  """
254 
255  minimum_centroid_distance = float('Inf')
256  indexes = None
257 
258  for index1 in range(0, len(self.__centers)):
259  for index2 in range(index1 + 1, len(self.__centers)):
260  distance = euclidean_distance_square(self.__centers[index1], self.__centers[index2])
261  if distance < minimum_centroid_distance:
262  minimum_centroid_distance = distance
263  indexes = [index1, index2]
264 
265  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
266  self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]])
267 
268  self.__clusters.pop(indexes[1]) # remove merged cluster.
269  self.__centers.pop(indexes[1]) # remove merged center.
270 
271 
272  def __merge_by_complete_link(self):
273  """!
274  @brief Merges the most similar clusters in line with complete link type.
275 
276  """
277 
278  minimum_complete_distance = float('Inf')
279  indexes = None
280 
281  for index_cluster1 in range(0, len(self.__clusters)):
282  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
283  candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2)
284 
285  if candidate_maximum_distance < minimum_complete_distance:
286  minimum_complete_distance = candidate_maximum_distance
287  indexes = [index_cluster1, index_cluster2]
288 
289  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
290  self.__clusters.pop(indexes[1]) # remove merged cluster.
291 
292 
293  def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
294  """!
295  @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
296 
297  @param[in] (uint) Index of the first cluster.
298  @param[in] (uint) Index of the second cluster.
299 
300  @return The farthest euclidean distance between two clusters.
301 
302  """
303  candidate_maximum_distance = 0.0
304  for index_object1 in self.__clusters[index_cluster1]:
305  for index_object2 in self.__clusters[index_cluster2]:
306  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
307 
308  if distance > candidate_maximum_distance:
309  candidate_maximum_distance = distance
310 
311  return candidate_maximum_distance
312 
313 
314  def __merge_by_signle_link(self):
315  """!
316  @brief Merges the most similar clusters in line with single link type.
317 
318  """
319 
320  minimum_single_distance = float('Inf')
321  indexes = None
322 
323  for index_cluster1 in range(0, len(self.__clusters)):
324  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
325  candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2)
326 
327  if candidate_minimum_distance < minimum_single_distance:
328  minimum_single_distance = candidate_minimum_distance
329  indexes = [index_cluster1, index_cluster2]
330 
331  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
332  self.__clusters.pop(indexes[1]) # remove merged cluster.
333 
334 
335  def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
336  """!
337  @brief Finds two nearest objects in two specified clusters and returns distance between them.
338 
339  @param[in] (uint) Index of the first cluster.
340  @param[in] (uint) Index of the second cluster.
341 
342  @return The nearest euclidean distance between two clusters.
343 
344  """
345  candidate_minimum_distance = float('Inf')
346 
347  for index_object1 in self.__clusters[index_cluster1]:
348  for index_object2 in self.__clusters[index_cluster2]:
349  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
350  if distance < candidate_minimum_distance:
351  candidate_minimum_distance = distance
352 
353  return candidate_minimum_distance
354 
355 
356  def __calculate_center(self, cluster):
357  """!
358  @brief Calculates new center.
359 
360  @return (list) New value of the center of the specified cluster.
361 
362  """
363 
364  dimension = len(self.__pointer_data[cluster[0]])
365  center = [0] * dimension
366  for index_point in cluster:
367  for index_dimension in range(0, dimension):
368  center[index_dimension] += self.__pointer_data[index_point][index_dimension]
369 
370  for index_dimension in range(0, dimension):
371  center[index_dimension] /= len(cluster)
372 
373  return center
374 
375 
376  def __verify_arguments(self):
377  """!
378  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
379 
380  """
381  if len(self.__pointer_data) == 0:
382  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
383 
384  if self.__number_clusters <= 0:
385  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
386  self.__number_clusters)
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them...
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Class represents agglomerative algorithm for cluster analysis.
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
def __calculate_center(self, cluster)
Calculates new center.
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.