pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
agglomerative.py
1 """!
2 
3 @brief Cluster analysis algorithm: agglomerative algorithm.
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 from enum import IntEnum
14 
15 from pyclustering.cluster.encoder import type_encoding
16 
17 from pyclustering.utils import euclidean_distance_square
18 
19 from pyclustering.core.wrapper import ccore_library
20 
21 import pyclustering.core.agglomerative_wrapper as wrapper
22 
23 
24 class type_link(IntEnum):
25  """!
26  @brief Enumerator of types of link between clusters.
27 
28  """
29 
30 
31  SINGLE_LINK = 0
32 
33 
34  COMPLETE_LINK = 1
35 
36 
37  AVERAGE_LINK = 2
38 
39 
40  CENTROID_LINK = 3
41 
42 
44  """!
45  @brief Class represents agglomerative algorithm for cluster analysis.
46  @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beginning and
47  step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
48 
49  Example of agglomerative algorithm where centroid link is used:
50  @code
51  from pyclustering.cluster.agglomerative import agglomerative, type_link
52  from pyclustering.cluster import cluster_visualizer
53  from pyclustering.samples.definitions import FCPS_SAMPLES
54  from pyclustering.utils import read_sample
55 
56  # Sample for cluster analysis (represented by list)
57  sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET)
58 
59  # Create object that uses python code only
60  agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True)
61 
62  # Cluster analysis
63  agglomerative_instance.process()
64 
65  # Obtain results of clustering
66  clusters = agglomerative_instance.get_clusters()
67 
68  # Visualize clustering results
69  visualizer = cluster_visualizer()
70  visualizer.append_clusters(clusters, sample)
71  visualizer.show()
72  @endcode
73 
74  There is example of clustering 'LSUN' sample:
75  @code
76  from pyclustering.cluster.agglomerative import agglomerative, type_link
77  from pyclustering.cluster import cluster_visualizer
78  from pyclustering.samples.definitions import FCPS_SAMPLES
79  from pyclustering.utils import read_sample
80 
81  # sample Lsun for cluster analysis
82  lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
83 
84  # create instance of the algorithm that will use ccore library (the last argument)
85  agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True)
86 
87  # start processing
88  agglomerative_instance.process()
89 
90  # get result and visualize it
91  lsun_clusters = agglomerative_instance.get_clusters()
92  visualizer = cluster_visualizer()
93  visualizer.append_clusters(lsun_clusters, lsun_sample)
94  visualizer.show()
95  @endcode
96 
97  Example of agglomerative clustering using different links:
98  @image html agglomerative_lsun_clustering_single_link.png
99 
100  """
101 
102  def __init__(self, data, number_clusters, link = None, ccore = True):
103  """!
104  @brief Constructor of agglomerative hierarchical algorithm.
105 
106  @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
107  [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
108  @param[in] number_clusters (uint): Number of clusters that should be allocated.
109  @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
110  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
111 
112  """
113 
114  self.__pointer_data = data
115  self.__number_clusters = number_clusters
116  self.__similarity = link
117 
118  self.__verify_arguments()
119 
120  if self.__similarity is None:
121  self.__similarity = type_link.CENTROID_LINK
122 
123  self.__clusters = []
124  self.__ccore = ccore
125  if self.__ccore:
126  self.__ccore = ccore_library.workable()
127 
128  if self.__similarity == type_link.CENTROID_LINK:
129  self.__centers = self.__pointer_data.copy() # used in case of usage of centroid links
130 
131 
132  def process(self):
133  """!
134  @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
135 
136  @return (agglomerative) Returns itself (Agglomerative instance).
137 
138  @see get_clusters()
139 
140  """
141 
142  if self.__ccore is True:
143  self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity)
144 
145  else:
146  self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]
147 
148  current_number_clusters = len(self.__clusters)
149 
150  while current_number_clusters > self.__number_clusters:
152  current_number_clusters = len(self.__clusters)
153 
154  return self
155 
156 
157  def get_clusters(self):
158  """!
159  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
160 
161  @remark Results of clustering can be obtained using corresponding gets methods.
162 
163  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
164 
165  @see process()
166 
167  """
168 
169  return self.__clusters
170 
171 
173  """!
174  @brief Returns clustering result representation type that indicate how clusters are encoded.
175 
176  @return (type_encoding) Clustering result representation.
177 
178  @see get_clusters()
179 
180  """
181 
182  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
183 
184 
185  def __merge_similar_clusters(self):
186  """!
187  @brief Merges the most similar clusters in line with link type.
188 
189  """
190 
191  if self.__similarity == type_link.AVERAGE_LINK:
193 
194  elif self.__similarity == type_link.CENTROID_LINK:
196 
197  elif self.__similarity == type_link.COMPLETE_LINK:
199 
200  elif self.__similarity == type_link.SINGLE_LINK:
202 
203  else:
204  raise NameError('Not supported similarity is used')
205 
206 
207  def __merge_by_average_link(self):
208  """!
209  @brief Merges the most similar clusters in line with average link type.
210 
211  """
212 
213  minimum_average_distance = float('Inf')
214 
215  for index_cluster1 in range(0, len(self.__clusters)):
216  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
217 
218  # Find farthest objects
219  candidate_average_distance = 0.0
220  for index_object1 in self.__clusters[index_cluster1]:
221  for index_object2 in self.__clusters[index_cluster2]:
222  candidate_average_distance += euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
223 
224  candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]))
225 
226  if candidate_average_distance < minimum_average_distance:
227  minimum_average_distance = candidate_average_distance
228  indexes = [index_cluster1, index_cluster2]
229 
230  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
231  self.__clusters.pop(indexes[1]) # remove merged cluster.
232 
233 
234  def __merge_by_centroid_link(self):
235  """!
236  @brief Merges the most similar clusters in line with centroid link type.
237 
238  """
239 
240  minimum_centroid_distance = float('Inf')
241  indexes = None
242 
243  for index1 in range(0, len(self.__centers)):
244  for index2 in range(index1 + 1, len(self.__centers)):
245  distance = euclidean_distance_square(self.__centers[index1], self.__centers[index2])
246  if distance < minimum_centroid_distance:
247  minimum_centroid_distance = distance
248  indexes = [index1, index2]
249 
250  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
251  self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]])
252 
253  self.__clusters.pop(indexes[1]) # remove merged cluster.
254  self.__centers.pop(indexes[1]) # remove merged center.
255 
256 
257  def __merge_by_complete_link(self):
258  """!
259  @brief Merges the most similar clusters in line with complete link type.
260 
261  """
262 
263  minimum_complete_distance = float('Inf')
264  indexes = None
265 
266  for index_cluster1 in range(0, len(self.__clusters)):
267  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
268  candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2)
269 
270  if candidate_maximum_distance < minimum_complete_distance:
271  minimum_complete_distance = candidate_maximum_distance
272  indexes = [index_cluster1, index_cluster2]
273 
274  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
275  self.__clusters.pop(indexes[1]) # remove merged cluster.
276 
277 
278  def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
279  """!
280  @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
281 
282  @param[in] (uint) Index of the first cluster.
283  @param[in] (uint) Index of the second cluster.
284 
285  @return The farthest euclidean distance between two clusters.
286 
287  """
288  candidate_maximum_distance = 0.0
289  for index_object1 in self.__clusters[index_cluster1]:
290  for index_object2 in self.__clusters[index_cluster2]:
291  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
292 
293  if distance > candidate_maximum_distance:
294  candidate_maximum_distance = distance
295 
296  return candidate_maximum_distance
297 
298 
299  def __merge_by_signle_link(self):
300  """!
301  @brief Merges the most similar clusters in line with single link type.
302 
303  """
304 
305  minimum_single_distance = float('Inf')
306  indexes = None
307 
308  for index_cluster1 in range(0, len(self.__clusters)):
309  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
310  candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2)
311 
312  if candidate_minimum_distance < minimum_single_distance:
313  minimum_single_distance = candidate_minimum_distance
314  indexes = [index_cluster1, index_cluster2]
315 
316  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
317  self.__clusters.pop(indexes[1]) # remove merged cluster.
318 
319 
320  def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
321  """!
322  @brief Finds two nearest objects in two specified clusters and returns distance between them.
323 
324  @param[in] (uint) Index of the first cluster.
325  @param[in] (uint) Index of the second cluster.
326 
327  @return The nearest euclidean distance between two clusters.
328 
329  """
330  candidate_minimum_distance = float('Inf')
331 
332  for index_object1 in self.__clusters[index_cluster1]:
333  for index_object2 in self.__clusters[index_cluster2]:
334  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2])
335  if distance < candidate_minimum_distance:
336  candidate_minimum_distance = distance
337 
338  return candidate_minimum_distance
339 
340 
341  def __calculate_center(self, cluster):
342  """!
343  @brief Calculates new center.
344 
345  @return (list) New value of the center of the specified cluster.
346 
347  """
348 
349  dimension = len(self.__pointer_data[cluster[0]])
350  center = [0] * dimension
351  for index_point in cluster:
352  for index_dimension in range(0, dimension):
353  center[index_dimension] += self.__pointer_data[index_point][index_dimension]
354 
355  for index_dimension in range(0, dimension):
356  center[index_dimension] /= len(cluster)
357 
358  return center
359 
360 
361  def __verify_arguments(self):
362  """!
363  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
364 
365  """
366  if len(self.__pointer_data) == 0:
367  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
368 
369  if self.__number_clusters <= 0:
370  raise ValueError("Amount of cluster (current value: '%d') for allocation should be greater than 0." %
371  self.__number_clusters)
pyclustering.cluster.agglomerative.agglomerative.__calculate_farthest_distance
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them.
Definition: agglomerative.py:278
pyclustering.cluster.agglomerative.agglomerative.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: agglomerative.py:361
pyclustering.cluster.agglomerative.agglomerative.__merge_by_centroid_link
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Definition: agglomerative.py:234
pyclustering.cluster.agglomerative.agglomerative.__ccore
__ccore
Definition: agglomerative.py:124
pyclustering.cluster.agglomerative.agglomerative.process
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
Definition: agglomerative.py:132
pyclustering.cluster.agglomerative.agglomerative.__calculate_center
def __calculate_center(self, cluster)
Calculates new center.
Definition: agglomerative.py:341
pyclustering.cluster.agglomerative.agglomerative.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
Definition: agglomerative.py:157
pyclustering.cluster.agglomerative.agglomerative.__similarity
__similarity
Definition: agglomerative.py:116
pyclustering.cluster.agglomerative.agglomerative.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: agglomerative.py:172
pyclustering.cluster.agglomerative.agglomerative.__calculate_nearest_distance
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
Definition: agglomerative.py:320
pyclustering.cluster.agglomerative.agglomerative
Class represents agglomerative algorithm for cluster analysis.
Definition: agglomerative.py:43
pyclustering.cluster.agglomerative.agglomerative.__centers
__centers
Definition: agglomerative.py:129
pyclustering.cluster.agglomerative.agglomerative.__merge_by_average_link
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
Definition: agglomerative.py:207
pyclustering.cluster.agglomerative.agglomerative.__pointer_data
__pointer_data
Definition: agglomerative.py:114
pyclustering.cluster.agglomerative.agglomerative.__init__
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
Definition: agglomerative.py:102
pyclustering.cluster.agglomerative.agglomerative.__clusters
__clusters
Definition: agglomerative.py:123
pyclustering.cluster.agglomerative.agglomerative.__merge_by_complete_link
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.
Definition: agglomerative.py:257
pyclustering.utils
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
pyclustering.cluster.agglomerative.agglomerative.__merge_by_signle_link
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Definition: agglomerative.py:299
pyclustering.cluster.agglomerative.agglomerative.__merge_similar_clusters
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
Definition: agglomerative.py:185
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1
pyclustering.cluster.agglomerative.agglomerative.__number_clusters
__number_clusters
Definition: agglomerative.py:115