agglomerative.py
1 """!
2 
3 @brief Cluster analysis algorithm: agglomerative algorithm.
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum;
29 
30 from pyclustering.cluster.encoder import type_encoding;
31 
32 from pyclustering.utils import euclidean_distance_square;
33 
34 from pyclustering.core.wrapper import ccore_library
35 
36 import pyclustering.core.agglomerative_wrapper as wrapper;
37 
38 
39 class type_link(IntEnum):
40  """!
41  @brief Enumerator of types of link between clusters.
42 
43  """
44 
45 
46  SINGLE_LINK = 0;
47 
48 
49  COMPLETE_LINK = 1;
50 
51 
52  AVERAGE_LINK = 2;
53 
54 
55  CENTROID_LINK = 3;
56 
57 
59  """!
60  @brief Class represents agglomerative algorithm for cluster analysis.
61  @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and
62  step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
63 
64  Example of agglomerative algorithm where centroid link is used:
65  @code
66  from pyclustering.cluster.agglomerative import agglomerative, type_link
67  from pyclustering.cluster import cluster_visualizer
68  from pyclustering.samples.definitions import FCPS_SAMPLES
69  from pyclustering.utils import read_sample
70 
71  # Sample for cluster analysis (represented by list)
72  sample = read_sample(FCPS_SAMPLES.SAMPLE_TARGET)
73 
74  # Create object that uses python code only
75  agglomerative_instance = agglomerative(sample, 6, type_link.SINGLE_LINK, ccore=True)
76 
77  # Cluster analysis
78  agglomerative_instance.process()
79 
80  # Obtain results of clustering
81  clusters = agglomerative_instance.get_clusters()
82 
83  # Visualize clustering results
84  visualizer = cluster_visualizer()
85  visualizer.append_clusters(clusters, sample)
86  visualizer.show()
87  @endcode
88 
89  There is example of clustering 'LSUN' sample:
90  @code
91  from pyclustering.cluster.agglomerative import agglomerative, type_link
92  from pyclustering.cluster import cluster_visualizer
93  from pyclustering.samples.definitions import FCPS_SAMPLES
94  from pyclustering.utils import read_sample
95 
96  # sample Lsun for cluster analysis
97  lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
98 
99  # create instance of the algorithm that will use ccore library (the last argument)
100  agglomerative_instance = agglomerative(lsun_sample, 3, type_link.SINGLE_LINK, True)
101 
102  # start processing
103  agglomerative_instance.process()
104 
105  # get result and visualize it
106  lsun_clusters = agglomerative_instance.get_clusters()
107  visualizer = cluster_visualizer()
108  visualizer.append_clusters(lsun_clusters, lsun_sample)
109  visualizer.show()
110  @endcode
111 
112  Example of agglomerative clustering using different links:
113  @image html agglomerative_lsun_clustering_single_link.png
114 
115  """
116 
117  def __init__(self, data, number_clusters, link = None, ccore = True):
118  """!
119  @brief Constructor of agglomerative hierarchical algorithm.
120 
121  @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
122  [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
123  @param[in] number_clusters (uint): Number of clusters that should be allocated.
124  @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
125  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
126 
127  """
128 
129  self.__pointer_data = data
130  self.__number_clusters = number_clusters
131  self.__similarity = link
132 
133  if self.__similarity is None:
134  self.__similarity = type_link.CENTROID_LINK
135 
136  self.__clusters = []
137  self.__ccore = ccore
138  if (self.__ccore):
139  self.__ccore = ccore_library.workable()
140 
141  if (self.__similarity == type_link.CENTROID_LINK):
142  self.__centers = self.__pointer_data.copy() # used in case of usage of centroid links
143 
144 
145  def process(self):
146  """!
147  @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
148 
149  @see get_clusters()
150 
151  """
152 
153  if (self.__ccore is True):
154  self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
155 
156  else:
157  self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
158 
159  current_number_clusters = len(self.__clusters);
160 
161  while (current_number_clusters > self.__number_clusters):
163  current_number_clusters = len(self.__clusters);
164 
165 
166  def get_clusters(self):
167  """!
168  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
169 
170  @remark Results of clustering can be obtained using corresponding gets methods.
171 
172  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
173 
174  @see process()
175 
176  """
177 
178  return self.__clusters;
179 
180 
182  """!
183  @brief Returns clustering result representation type that indicate how clusters are encoded.
184 
185  @return (type_encoding) Clustering result representation.
186 
187  @see get_clusters()
188 
189  """
190 
191  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
192 
193 
194  def __merge_similar_clusters(self):
195  """!
196  @brief Merges the most similar clusters in line with link type.
197 
198  """
199 
200  if (self.__similarity == type_link.AVERAGE_LINK):
202 
203  elif (self.__similarity == type_link.CENTROID_LINK):
205 
206  elif (self.__similarity == type_link.COMPLETE_LINK):
208 
209  elif (self.__similarity == type_link.SINGLE_LINK):
210  self.__merge_by_signle_link();
211 
212  else:
213  raise NameError('Not supported similarity is used');
214 
215 
216  def __merge_by_average_link(self):
217  """!
218  @brief Merges the most similar clusters in line with average link type.
219 
220  """
221 
222  minimum_average_distance = float('Inf');
223 
224  for index_cluster1 in range(0, len(self.__clusters)):
225  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
226 
227  # Find farthest objects
228  candidate_average_distance = 0.0;
229  for index_object1 in self.__clusters[index_cluster1]:
230  for index_object2 in self.__clusters[index_cluster2]:
231  candidate_average_distance += euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
232 
233  candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
234 
235  if (candidate_average_distance < minimum_average_distance):
236  minimum_average_distance = candidate_average_distance;
237  indexes = [index_cluster1, index_cluster2];
238 
239  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
240  self.__clusters.pop(indexes[1]); # remove merged cluster.
241 
242 
243  def __merge_by_centroid_link(self):
244  """!
245  @brief Merges the most similar clusters in line with centroid link type.
246 
247  """
248 
249  minimum_centroid_distance = float('Inf');
250  indexes = None;
251 
252  for index1 in range(0, len(self.__centers)):
253  for index2 in range(index1 + 1, len(self.__centers)):
254  distance = euclidean_distance_square(self.__centers[index1], self.__centers[index2]);
255  if (distance < minimum_centroid_distance):
256  minimum_centroid_distance = distance;
257  indexes = [index1, index2];
258 
259  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
260  self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
261 
262  self.__clusters.pop(indexes[1]); # remove merged cluster.
263  self.__centers.pop(indexes[1]); # remove merged center.
264 
265 
266  def __merge_by_complete_link(self):
267  """!
268  @brief Merges the most similar clusters in line with complete link type.
269 
270  """
271 
272  minimum_complete_distance = float('Inf');
273  indexes = None;
274 
275  for index_cluster1 in range(0, len(self.__clusters)):
276  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
277  candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
278 
279  if (candidate_maximum_distance < minimum_complete_distance):
280  minimum_complete_distance = candidate_maximum_distance;
281  indexes = [index_cluster1, index_cluster2];
282 
283  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
284  self.__clusters.pop(indexes[1]); # remove merged cluster.
285 
286 
287  def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
288  """!
289  @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
290 
291  @param[in] (uint) Index of the first cluster.
292  @param[in] (uint) Index of the second cluster.
293 
294  @return The farthest euclidean distance between two clusters.
295 
296  """
297  candidate_maximum_distance = 0.0;
298  for index_object1 in self.__clusters[index_cluster1]:
299  for index_object2 in self.__clusters[index_cluster2]:
300  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
301 
302  if (distance > candidate_maximum_distance):
303  candidate_maximum_distance = distance;
304 
305  return candidate_maximum_distance;
306 
307 
308  def __merge_by_signle_link(self):
309  """!
310  @brief Merges the most similar clusters in line with single link type.
311 
312  """
313 
314  minimum_single_distance = float('Inf');
315  indexes = None;
316 
317  for index_cluster1 in range(0, len(self.__clusters)):
318  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
319  candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
320 
321  if (candidate_minimum_distance < minimum_single_distance):
322  minimum_single_distance = candidate_minimum_distance;
323  indexes = [index_cluster1, index_cluster2];
324 
325  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
326  self.__clusters.pop(indexes[1]); # remove merged cluster.
327 
328 
329  def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
330  """!
331  @brief Finds two nearest objects in two specified clusters and returns distance between them.
332 
333  @param[in] (uint) Index of the first cluster.
334  @param[in] (uint) Index of the second cluster.
335 
336  @return The nearest euclidean distance between two clusters.
337 
338  """
339  candidate_minimum_distance = float('Inf');
340 
341  for index_object1 in self.__clusters[index_cluster1]:
342  for index_object2 in self.__clusters[index_cluster2]:
343  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
344  if (distance < candidate_minimum_distance):
345  candidate_minimum_distance = distance;
346 
347  return candidate_minimum_distance;
348 
349 
350  def __calculate_center(self, cluster):
351  """!
352  @brief Calculates new center.
353 
354  @return (list) New value of the center of the specified cluster.
355 
356  """
357 
358  dimension = len(self.__pointer_data[cluster[0]]);
359  center = [0] * dimension;
360  for index_point in cluster:
361  for index_dimension in range(0, dimension):
362  center[index_dimension] += self.__pointer_data[index_point][index_dimension];
363 
364  for index_dimension in range(0, dimension):
365  center[index_dimension] /= len(cluster);
366 
367  return center;
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them...
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Class represents agglomerative algorithm for cluster analysis.
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
def __calculate_center(self, cluster)
Calculates new center.
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.