agglomerative.py
1 """!
2 
3 @brief Cluster analysis algorithm: agglomerative algorithm.
4 @details Implementation based on paper @cite book::algorithms_for_clustering_data::1988.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from enum import IntEnum;
29 
30 from pyclustering.cluster.encoder import type_encoding;
31 
32 from pyclustering.utils import euclidean_distance_square;
33 
34 from pyclustering.core.wrapper import ccore_library
35 
36 import pyclustering.core.agglomerative_wrapper as wrapper;
37 
38 
39 class type_link(IntEnum):
40  """!
41  @brief Enumerator of types of link between clusters.
42 
43  """
44 
45 
46  SINGLE_LINK = 0;
47 
48 
49  COMPLETE_LINK = 1;
50 
51 
52  AVERAGE_LINK = 2;
53 
54 
55  CENTROID_LINK = 3;
56 
57 
59  """!
60  @brief Class represents agglomerative algorithm for cluster analysis.
61  @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and
62  step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
63 
64  CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
65 
66  Example of agglomerative algorithm where centroid link is used:
67  @code
68  # sample for cluster analysis (represented by list)
69  sample = read_sample(path_to_sample);
70 
71  # create object that uses python code only
72  agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
73 
74  # cluster analysis
75  agglomerative_instance.process();
76 
77  # obtain results of clustering
78  clusters = agglomerative_instance.get_clusters();
79  @endcode
80 
81  Algorithm performance can be improved if 'ccore' flag is on. In this case C++ library will be called for clustering.
82  There is example of clustering 'LSUN' sample when usage of single or complete link will take a lot of resources and
83  when core usage is prefereble.
84  @code
85  # sample Lsun for cluster analysis
86  lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
87 
88  # create instance of the algorithm that will use ccore library (the last argument)
89  agglomerative_instance = agglomerative(lsun_sample, 3, link_type.SINGLE_LINK, True);
90 
91  # start processing
92  agglomerative_instance.process();
93 
94  # get result and visualize it
95  lsun_clusters = agglomerative_instance.get_clusters();
96  visualizer = cluster_visualizer();
97  visualizer.append_clusters(lsun_clusters, lsun_sample);
98  visualizer.show();
99  @endcode
100 
101  Example of agglomerative clustering using different links:
102  @image html agglomerative_lsun_clustering_single_link.png
103 
104  """
105 
106  def __init__(self, data, number_clusters, link = None, ccore = True):
107  """!
108  @brief Constructor of agglomerative hierarchical algorithm.
109 
110  @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
111  [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
112  @param[in] number_clusters (uint): Number of clusters that should be allocated.
113  @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
114  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
115 
116  """
117 
118  self.__pointer_data = data;
119  self.__number_clusters = number_clusters;
120  self.__similarity = link;
121 
122  if (self.__similarity is None):
123  self.__similarity = type_link.CENTROID_LINK;
124 
125  self.__clusters = [];
126  self.__ccore = ccore;
127  if (self.__ccore):
128  self.__ccore = ccore_library.workable();
129 
130  if (self.__similarity == type_link.CENTROID_LINK):
131  self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links
132 
133 
134  def process(self):
135  """!
136  @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
137 
138  @see get_clusters()
139 
140  """
141 
142  if (self.__ccore is True):
143  self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
144 
145  else:
146  self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
147 
148  current_number_clusters = len(self.__clusters);
149 
150  while (current_number_clusters > self.__number_clusters):
152  current_number_clusters = len(self.__clusters);
153 
154 
155  def get_clusters(self):
156  """!
157  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
158 
159  @remark Results of clustering can be obtained using corresponding gets methods.
160 
161  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
162 
163  @see process()
164 
165  """
166 
167  return self.__clusters;
168 
169 
171  """!
172  @brief Returns clustering result representation type that indicate how clusters are encoded.
173 
174  @return (type_encoding) Clustering result representation.
175 
176  @see get_clusters()
177 
178  """
179 
180  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
181 
182 
183  def __merge_similar_clusters(self):
184  """!
185  @brief Merges the most similar clusters in line with link type.
186 
187  """
188 
189  if (self.__similarity == type_link.AVERAGE_LINK):
191 
192  elif (self.__similarity == type_link.CENTROID_LINK):
194 
195  elif (self.__similarity == type_link.COMPLETE_LINK):
197 
198  elif (self.__similarity == type_link.SINGLE_LINK):
199  self.__merge_by_signle_link();
200 
201  else:
202  raise NameError('Not supported similarity is used');
203 
204 
205  def __merge_by_average_link(self):
206  """!
207  @brief Merges the most similar clusters in line with average link type.
208 
209  """
210 
211  minimum_average_distance = float('Inf');
212 
213  for index_cluster1 in range(0, len(self.__clusters)):
214  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
215 
216  # Find farthest objects
217  candidate_average_distance = 0.0;
218  for index_object1 in self.__clusters[index_cluster1]:
219  for index_object2 in self.__clusters[index_cluster2]:
220  candidate_average_distance += euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
221 
222  candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
223 
224  if (candidate_average_distance < minimum_average_distance):
225  minimum_average_distance = candidate_average_distance;
226  indexes = [index_cluster1, index_cluster2];
227 
228  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
229  self.__clusters.pop(indexes[1]); # remove merged cluster.
230 
231 
232  def __merge_by_centroid_link(self):
233  """!
234  @brief Merges the most similar clusters in line with centroid link type.
235 
236  """
237 
238  minimum_centroid_distance = float('Inf');
239  indexes = None;
240 
241  for index1 in range(0, len(self.__centers)):
242  for index2 in range(index1 + 1, len(self.__centers)):
243  distance = euclidean_distance_square(self.__centers[index1], self.__centers[index2]);
244  if (distance < minimum_centroid_distance):
245  minimum_centroid_distance = distance;
246  indexes = [index1, index2];
247 
248  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
249  self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
250 
251  self.__clusters.pop(indexes[1]); # remove merged cluster.
252  self.__centers.pop(indexes[1]); # remove merged center.
253 
254 
255  def __merge_by_complete_link(self):
256  """!
257  @brief Merges the most similar clusters in line with complete link type.
258 
259  """
260 
261  minimum_complete_distance = float('Inf');
262  indexes = None;
263 
264  for index_cluster1 in range(0, len(self.__clusters)):
265  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
266  candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
267 
268  if (candidate_maximum_distance < minimum_complete_distance):
269  minimum_complete_distance = candidate_maximum_distance;
270  indexes = [index_cluster1, index_cluster2];
271 
272  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
273  self.__clusters.pop(indexes[1]); # remove merged cluster.
274 
275 
276  def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
277  """!
278  @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
279 
280  @param[in] (uint) Index of the first cluster.
281  @param[in] (uint) Index of the second cluster.
282 
283  @return The farthest euclidean distance between two clusters.
284 
285  """
286  candidate_maximum_distance = 0.0;
287  for index_object1 in self.__clusters[index_cluster1]:
288  for index_object2 in self.__clusters[index_cluster2]:
289  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
290 
291  if (distance > candidate_maximum_distance):
292  candidate_maximum_distance = distance;
293 
294  return candidate_maximum_distance;
295 
296 
297  def __merge_by_signle_link(self):
298  """!
299  @brief Merges the most similar clusters in line with single link type.
300 
301  """
302 
303  minimum_single_distance = float('Inf');
304  indexes = None;
305 
306  for index_cluster1 in range(0, len(self.__clusters)):
307  for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
308  candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
309 
310  if (candidate_minimum_distance < minimum_single_distance):
311  minimum_single_distance = candidate_minimum_distance;
312  indexes = [index_cluster1, index_cluster2];
313 
314  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
315  self.__clusters.pop(indexes[1]); # remove merged cluster.
316 
317 
318  def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
319  """!
320  @brief Finds two nearest objects in two specified clusters and returns distance between them.
321 
322  @param[in] (uint) Index of the first cluster.
323  @param[in] (uint) Index of the second cluster.
324 
325  @return The nearest euclidean distance between two clusters.
326 
327  """
328  candidate_minimum_distance = float('Inf');
329 
330  for index_object1 in self.__clusters[index_cluster1]:
331  for index_object2 in self.__clusters[index_cluster2]:
332  distance = euclidean_distance_square(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
333  if (distance < candidate_minimum_distance):
334  candidate_minimum_distance = distance;
335 
336  return candidate_minimum_distance;
337 
338 
339  def __calculate_center(self, cluster):
340  """!
341  @brief Calculates new center.
342 
343  @return (list) New value of the center of the specified cluster.
344 
345  """
346 
347  dimension = len(self.__pointer_data[cluster[0]]);
348  center = [0] * dimension;
349  for index_point in cluster:
350  for index_dimension in range(0, dimension):
351  center[index_dimension] += self.__pointer_data[index_point][index_dimension];
352 
353  for index_dimension in range(0, dimension):
354  center[index_dimension] /= len(cluster);
355 
356  return center;
def __calculate_farthest_distance(self, index_cluster1, index_cluster2)
Finds two farthest objects in two specified clusters in terms and returns distance between them...
def __merge_by_signle_link(self)
Merges the most similar clusters in line with single link type.
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def __merge_by_centroid_link(self)
Merges the most similar clusters in line with centroid link type.
Class represents agglomerative algorithm for cluster analysis.
def __init__(self, data, number_clusters, link=None, ccore=True)
Constructor of agglomerative hierarchical algorithm.
def __merge_by_average_link(self)
Merges the most similar clusters in line with average link type.
def __calculate_center(self, cluster)
Calculates new center.
def process(self)
Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
def __calculate_nearest_distance(self, index_cluster1, index_cluster2)
Finds two nearest objects in two specified clusters and returns distance between them.
def __merge_similar_clusters(self)
Merges the most similar clusters in line with link type.
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
def __merge_by_complete_link(self)
Merges the most similar clusters in line with complete link type.