rock.py
1 """!
2 
3 @brief Cluster analysis algorithm: ROCK
4 @details Implementation based on paper @cite inproceedings::rock::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.cluster.encoder import type_encoding
29 
30 from pyclustering.utils import euclidean_distance
31 
32 from pyclustering.core.wrapper import ccore_library
33 
34 import pyclustering.core.rock_wrapper as wrapper
35 
36 
37 class rock:
38  """!
39  @brief Class represents clustering algorithm ROCK.
40 
41  Example:
42  @code
43  from pyclustering.cluster import cluster_visualizer
44  from pyclustering.cluster.rock import rock
45  from pyclustering.samples.definitions import FCPS_SAMPLES
46  from pyclustering.utils import read_sample
47 
48  # Read sample for clustering from file.
49  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
50 
51  # Create instance of ROCK algorithm for cluster analysis. Seven clusters should be allocated.
52  rock_instance = rock(sample, 1.0, 7)
53 
54  # Run cluster analysis.
55  rock_instance.process()
56 
57  # Obtain results of clustering.
58  clusters = rock_instance.get_clusters()
59 
60  # Visualize clustering results.
61  visualizer = cluster_visualizer()
62  visualizer.append_clusters(clusters, sample)
63  visualizer.show()
64  @endcode
65 
66  """
67 
68  def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True):
69  """!
70  @brief Constructor of clustering algorithm ROCK.
71 
72  @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
73  @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
74  @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
75  @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
76  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
77 
78  """
79 
80  self.__pointer_data = data
81  self.__eps = eps
82  self.__number_clusters = number_clusters
83  self.__threshold = threshold
84 
85  self.__clusters = None
86 
87  self.__ccore = ccore
88  if self.__ccore:
89  self.__ccore = ccore_library.workable()
90 
91  self.__verify_arguments()
92 
93  self.__degree_normalization = 1.0 + 2.0 * ((1.0 - threshold) / (1.0 + threshold))
94 
95  self.__adjacency_matrix = None
97 
98 
99  def process(self):
100  """!
101  @brief Performs cluster analysis in line with rules of ROCK algorithm.
102 
103  @return (rock) Returns itself (ROCK instance).
104 
105  @see get_clusters()
106 
107  """
108 
109  # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two
110  # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
111 
112  if self.__ccore is True:
113  self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold)
114 
115  else:
116  self.__clusters = [[index] for index in range(len(self.__pointer_data))]
117 
118  while len(self.__clusters) > self.__number_clusters:
119  indexes = self.__find_pair_clusters(self.__clusters)
120 
121  if indexes != [-1, -1]:
122  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
123  self.__clusters.pop(indexes[1]) # remove merged cluster.
124  else:
125  break # totally separated clusters have been allocated
126  return self
127 
128 
129  def get_clusters(self):
130  """!
131  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
132 
133  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
134 
135  @see process()
136 
137  """
138 
139  return self.__clusters
140 
141 
143  """!
144  @brief Returns clustering result representation type that indicate how clusters are encoded.
145 
146  @return (type_encoding) Clustering result representation.
147 
148  @see get_clusters()
149 
150  """
151 
152  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
153 
154 
155  def __find_pair_clusters(self, clusters):
156  """!
157  @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
158  The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
159 
160  @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
161 
162  @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
163  It can be equals to [-1, -1] when no links between clusters.
164 
165  """
166 
167  maximum_goodness = 0.0
168  cluster_indexes = [-1, -1]
169 
170  for i in range(0, len(clusters)):
171  for j in range(i + 1, len(clusters)):
172  goodness = self.__calculate_goodness(clusters[i], clusters[j])
173  if goodness > maximum_goodness:
174  maximum_goodness = goodness
175  cluster_indexes = [i, j]
176 
177  return cluster_indexes
178 
179 
180  def __calculate_links(self, cluster1, cluster2):
181  """!
182  @brief Returns number of link between two clusters.
183  @details Link between objects (points) exists only if distance between them less than connectivity radius.
184 
185  @param[in] cluster1 (list): The first cluster.
186  @param[in] cluster2 (list): The second cluster.
187 
188  @return (uint) Number of links between two clusters.
189 
190  """
191 
192  number_links = 0
193 
194  for index1 in cluster1:
195  for index2 in cluster2:
196  number_links += self.__adjacency_matrix[index1][index2]
197 
198  return number_links
199 
200 
201  def __create_adjacency_matrix(self):
202  """!
203  @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
204 
205  """
206 
207  size_data = len(self.__pointer_data)
208 
209  self.__adjacency_matrix = [[0 for i in range(size_data)] for j in range(size_data)]
210  for i in range(0, size_data):
211  for j in range(i + 1, size_data):
212  distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j])
213  if (distance <= self.__eps):
214  self.__adjacency_matrix[i][j] = 1
215  self.__adjacency_matrix[j][i] = 1
216 
217 
218 
219  def __calculate_goodness(self, cluster1, cluster2):
220  """!
221  @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
222 
223  @param[in] cluster1 (list): The first cluster.
224  @param[in] cluster2 (list): The second cluster.
225 
226  @return Goodness measure between two clusters.
227 
228  """
229 
230  number_links = self.__calculate_links(cluster1, cluster2)
231  devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization
232 
233  return number_links / devider
234 
235 
236  def __verify_arguments(self):
237  """!
238  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
239 
240  """
241  if len(self.__pointer_data) == 0:
242  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
243 
244  if self.__eps < 0:
245  raise ValueError("Connectivity radius (current value: '%d') should be greater or equal to 0." % self.__eps)
246 
247  if self.__threshold < 0 or self.__threshold > 1:
248  raise ValueError("Threshold (current value: '%d') should be in range (0, 1)." % self.__threshold)
249 
250  if (self.__number_clusters is not None) and (self.__number_clusters <= 0):
251  raise ValueError("Amount of clusters (current value: '%d') should be greater than 0." %
252  self.__number_clusters)
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: rock.py:142
Class represents clustering algorithm ROCK.
Definition: rock.py:37
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def process(self)
Performs cluster analysis in line with rules of ROCK algorithm.
Definition: rock.py:99
def __calculate_goodness(self, cluster1, cluster2)
Calculates coefficient &#39;goodness measurement&#39; between two clusters.
Definition: rock.py:219
def __find_pair_clusters(self, clusters)
Returns pair of clusters that are best candidates for merging in line with goodness measure...
Definition: rock.py:155
def __calculate_links(self, cluster1, cluster2)
Returns number of link between two clusters.
Definition: rock.py:180
def __create_adjacency_matrix(self)
Creates 2D adjacency matrix (list of lists) where each element described existence of link between po...
Definition: rock.py:201
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: rock.py:129
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: rock.py:236
def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True)
Constructor of clustering algorithm ROCK.
Definition: rock.py:68