rock.py
1 """!
2 
3 @brief Cluster analysis algorithm: ROCK
4 @details Implementation based on paper @cite inproceedings::rock::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.cluster.encoder import type_encoding;
29 
30 from pyclustering.utils import euclidean_distance;
31 
32 from pyclustering.core.wrapper import ccore_library;
33 
34 import pyclustering.core.rock_wrapper as wrapper;
35 
36 
37 class rock:
38  """!
39  @brief Class represents clustering algorithm ROCK.
40 
41  Example:
42  @code
43  from pyclustering.cluster import cluster_visualizer
44  from pyclustering.cluster.rock import rock
45  from pyclustering.samples.definitions import FCPS_SAMPLES
46  from pyclustering.utils import read_sample
47 
48  # Read sample for clustering from file.
49  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
50 
51  # Create instance of ROCK algorithm for cluster analysis. Seven clusters should be allocated.
52  rock_instance = rock(sample, 1.0, 7)
53 
54  # Run cluster analysis.
55  rock_instance.process()
56 
57  # Obtain results of clustering.
58  clusters = rock_instance.get_clusters()
59 
60  # Visualize clustering results.
61  visualizer = cluster_visualizer()
62  visualizer.append_clusters(clusters, sample)
63  visualizer.show()
64  @endcode
65 
66  """
67 
68  def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = True):
69  """!
70  @brief Constructor of clustering algorithm ROCK.
71 
72  @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
73  @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
74  @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
75  @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
76  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
77 
78  """
79 
80  self.__pointer_data = data;
81  self.__eps = eps;
82  self.__number_clusters = number_clusters;
83  self.__threshold = threshold;
84 
85  self.__clusters = None;
86 
87  self.__ccore = ccore;
88  if (self.__ccore):
89  self.__ccore = ccore_library.workable();
90 
91  self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
92 
93  self.__adjacency_matrix = None;
95 
96 
97  def process(self):
98  """!
99  @brief Performs cluster analysis in line with rules of ROCK algorithm.
100 
101  @remark Results of clustering can be obtained using corresponding get methods.
102 
103  @see get_clusters()
104 
105  """
106 
107  # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two
108  # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
109 
110  if (self.__ccore is True):
111  self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
112 
113  else:
114  self.__clusters = [[index] for index in range(len(self.__pointer_data))];
115 
116  while (len(self.__clusters) > self.__number_clusters):
117  indexes = self.__find_pair_clusters(self.__clusters);
118 
119  if (indexes != [-1, -1]):
120  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
121  self.__clusters.pop(indexes[1]); # remove merged cluster.
122  else:
123  break; # totally separated clusters have been allocated
124 
125 
126  def get_clusters(self):
127  """!
128  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
129 
130  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
131 
132  @see process()
133 
134  """
135 
136  return self.__clusters;
137 
138 
140  """!
141  @brief Returns clustering result representation type that indicate how clusters are encoded.
142 
143  @return (type_encoding) Clustering result representation.
144 
145  @see get_clusters()
146 
147  """
148 
149  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
150 
151 
152  def __find_pair_clusters(self, clusters):
153  """!
154  @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
155  The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
156 
157  @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
158 
159  @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
160  It can be equals to [-1, -1] when no links between clusters.
161 
162  """
163 
164  maximum_goodness = 0.0;
165  cluster_indexes = [-1, -1];
166 
167  for i in range(0, len(clusters)):
168  for j in range(i + 1, len(clusters)):
169  goodness = self.__calculate_goodness(clusters[i], clusters[j]);
170  if (goodness > maximum_goodness):
171  maximum_goodness = goodness;
172  cluster_indexes = [i, j];
173 
174  return cluster_indexes;
175 
176 
177  def __calculate_links(self, cluster1, cluster2):
178  """!
179  @brief Returns number of link between two clusters.
180  @details Link between objects (points) exists only if distance between them less than connectivity radius.
181 
182  @param[in] cluster1 (list): The first cluster.
183  @param[in] cluster2 (list): The second cluster.
184 
185  @return (uint) Number of links between two clusters.
186 
187  """
188 
189  number_links = 0;
190 
191  for index1 in cluster1:
192  for index2 in cluster2:
193  number_links += self.__adjacency_matrix[index1][index2];
194 
195  return number_links;
196 
197 
198  def __create_adjacency_matrix(self):
199  """!
200  @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
201 
202  """
203 
204  size_data = len(self.__pointer_data);
205 
206  self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
207  for i in range(0, size_data):
208  for j in range(i + 1, size_data):
209  distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
210  if (distance <= self.__eps):
211  self.__adjacency_matrix[i][j] = 1;
212  self.__adjacency_matrix[j][i] = 1;
213 
214 
215 
216  def __calculate_goodness(self, cluster1, cluster2):
217  """!
218  @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
219 
220  @param[in] cluster1 (list): The first cluster.
221  @param[in] cluster2 (list): The second cluster.
222 
223  @return Goodness measure between two clusters.
224 
225  """
226 
227  number_links = self.__calculate_links(cluster1, cluster2);
228  devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
229 
230  return (number_links / devider);
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: rock.py:139
Class represents clustering algorithm ROCK.
Definition: rock.py:37
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def process(self)
Performs cluster analysis in line with rules of ROCK algorithm.
Definition: rock.py:97
def __calculate_goodness(self, cluster1, cluster2)
Calculates coefficient &#39;goodness measurement&#39; between two clusters.
Definition: rock.py:216
def __find_pair_clusters(self, clusters)
Returns pair of clusters that are best candidates for merging in line with goodness measure...
Definition: rock.py:152
def __calculate_links(self, cluster1, cluster2)
Returns number of link between two clusters.
Definition: rock.py:177
def __create_adjacency_matrix(self)
Creates 2D adjacency matrix (list of lists) where each element described existence of link between po...
Definition: rock.py:198
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: rock.py:126
def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True)
Constructor of clustering algorithm ROCK.
Definition: rock.py:68