rock.py
1 """!
2 
3 @brief Cluster analysis algorithm: ROCK
4 @details Implementation based on paper @cite inproceedings::rock::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 from pyclustering.cluster.encoder import type_encoding;
29 
30 from pyclustering.utils import euclidean_distance;
31 
32 from pyclustering.core.wrapper import ccore_library;
33 
34 import pyclustering.core.rock_wrapper as wrapper;
35 
36 
37 class rock:
38  """!
39  @brief Class represents clustering algorithm ROCK.
40  @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
41 
42  Example:
43  @code
44  # Read sample for clustering from some file
45  sample = read_sample(path_to_sample);
46 
47  # Create instance of ROCK algorithm for cluster analysis
48  # Five clusters should be allocated
49  rock_instance = rock(sample, 1.0, 5);
50 
51  # Run cluster analysis
52  rock_instance.process();
53 
54  # Obtain results of clustering
55  clusters = rock_instance.get_clusters();
56  @endcode
57 
58  """
59 
60  def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = True):
61  """!
62  @brief Constructor of clustering algorithm ROCK.
63 
64  @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
65  @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
66  @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
67  @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
68  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
69 
70  """
71 
72  self.__pointer_data = data;
73  self.__eps = eps;
74  self.__number_clusters = number_clusters;
75  self.__threshold = threshold;
76 
77  self.__clusters = None;
78 
79  self.__ccore = ccore;
80  if (self.__ccore):
81  self.__ccore = ccore_library.workable();
82 
83  self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
84 
85  self.__adjacency_matrix = None;
87 
88 
89  def process(self):
90  """!
91  @brief Performs cluster analysis in line with rules of ROCK algorithm.
92 
93  @remark Results of clustering can be obtained using corresponding get methods.
94 
95  @see get_clusters()
96 
97  """
98 
99  # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two
100  # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
101 
102  if (self.__ccore is True):
103  self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
104 
105  else:
106  self.__clusters = [[index] for index in range(len(self.__pointer_data))];
107 
108  while (len(self.__clusters) > self.__number_clusters):
109  indexes = self.__find_pair_clusters(self.__clusters);
110 
111  if (indexes != [-1, -1]):
112  self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
113  self.__clusters.pop(indexes[1]); # remove merged cluster.
114  else:
115  break; # totally separated clusters have been allocated
116 
117 
118  def get_clusters(self):
119  """!
120  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
121 
122  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
123 
124  @see process()
125 
126  """
127 
128  return self.__clusters;
129 
130 
132  """!
133  @brief Returns clustering result representation type that indicate how clusters are encoded.
134 
135  @return (type_encoding) Clustering result representation.
136 
137  @see get_clusters()
138 
139  """
140 
141  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
142 
143 
144  def __find_pair_clusters(self, clusters):
145  """!
146  @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
147  The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
148 
149  @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
150 
151  @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
152  It can be equals to [-1, -1] when no links between clusters.
153 
154  """
155 
156  maximum_goodness = 0.0;
157  cluster_indexes = [-1, -1];
158 
159  for i in range(0, len(clusters)):
160  for j in range(i + 1, len(clusters)):
161  goodness = self.__calculate_goodness(clusters[i], clusters[j]);
162  if (goodness > maximum_goodness):
163  maximum_goodness = goodness;
164  cluster_indexes = [i, j];
165 
166  return cluster_indexes;
167 
168 
169  def __calculate_links(self, cluster1, cluster2):
170  """!
171  @brief Returns number of link between two clusters.
172  @details Link between objects (points) exists only if distance between them less than connectivity radius.
173 
174  @param[in] cluster1 (list): The first cluster.
175  @param[in] cluster2 (list): The second cluster.
176 
177  @return (uint) Number of links between two clusters.
178 
179  """
180 
181  number_links = 0;
182 
183  for index1 in cluster1:
184  for index2 in cluster2:
185  number_links += self.__adjacency_matrix[index1][index2];
186 
187  return number_links;
188 
189 
190  def __create_adjacency_matrix(self):
191  """!
192  @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
193 
194  """
195 
196  size_data = len(self.__pointer_data);
197 
198  self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
199  for i in range(0, size_data):
200  for j in range(i + 1, size_data):
201  distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
202  if (distance <= self.__eps):
203  self.__adjacency_matrix[i][j] = 1;
204  self.__adjacency_matrix[j][i] = 1;
205 
206 
207 
208  def __calculate_goodness(self, cluster1, cluster2):
209  """!
210  @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
211 
212  @param[in] cluster1 (list): The first cluster.
213  @param[in] cluster2 (list): The second cluster.
214 
215  @return Goodness measure between two clusters.
216 
217  """
218 
219  number_links = self.__calculate_links(cluster1, cluster2);
220  devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
221 
222  return (number_links / devider);
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: rock.py:131
Class represents clustering algorithm ROCK.
Definition: rock.py:37
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
Module for representing clustering results.
Definition: encoder.py:1
def process(self)
Performs cluster analysis in line with rules of ROCK algorithm.
Definition: rock.py:89
def __calculate_goodness(self, cluster1, cluster2)
Calculates coefficient &#39;goodness measurement&#39; between two clusters.
Definition: rock.py:208
def __find_pair_clusters(self, clusters)
Returns pair of clusters that are best candidates for merging in line with goodness measure...
Definition: rock.py:144
def __calculate_links(self, cluster1, cluster2)
Returns number of link between two clusters.
Definition: rock.py:169
def __create_adjacency_matrix(self)
Creates 2D adjacency matrix (list of lists) where each element described existence of link between po...
Definition: rock.py:190
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data...
Definition: rock.py:118
def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True)
Constructor of clustering algorithm ROCK.
Definition: rock.py:60