pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
rock.py
1 """!
2 
3 @brief Cluster analysis algorithm: ROCK
4 @details Implementation based on paper @cite inproceedings::rock::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 from pyclustering.cluster.encoder import type_encoding
14 
15 from pyclustering.utils import euclidean_distance
16 
17 from pyclustering.core.wrapper import ccore_library
18 
19 import pyclustering.core.rock_wrapper as wrapper
20 
21 
22 class rock:
23  """!
24  @brief The class represents clustering algorithm ROCK.
25 
26  Example:
27  @code
28  from pyclustering.cluster import cluster_visualizer
29  from pyclustering.cluster.rock import rock
30  from pyclustering.samples.definitions import FCPS_SAMPLES
31  from pyclustering.utils import read_sample
32 
33  # Read sample for clustering from file.
34  sample = read_sample(FCPS_SAMPLES.SAMPLE_HEPTA)
35 
36  # Create instance of ROCK algorithm for cluster analysis. Seven clusters should be allocated.
37  rock_instance = rock(sample, 1.0, 7)
38 
39  # Run cluster analysis.
40  rock_instance.process()
41 
42  # Obtain results of clustering.
43  clusters = rock_instance.get_clusters()
44 
45  # Visualize clustering results.
46  visualizer = cluster_visualizer()
47  visualizer.append_clusters(clusters, sample)
48  visualizer.show()
49  @endcode
50 
51  """
52 
53  def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True):
54  """!
55  @brief Constructor of clustering algorithm ROCK.
56 
57  @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
58  @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
59  @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
60  @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
61  @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
62 
63  """
64 
65  self.__pointer_data = data
66  self.__eps = eps
67  self.__number_clusters = number_clusters
68  self.__threshold = threshold
69 
70  self.__clusters = None
71 
72  self.__ccore = ccore
73  if self.__ccore:
74  self.__ccore = ccore_library.workable()
75 
76  self.__verify_arguments()
77 
78  self.__degree_normalization = 1.0 + 2.0 * ((1.0 - threshold) / (1.0 + threshold))
79 
80  self.__adjacency_matrix = None
82 
83 
84  def process(self):
85  """!
86  @brief Performs cluster analysis in line with rules of ROCK algorithm.
87 
88  @return (rock) Returns itself (ROCK instance).
89 
90  @see get_clusters()
91 
92  """
93 
94  # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two
95  # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
96 
97  if self.__ccore is True:
98  self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold)
99 
100  else:
101  self.__clusters = [[index] for index in range(len(self.__pointer_data))]
102 
103  while len(self.__clusters) > self.__number_clusters:
104  indexes = self.__find_pair_clusters(self.__clusters)
105 
106  if indexes != [-1, -1]:
107  self.__clusters[indexes[0]] += self.__clusters[indexes[1]]
108  self.__clusters.pop(indexes[1]) # remove merged cluster.
109  else:
110  break # totally separated clusters have been allocated
111  return self
112 
113 
114  def get_clusters(self):
115  """!
116  @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
117 
118  @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
119 
120  @see process()
121 
122  """
123 
124  return self.__clusters
125 
126 
128  """!
129  @brief Returns clustering result representation type that indicate how clusters are encoded.
130 
131  @return (type_encoding) Clustering result representation.
132 
133  @see get_clusters()
134 
135  """
136 
137  return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
138 
139 
140  def __find_pair_clusters(self, clusters):
141  """!
142  @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
143  The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
144 
145  @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
146 
147  @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
148  It can be equals to [-1, -1] when no links between clusters.
149 
150  """
151 
152  maximum_goodness = 0.0
153  cluster_indexes = [-1, -1]
154 
155  for i in range(0, len(clusters)):
156  for j in range(i + 1, len(clusters)):
157  goodness = self.__calculate_goodness(clusters[i], clusters[j])
158  if goodness > maximum_goodness:
159  maximum_goodness = goodness
160  cluster_indexes = [i, j]
161 
162  return cluster_indexes
163 
164 
165  def __calculate_links(self, cluster1, cluster2):
166  """!
167  @brief Returns number of link between two clusters.
168  @details Link between objects (points) exists only if distance between them less than connectivity radius.
169 
170  @param[in] cluster1 (list): The first cluster.
171  @param[in] cluster2 (list): The second cluster.
172 
173  @return (uint) Number of links between two clusters.
174 
175  """
176 
177  number_links = 0
178 
179  for index1 in cluster1:
180  for index2 in cluster2:
181  number_links += self.__adjacency_matrix[index1][index2]
182 
183  return number_links
184 
185 
186  def __create_adjacency_matrix(self):
187  """!
188  @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
189 
190  """
191 
192  size_data = len(self.__pointer_data)
193 
194  self.__adjacency_matrix = [[0 for i in range(size_data)] for j in range(size_data)]
195  for i in range(0, size_data):
196  for j in range(i + 1, size_data):
197  distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j])
198  if (distance <= self.__eps):
199  self.__adjacency_matrix[i][j] = 1
200  self.__adjacency_matrix[j][i] = 1
201 
202 
203 
204  def __calculate_goodness(self, cluster1, cluster2):
205  """!
206  @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
207 
208  @param[in] cluster1 (list): The first cluster.
209  @param[in] cluster2 (list): The second cluster.
210 
211  @return Goodness measure between two clusters.
212 
213  """
214 
215  number_links = self.__calculate_links(cluster1, cluster2)
216  devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization
217 
218  return number_links / devider
219 
220 
221  def __verify_arguments(self):
222  """!
223  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
224 
225  """
226  if len(self.__pointer_data) == 0:
227  raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))
228 
229  if self.__eps < 0:
230  raise ValueError("Connectivity radius (current value: '%d') should be greater or equal to 0." % self.__eps)
231 
232  if self.__threshold < 0 or self.__threshold > 1:
233  raise ValueError("Threshold (current value: '%d') should be in range (0, 1)." % self.__threshold)
234 
235  if (self.__number_clusters is not None) and (self.__number_clusters <= 0):
236  raise ValueError("Amount of clusters (current value: '%d') should be greater than 0." %
237  self.__number_clusters)
pyclustering.cluster.rock.rock.__calculate_goodness
def __calculate_goodness(self, cluster1, cluster2)
Calculates coefficient 'goodness measurement' between two clusters.
Definition: rock.py:204
pyclustering.cluster.rock.rock.__number_clusters
__number_clusters
Definition: rock.py:67
pyclustering.cluster.rock.rock.__ccore
__ccore
Definition: rock.py:72
pyclustering.cluster.rock.rock.__create_adjacency_matrix
def __create_adjacency_matrix(self)
Creates 2D adjacency matrix (list of lists) where each element described existence of link between po...
Definition: rock.py:186
pyclustering.cluster.rock.rock.__adjacency_matrix
__adjacency_matrix
Definition: rock.py:80
pyclustering.cluster.rock.rock.__eps
__eps
Definition: rock.py:66
pyclustering.cluster.rock.rock
The class represents clustering algorithm ROCK.
Definition: rock.py:22
pyclustering.cluster.rock.rock.__threshold
__threshold
Definition: rock.py:68
pyclustering.cluster.rock.rock.__init__
def __init__(self, data, eps, number_clusters, threshold=0.5, ccore=True)
Constructor of clustering algorithm ROCK.
Definition: rock.py:53
pyclustering.cluster.rock.rock.get_cluster_encoding
def get_cluster_encoding(self)
Returns clustering result representation type that indicate how clusters are encoded.
Definition: rock.py:127
pyclustering.cluster.rock.rock.__calculate_links
def __calculate_links(self, cluster1, cluster2)
Returns number of link between two clusters.
Definition: rock.py:165
pyclustering.cluster.rock.rock.__clusters
__clusters
Definition: rock.py:70
pyclustering.cluster.rock.rock.__find_pair_clusters
def __find_pair_clusters(self, clusters)
Returns pair of clusters that are best candidates for merging in line with goodness measure.
Definition: rock.py:140
pyclustering.cluster.rock.rock.__pointer_data
__pointer_data
Definition: rock.py:65
pyclustering.cluster.rock.rock.get_clusters
def get_clusters(self)
Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
Definition: rock.py:114
pyclustering.cluster.rock.rock.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: rock.py:221
pyclustering.cluster.rock.rock.process
def process(self)
Performs cluster analysis in line with rules of ROCK algorithm.
Definition: rock.py:84
pyclustering.utils
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
pyclustering.cluster.rock.rock.__degree_normalization
__degree_normalization
Definition: rock.py:78
pyclustering.cluster.encoder
Module for representing clustering results.
Definition: encoder.py:1