pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
fcm.py
1 """!
2 
3 @brief Cluster analysis algorithm: Fuzzy C-Means
4 @details Implementation based on paper @cite book::pattern_recognition_with_fuzzy.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright BSD-3-Clause
9 
10 """
11 
12 
13 import numpy
14 
15 import pyclustering.core.fcm_wrapper as wrapper
16 
17 from pyclustering.core.wrapper import ccore_library
18 
19 
20 class fcm:
21  """!
22  @brief Class represents Fuzzy C-means (FCM) clustering algorithm.
23  @details Fuzzy clustering is a form of clustering in which each data point can belong to more than one cluster.
24 
25  Fuzzy C-Means algorithm uses two general formulas for cluster analysis. The first is to updated membership of each
26  point:
27  \f[w_{ij}=\frac{1}{\sum_{k=0}^{c}\left ( \frac{\left \| x_{i}-c_{j} \right \|}{\left \| x_{i}-c_{k} \right \|} \right )^{\frac{2}{m-1}}}\f]
28 
29  The second formula is used to update centers in line with obtained centers:
30  \f[c_{k}=\frac{\sum_{i=0}^{N}w_{k}\left ( x_{i} \right )^{m}x_{i}}{\sum_{i=0}^{N}w_{k}\left ( x_{i} \right )^{m}}\f]
31 
32  Fuzzy C-Means clustering results depend on initial centers. Algorithm K-Means++ can used for center initialization
33  from module 'pyclustering.cluster.center_initializer'.
34 
35  CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
36 
37  Here is an example how to perform cluster analysis using Fuzzy C-Means algorithm:
38  @code
39  from pyclustering.samples.definitions import FAMOUS_SAMPLES
40  from pyclustering.cluster import cluster_visualizer
41  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
42  from pyclustering.cluster.fcm import fcm
43  from pyclustering.utils import read_sample
44 
45  # load list of points for cluster analysis
46  sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
47 
48  # initialize
49  initial_centers = kmeans_plusplus_initializer(sample, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
50 
51  # create instance of Fuzzy C-Means algorithm
52  fcm_instance = fcm(sample, initial_centers)
53 
54  # run cluster analysis and obtain results
55  fcm_instance.process()
56  clusters = fcm_instance.get_clusters()
57  centers = fcm_instance.get_centers()
58 
59  # visualize clustering results
60  visualizer = cluster_visualizer()
61  visualizer.append_clusters(clusters, sample)
62  visualizer.append_cluster(centers, marker='*', markersize=10)
63  visualizer.show()
64  @endcode
65 
66  The next example shows how to perform image segmentation using Fuzzy C-Means algorithm:
67  @code
68  from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
69  from pyclustering.cluster.fcm import fcm
70  from pyclustering.utils import read_image, draw_image_mask_segments
71 
72  # load list of points for cluster analysis
73  data = read_image("stpetersburg_admiral.jpg")
74 
75  # initialize
76  initial_centers = kmeans_plusplus_initializer(data, 3, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
77 
78  # create instance of Fuzzy C-Means algorithm
79  fcm_instance = fcm(data, initial_centers)
80 
81  # run cluster analysis and obtain results
82  fcm_instance.process()
83  clusters = fcm_instance.get_clusters()
84 
85  # visualize segmentation results
86  draw_image_mask_segments("stpetersburg_admiral.jpg", clusters)
87  @endcode
88 
89  @image html fcm_segmentation_stpetersburg.png "Image segmentation using Fuzzy C-Means algorithm."
90 
91  """
92 
93  def __init__(self, data, initial_centers, **kwargs):
94  """!
95  @brief Initialize Fuzzy C-Means algorithm.
96 
97  @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
98  @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
99  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'tolerance', 'itermax', 'm').
100 
101  <b>Keyword Args:</b><br>
102  - ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
103  - tolerance (float): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
104  - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
105  - m (float): Hyper-parameter that controls how fuzzy the cluster will be. The higher it is, the fuzzier the cluster will be in the end.
106  This parameter should be greater than 1 (by default: 2).
107 
108  """
109 
110  self.__data = data
111  self.__clusters = []
112  self.__centers = initial_centers
113  self.__membership = []
114 
115  self.__tolerance = kwargs.get('tolerance', 0.001)
116  self.__itermax = kwargs.get('itermax', 200)
117  self.__m = kwargs.get('m', 2)
118 
119  self.__degree = 2.0 / (self.__m - 1)
120 
121  self.__ccore = kwargs.get('ccore', True)
122  if self.__ccore is True:
123  self.__ccore = ccore_library.workable()
124 
125  self.__verify_arguments()
126 
127 
128  def process(self):
129  """!
130  @brief Performs cluster analysis in line with Fuzzy C-Means algorithm.
131 
132  @return (fcm) Returns itself (Fuzzy C-Means instance).
133 
134  @see get_clusters()
135  @see get_centers()
136  @see get_membership()
137 
138  """
139  if self.__ccore is True:
140  self.__process_by_ccore()
141  else:
142  self.__process_by_python()
143 
144  return self
145 
146 
147  def get_clusters(self):
148  """!
149  @brief Returns allocated clusters that consists of points that most likely (in line with membership) belong to
150  these clusters.
151 
152  @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
153 
154  @return (list) List of allocated clusters, each cluster contains indexes from input data.
155 
156  @see process()
157  @see get_centers()
158  @see get_membership()
159 
160  """
161  return self.__clusters
162 
163 
164  def get_centers(self):
165  """!
166  @brief Returns list of centers of allocated clusters.
167 
168  @return (array_like) Cluster centers.
169 
170  @see process()
171  @see get_clusters()
172  @see get_membership()
173 
174  """
175  return self.__centers
176 
177 
178  def get_membership(self):
179  """!
180  @brief Returns cluster membership (probability) for each point in data.
181 
182  @return (array_like) Membership for each point in format [[Px1(c1), Px1(c2), ...], [Px2(c1), Px2(c2), ...], ...],
183  where [Px1(c1), Px1(c2), ...] membership for point x1.
184 
185  @see process()
186  @see get_clusters()
187  @see get_centers()
188 
189  """
190  return self.__membership
191 
192 
193  def __process_by_ccore(self):
194  """!
195  @brief Performs cluster analysis using C/C++ implementation.
196 
197  """
198  result = wrapper.fcm_algorithm(self.__data, self.__centers, self.__m, self.__tolerance, self.__itermax)
199 
200  self.__clusters = result[wrapper.fcm_package_indexer.INDEX_CLUSTERS]
201  self.__centers = result[wrapper.fcm_package_indexer.INDEX_CENTERS]
202  self.__membership = result[wrapper.fcm_package_indexer.INDEX_MEMBERSHIP]
203 
204 
205  def __process_by_python(self):
206  """!
207  @brief Performs cluster analysis using Python implementation.
208 
209  """
210  self.__data = numpy.array(self.__data)
211  self.__centers = numpy.array(self.__centers)
212 
213  self.__membership = numpy.zeros((len(self.__data), len(self.__centers)))
214 
215  change = float('inf')
216  iteration = 0
217 
218  while change > self.__tolerance and iteration < self.__itermax:
219  self.__update_membership()
220  centers = self.__calculate_centers()
221  change = self.__calculate_changes(centers)
222 
223  self.__centers = centers
224  iteration += 1
225 
226  self.__extract_clusters()
227 
228 
229  def __calculate_centers(self):
230  """!
231  @brief Calculate center using membership of each cluster.
232 
233  @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
234 
235  @return (numpy.array) Updated centers.
236 
237  """
238  dimension = self.__data.shape[1]
239  centers = numpy.zeros((len(self.__centers), dimension))
240 
241  for i in range(len(self.__centers)):
242  # multiplication '@' requires python version 3.5
243  centers[i] = numpy.divide(self.__membership[:, i] @ self.__data, numpy.sum(self.__membership[:, i]))
244 
245  return centers
246 
247 
248  def __update_membership(self):
249  """!
250  @brief Update membership for each point in line with current cluster centers.
251 
252  """
253  data_difference = numpy.zeros((len(self.__centers), len(self.__data)))
254 
255  for i in range(len(self.__centers)):
256  data_difference[i] = numpy.sum(numpy.square(self.__data - self.__centers[i]), axis=1)
257 
258  for i in range(len(self.__data)):
259  for j in range(len(self.__centers)):
260  divider = sum([pow(data_difference[j][i] / data_difference[k][i], self.__degree) for k in range(len(self.__centers)) if data_difference[k][i] != 0.0])
261 
262  if divider != 0.0:
263  self.__membership[i][j] = 1.0 / divider
264  else:
265  self.__membership[i][j] = 1.0
266 
267 
268  def __calculate_changes(self, updated_centers):
269  """!
270  @brief Calculate changes between centers.
271 
272  @return (float) Maximum change between centers.
273 
274  """
275  changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1).T
276  return numpy.max(changes)
277 
278 
279  def __extract_clusters(self):
280  self.__clusters = [[] for i in range(len(self.__centers))]
281  belongs = numpy.argmax(self.__membership, axis=1)
282 
283  for i in range(len(belongs)):
284  self.__clusters[belongs[i]].append(i)
285 
286 
287  def __verify_arguments(self):
288  """!
289  @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
290 
291  """
292  if len(self.__data) == 0:
293  raise ValueError("Input data is empty (size: '%d')." % len(self.__data))
294 
295  if len(self.__centers) == 0:
296  raise ValueError("Initial centers are empty (size: '%d')." % len(self.__centers))
pyclustering.cluster.fcm.fcm.__tolerance
__tolerance
Definition: fcm.py:115
pyclustering.cluster.fcm.fcm.__verify_arguments
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: fcm.py:287
pyclustering.cluster.fcm.fcm.__clusters
__clusters
Definition: fcm.py:111
pyclustering.cluster.fcm.fcm.__process_by_python
def __process_by_python(self)
Performs cluster analysis using Python implementation.
Definition: fcm.py:205
pyclustering.cluster.fcm.fcm.__update_membership
def __update_membership(self)
Update membership for each point in line with current cluster centers.
Definition: fcm.py:248
pyclustering.cluster.fcm.fcm.__membership
__membership
Definition: fcm.py:113
pyclustering.cluster.fcm.fcm.__calculate_changes
def __calculate_changes(self, updated_centers)
Calculate changes between centers.
Definition: fcm.py:268
pyclustering.cluster.fcm.fcm.__ccore
__ccore
Definition: fcm.py:121
pyclustering.cluster.fcm.fcm.get_centers
def get_centers(self)
Returns list of centers of allocated clusters.
Definition: fcm.py:164
pyclustering.cluster.fcm.fcm.__degree
__degree
Definition: fcm.py:119
pyclustering.cluster.fcm.fcm.get_membership
def get_membership(self)
Returns cluster membership (probability) for each point in data.
Definition: fcm.py:178
pyclustering.cluster.fcm.fcm.get_clusters
def get_clusters(self)
Returns allocated clusters that consists of points that most likely (in line with membership) belong ...
Definition: fcm.py:147
pyclustering.cluster.fcm.fcm.__data
__data
Definition: fcm.py:110
pyclustering.cluster.fcm.fcm.__init__
def __init__(self, data, initial_centers, **kwargs)
Initialize Fuzzy C-Means algorithm.
Definition: fcm.py:93
pyclustering.cluster.fcm.fcm.__extract_clusters
def __extract_clusters(self)
Definition: fcm.py:279
pyclustering.cluster.fcm.fcm.__calculate_centers
def __calculate_centers(self)
Calculate center using membership of each cluster.
Definition: fcm.py:229
pyclustering.cluster.fcm.fcm
Class represents Fuzzy C-means (FCM) clustering algorithm.
Definition: fcm.py:20
pyclustering.cluster.fcm.fcm.__m
__m
Definition: fcm.py:117
pyclustering.cluster.fcm.fcm.__itermax
__itermax
Definition: fcm.py:116
pyclustering.cluster.fcm.fcm.process
def process(self)
Performs cluster analysis in line with Fuzzy C-Means algorithm.
Definition: fcm.py:128
pyclustering.cluster.fcm.fcm.__centers
__centers
Definition: fcm.py:112
pyclustering.cluster.fcm.fcm.__process_by_ccore
def __process_by_ccore(self)
Performs cluster analysis using C/C++ implementation.
Definition: fcm.py:193