ema.py
1 """!
2 
3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4 @details Implementation based on paper @cite article::ema::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2020
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import random
30 import warnings
31 
32 from pyclustering.cluster import cluster_visualizer
33 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
34 from pyclustering.cluster.kmeans import kmeans
35 
36 from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
37 
38 from enum import IntEnum
39 
40 try:
41  import matplotlib.pyplot as plt
42  import matplotlib.animation as animation
43  from matplotlib import patches
44 except Exception as error_instance:
45  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
46  "functionality is not available (details: '%s')." % str(error_instance))
47 
def gaussian(data, mean, covariance):
    """!
    @brief Evaluates gaussian function for every point of a dataset using specified mean (mathematical expectation)
            and variance (covariance matrix in case of multi-dimensional data).

    @param[in] data (list): Points for which gaussian values are calculated (dimension is taken from the first point).
    @param[in] mean (float|numpy.array): Mathematical expectation of the gaussian.
    @param[in] covariance (float|numpy.array): Variance or covariance matrix of the gaussian.

    @return (list) Gaussian value for each point of the dataset, in the same order as the points.

    """
    dimension = float(len(data[0]))

    # One-dimensional data uses a plain reciprocal of the variance, otherwise the pseudo-inverse of the matrix.
    if dimension == 1.0:
        inv_variance = 1.0 / covariance
    else:
        inv_variance = numpy.linalg.pinv(covariance)

    # NOTE(review): the normalization term is built from the norm of the covariance, not from its determinant -
    # confirm that this is intended before relying on the values as a probability density.
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))
    right_const = float('inf') if divider == 0.0 else 1.0 / divider

    def point_value(point):
        delta = point - mean
        exponent = -0.5 * delta.dot(inv_variance).dot(numpy.transpose(delta))
        return right_const * numpy.exp(exponent)

    return [point_value(point) for point in data]
81 
82 
83 
class ema_init_type(IntEnum):
    """!
    @brief Enumeration of strategies that can be used to prepare initial means and covariances for the
            Expectation-Maximization algorithm.

    """

    ## Random initialization strategy (numeric value 0).
    RANDOM_INITIALIZATION = 0

    ## K-Means based initialization strategy (numeric value 1).
    KMEANS_INITIALIZATION = 1
99 
100 
101 
class ema_initializer:
    """!
    @brief Provides services for preparing initial means and covariances for Expectation-Maximization algorithm.
    @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and
              kmeans with kmeans++ initialization. Here is an example of initialization using kmeans strategy:

    @code
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FAMOUS_SAMPLES
        from pyclustering.cluster.ema import ema_initializer

        sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
        amount_clusters = 2

        initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize()
        print(initial_means)
        print(initial_covariance)
    @endcode

    """

    # Maximum number of re-draws performed to obtain a mean that has not been chosen yet.
    __MAX_GENERATION_ATTEMPTS = 10

    def __init__(self, sample, amount):
        """!
        @brief Constructs EM initializer.

        @param[in] sample (list): Data that will be used by the EM algorithm.
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.

        """
        self.__sample = sample
        self.__amount = amount


    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
        """!
        @brief Calculates initial parameters for EM algorithm: means and covariances using
                specified strategy.

        @param[in] init_type (ema_init_type): Strategy for initialization.

        @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data).

        @exception NameError If unknown initialization strategy is specified.

        """
        if init_type == ema_init_type.KMEANS_INITIALIZATION:
            return self.__initialize_kmeans()

        elif init_type == ema_init_type.RANDOM_INITIALIZATION:
            return self.__initialize_random()

        raise NameError("Unknown type of EM algorithm initialization is specified.")


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Assigns each point of the sample to the center with the smallest squared Euclidean distance.

        @param[in] centers (list): Centers that are used to form clusters.

        @return (list) Clusters as list of clusters. Each cluster contains indexes of objects from data.

        """
        clusters = [[] for _ in range(len(centers))]
        for index_point, point in enumerate(self.__sample):
            index_optim, dist_optim = -1, 0.0

            for index, center in enumerate(centers):
                dist = euclidean_distance_square(point, center)

                # The first center is always accepted, the following ones only when they are strictly closer.
                if (index == 0) or (dist < dist_optim):
                    index_optim, dist_optim = index, dist

            clusters[index_optim].append(index_point)

        return clusters


    def __calculate_initial_covariances(self, initial_clusters):
        """!
        @brief Calculates covariance for each initial cluster.
        @details Cluster with less than two points gets a matrix where every element is equal to the same
                  random value from range [0, 0.1), because covariance cannot be estimated for it.

        @param[in] initial_clusters (list): Clusters represented by indexes of points from the sample.

        @return (list) Covariance for each cluster.

        """
        covariances = []
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar=False))
            else:
                dimension = len(self.__sample[0])
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return covariances


    def __pick_random_point(self):
        """!
        @brief Returns uniformly chosen point from the sample.

        """
        # 'random.choice' gives each point the same probability; 'randint(0, len) - 1' produced index '-1' as
        # well and therefore the last point was chosen twice as often as any other point.
        return random.choice(self.__sample)


    def __initialize_random(self):
        """!
        @brief Chooses random points of the sample as initial means and calculates covariances of the
                clusters that are formed around them.

        @return (list, list) Initial means and covariances.

        """
        initial_means = []

        for _ in range(self.__amount):
            mean = self.__pick_random_point()
            attempts = 0
            while (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
                mean = self.__pick_random_point()
                attempts += 1

            # Unique point has not been found - use slightly shifted copy of the last candidate.
            if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
                mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ]

            initial_means.append(mean)

        initial_clusters = self.__calculate_initial_clusters(initial_means)
        initial_covariance = self.__calculate_initial_covariances(initial_clusters)

        return initial_means, initial_covariance


    def __initialize_kmeans(self):
        """!
        @brief Runs K-Means (centers are prepared by K-Means++) and uses its centers as initial means and
                covariances of its clusters as initial covariances.

        @return (list, list) Initial means and covariances.

        """
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True)
        kmeans_instance.process()

        means = kmeans_instance.get_centers()
        covariances = self.__calculate_initial_covariances(kmeans_instance.get_clusters())

        return means, covariances
232 
233 
234 
class ema_observer:
    """!
    @brief Observer of EM algorithm for collecting algorithm state on each step.
    @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters,
              means and covariances are stored in observer on each step. Here is an example of usage:

    @code
        from pyclustering.cluster.ema import ema, ema_observer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read data from text file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Create EM observer.
        observer = ema_observer()

        # Create EM algorithm to allocate four clusters and pass observer to it.
        ema_instance = ema(sample, 4, observer=observer)

        # Run clustering process.
        ema_instance.process()

        # Print amount of steps that were done by the algorithm.
        print("EMA steps:", observer.get_iterations())

        # Print evolution of means and covariances.
        print("Means evolution:", observer.get_evolution_means())
        print("Covariances evolution:", observer.get_evolution_covariances())

        # Print evolution of clusters.
        print("Clusters evolution:", observer.get_evolution_clusters())

        # Print final clusters.
        print("Allocated clusters:", observer.get_evolution_clusters()[-1])
    @endcode

    """
    def __init__(self):
        """!
        @brief Initializes EM observer.

        """
        self.__means_evolution = []
        self.__covariances_evolution = []
        self.__clusters_evolution = []


    def __len__(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_iterations(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_evolution_means(self):
        """!
        @return (list) Mean of each cluster on each step of clustering.

        """
        return self.__means_evolution


    def get_evolution_covariances(self):
        """!
        @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.

        """
        return self.__covariances_evolution


    def get_evolution_clusters(self):
        """!
        @return (list) Allocated clusters on each step of clustering.

        """
        return self.__clusters_evolution


    def notify(self, means, covariances, clusters):
        """!
        @brief This method is used by the algorithm to notify observer about changes where the algorithm
                should provide new values: means, covariances and allocated clusters.

        @param[in] means (list): Mean of each cluster on current step.
        @param[in] covariances (list): Covariances of each cluster on current step.
        @param[in] clusters (list): Allocated cluster on current step.

        """
        self.__means_evolution.append(means)
        self.__covariances_evolution.append(covariances)
        self.__clusters_evolution.append(clusters)
336 
337 
338 
class ema_visualizer:
    """!
    @brief Visualizer of EM algorithm's results.
    @details Provides services for visualization of particular features of the algorithm, for example,
              in case of two-dimensional dataset it shows covariance ellipses.

    """

    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
        """!
        @brief Draws clusters and in case of two-dimensional dataset draws their ellipses.

        @param[in] clusters (list): Clusters that were allocated by the algorithm.
        @param[in] sample (list): Dataset that were used for clustering.
        @param[in] covariances (list): Covariances of the clusters.
        @param[in] means (list): Means of the clusters.
        @param[in] figure (figure): If 'None' then new figure is created, otherwise specified figure is used
                    for visualization.
        @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
                    shown manually using matplotlib function 'plt.show()'.

        @return (figure) Figure where clusters were drawn.

        """

        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)

        if figure is None:
            figure = visualizer.show(display = False)
        else:
            visualizer.show(figure = figure, display = False)

        # Covariance ellipses are drawn for two-dimensional data only.
        if len(sample[0]) == 2:
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)

        if display is True:
            plt.show()

        return figure


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
        """!
        @brief Animates clustering process that is performed by EM algorithm.

        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
        @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.

        """

        figure = plt.figure()

        def init_frame():
            return frame_generation(0)

        def frame_generation(index_iteration):
            # Each frame is drawn from scratch using the state that was stored by the observer for the iteration.
            figure.clf()

            figure.suptitle("EM algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold')

            clusters = observer.get_evolution_clusters()[index_iteration]
            covariances = observer.get_evolution_covariances()[index_iteration]
            means = observer.get_evolution_means()[index_iteration]

            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False)
            figure.subplots_adjust(top = 0.85)

            return [ figure.gca() ]

        iterations = len(observer)
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)

        if save_movie is not None:
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500)
        else:
            plt.show()


    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        """!
        @brief Draws an ellipse for each cluster on the first axes of the figure using cluster's covariance and mean.

        """
        ax = figure.get_axes()[0]

        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index])
            color = visualizer.get_cluster_color(index, 0)

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        """!
        @brief Draws center marker and semi-transparent ellipse; nothing is drawn if width or height is not positive.

        """
        if (width > 0.0) and (height > 0.0):
            ax.plot(x, y, color=color, marker='x', markersize=6)
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=True, zorder=2, color=color)
            ax.add_patch(ellipse)
440 
441 
442 
class ema:
    """!
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
    @details The algorithm provides only clustering services (unsupervised learning).
              Here is an example of data clustering process:
    @code
        from pyclustering.cluster.ema import ema, ema_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read data from text file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create EM algorithm to allocate three clusters.
        ema_instance = ema(sample, 3)

        # Run clustering process.
        ema_instance.process()

        # Get clustering results.
        clusters = ema_instance.get_clusters()
        covariances = ema_instance.get_covariances()
        means = ema_instance.get_centers()

        # Visualize obtained clustering results.
        ema_visualizer.show_clusters(clusters, sample, covariances, means)
    @endcode

    Here is clustering results of the Expectation-Maximization clustering algorithm where popular sample 'OldFaithful' was used.
    Initial random means and covariances were used in the example. The first step is presented on the left side of the figure and
    final result (the last step) is on the right side:
    @image html ema_old_faithful_clustering.png

    @see ema_visualizer
    @see ema_observer

    """
    def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100):
        """!
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.

        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates.
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
        @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
                    If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
                    covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
                    K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] observer (ema_observer): Observer for gathering information about clustering process.
        @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and
                    previous log-likelihood estimation is less than 'tolerance' then clustering is over).
        @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be
                    performed by the algorithm during clustering process.

        @exception ValueError If input data is empty or amount of clusters is less than 1.

        """

        self.__data = numpy.array(data)
        self.__amount_clusters = amount_clusters
        self.__tolerance = tolerance
        self.__iterations = iterations
        self.__observer = observer

        self.__means = means
        self.__variances = variances

        self.__verify_arguments()

        # Both means and variances are required, otherwise both of them are calculated by the initializer.
        if (means is None) or (variances is None):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION)

        if len(self.__means) != amount_clusters:
            self.__amount_clusters = len(self.__means)

        # Containers are sized by the actual amount of clusters (it may differ from the requested amount, see
        # the correction above), otherwise they do not match the means until the first expectation step.
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ]
        self.__pic = [1.0] * self.__amount_clusters
        self.__clusters = []
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ]
        self.__stop = False


    def process(self):
        """!
        @brief Run clustering process of the algorithm.

        @return (ema) Returns itself (EMA instance).

        """

        # Start values only guarantee that the difference exceeds the tolerance before the first iteration.
        previous_likelihood = -200000
        current_likelihood = -100000

        current_iteration = 0
        while(self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations):
            self.__expectation_step()
            self.__maximization_step()

            current_iteration += 1

            self.__extract_clusters()
            self.__notify()

            previous_likelihood = current_likelihood
            current_likelihood = self.__log_likelihood()
            self.__stop = self.__get_stop_condition()

        # Probabilities of each point are scaled to have sum that is equal to 1.0 over the allocated clusters.
        self.__normalize_probabilities()
        return self


    def get_clusters(self):
        """!
        @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset,
                        for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]].

        """
        return self.__clusters


    def get_centers(self):
        """!
        @return (list) Corresponding centers (means) of clusters.

        """

        return self.__means


    def get_covariances(self):
        """!
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.

        """

        return self.__variances


    def get_probabilities(self):
        """!
        @brief Returns 2-dimensional list with belong probability of each object from data to cluster correspondingly,
                where that first index is for cluster and the second is for point.

        @code
            # Get belong probabilities
            probabilities = ema_instance.get_probabilities();

            # Show probability of the fifth element in the first and in the second cluster
            index_point = 5;
            print("Probability in the first cluster:", probabilities[0][index_point]);
            print("Probability in the second cluster:", probabilities[1][index_point]);
        @endcode

        @return (list) 2-dimensional list with belong probability of each object from data to cluster.

        """

        return self.__rc


    def __erase_empty_clusters(self):
        """!
        @brief Removes clusters without points together with their means, variances, weights, gaussians and
                probabilities, and updates amount of clusters.

        """
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []

        for index_cluster in range(len(self.__clusters)):
            if len(self.__clusters[index_cluster]) > 0:
                clusters.append(self.__clusters[index_cluster])
                means.append(self.__means[index_cluster])
                variances.append(self.__variances[index_cluster])
                pic.append(self.__pic[index_cluster])
                gaussians.append(self.__gaussians[index_cluster])
                rc.append(self.__rc[index_cluster])

        if len(self.__clusters) != len(clusters):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic
            self.__gaussians, self.__rc = gaussians, rc
            self.__amount_clusters = len(self.__clusters)


    def __notify(self):
        """!
        @brief Passes current means, variances and clusters to the observer if it is specified.

        """
        if self.__observer is not None:
            self.__observer.notify(self.__means, self.__variances, self.__clusters)


    def __extract_clusters(self):
        """!
        @brief Assigns each point to the cluster with the highest probability (the first one in case of equal
                probabilities) and erases clusters that have not received any point.

        """
        self.__clusters = [[] for _ in range(self.__amount_clusters)]
        for index_point in range(len(self.__data)):
            index_winner = max(range(self.__amount_clusters),
                               key=lambda index_cluster: self.__rc[index_cluster][index_point])
            self.__clusters[index_winner].append(index_point)

        self.__erase_empty_clusters()


    def __log_likelihood(self):
        """!
        @brief Calculates log-likelihood of the current model; points with non-positive mixture value are skipped.

        @return (float) Log-likelihood estimation.

        """
        likelihood = 0.0

        for index_point in range(len(self.__data)):
            particle = 0.0
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point]

            if particle > 0.0:
                likelihood += numpy.log(particle)

        return likelihood


    def __probabilities(self, index_cluster, index_point):
        """!
        @brief Calculates probability that the point belongs to the cluster.

        @param[in] index_cluster (uint): Index of the cluster.
        @param[in] index_point (uint): Index of the point.

        @return (float) Weighted gaussian of the cluster divided by sum of weighted gaussians of all clusters, or
                         1.0 when this sum is zero or infinite.

        """
        divider = 0.0
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point]

        if (divider != 0.0) and (divider != float('inf')):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider

        return 1.0


    def __expectation_step(self):
        """!
        @brief Calculates gaussians of each cluster for the data and updates probabilities of points.

        """
        self.__gaussians = [ gaussian(self.__data, self.__means[index], self.__variances[index])
                             for index in range(self.__amount_clusters) ]

        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ]
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point)


    def __maximization_step(self):
        """!
        @brief Updates weights, means and variances of clusters using current probabilities.
        @details Cluster whose probabilities sum up to zero is removed. Its probabilities and gaussians are removed
                  as well, so that all per-cluster containers stay aligned by index.

        """
        pic, means, variances = [], [], []
        rc, gaussians = [], []

        for index_cluster in range(self.__amount_clusters):
            cluster_rc = self.__rc[index_cluster]
            mc = numpy.sum(cluster_rc)

            # Impossible cluster: no point has non-zero probability to belong to it.
            if mc == 0.0:
                continue

            pic.append( mc / len(self.__data) )
            means.append( self.__update_mean(cluster_rc, mc) )
            variances.append( self.__update_covariance(means[-1], cluster_rc, mc) )
            rc.append(cluster_rc)
            gaussians.append(self.__gaussians[index_cluster])

        self.__pic, self.__means, self.__variances = pic, means, variances
        self.__rc, self.__gaussians = rc, gaussians
        self.__amount_clusters = len(means)


    def __get_stop_condition(self):
        """!
        @return (bool) True if norm of at least one variance is equal to zero.

        """
        for covariance in self.__variances:
            if numpy.linalg.norm(covariance) == 0.0:
                return True

        return False


    def __update_covariance(self, means, rc, mc):
        """!
        @brief Calculates covariance of a cluster as probability-weighted sum of outer products of deviations
                from the mean divided by 'mc'.

        @param[in] means (numpy.array): Mean of the cluster.
        @param[in] rc (list): Probabilities of points to belong to the cluster.
        @param[in] mc (float): Sum of the probabilities.

        """
        covariance = 0.0
        for index_point in range(len(self.__data)):
            deviation = numpy.array([self.__data[index_point] - means])
            covariance += rc[index_point] * deviation.T.dot(deviation)

        covariance = covariance / mc
        return covariance


    def __update_mean(self, rc, mc):
        """!
        @brief Calculates mean of a cluster as probability-weighted sum of points divided by 'mc'.

        @param[in] rc (list): Probabilities of points to belong to the cluster.
        @param[in] mc (float): Sum of the probabilities.

        """
        mean = 0.0
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point]

        mean = mean / mc
        return mean


    def __normalize_probabilities(self):
        """!
        @brief Scales probabilities of each point whose sum over clusters differs from 1.0 by more than 0.000001.

        """
        for index_point in range(len(self.__data)):
            probability = 0.0
            for index_cluster in range(len(self.__clusters)):
                probability += self.__rc[index_cluster][index_point]

            if abs(probability - 1.0) > 0.000001:
                self.__normalize_probability(index_point, probability)


    def __normalize_probability(self, index_point, probability):
        """!
        @brief Divides probabilities of the point by 'probability'; nothing is done when it is equal to zero.

        """
        if probability == 0.0:
            return

        normalization = 1.0 / probability

        for index_cluster in range(len(self.__clusters)):
            self.__rc[index_cluster][index_point] *= normalization


    def __verify_arguments(self):
        """!
        @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.

        @exception ValueError If input data is empty or amount of clusters is less than 1.

        """
        if len(self.__data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self.__data))

        if self.__amount_clusters < 1:
            raise ValueError("Amount of clusters (current value '%d') should be greater or equal to 1." %
                             self.__amount_clusters)
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:390
pyclustering module for cluster analysis.
Definition: __init__.py:1
def __init__(self, sample, amount)
Constructs EM initializer.
Definition: ema.py:125
The module contains K-Means algorithm and other related services.
Definition: kmeans.py:1
def __update_covariance(self, means, rc, mc)
Definition: ema.py:702
def __probabilities(self, index_cluster, index_point)
Definition: ema.py:651
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None)
Animates clustering process that is performed by EM algorithm.
Definition: ema.py:383
def __extract_clusters(self)
Definition: ema.py:624
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
Definition: ema.py:443
def get_covariances(self)
Definition: ema.py:570
def __get_stop_condition(self)
Definition: ema.py:694
def gaussian(data, mean, covariance)
Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covar...
Definition: ema.py:48
def notify(self, means, covariances, clusters)
This method is used by the algorithm to notify observer about changes where the algorithm should prov...
Definition: ema.py:323
def __expectation_step(self)
Definition: ema.py:662
Observer of EM algorithm for collecting algorithm state on each step.
Definition: ema.py:235
Class implements K-Means clustering algorithm.
Definition: kmeans.py:272
def __calculate_initial_clusters(self, centers)
Calculate Euclidean distance to each point from the each cluster.
Definition: ema.py:156
def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION)
Calculates initial parameters for EM algorithm: means and covariances using specified strategy...
Definition: ema.py:137
def __init__(self)
Initializes EM observer.
Definition: ema.py:273
def show_clusters(clusters, sample, covariances, means, figure=None, display=True)
Draws clusters and in case of two-dimensional dataset draws their ellipses.
Definition: ema.py:348
def __normalize_probabilities(self)
Definition: ema.py:721
Provides services for preparing initial means and covariances for Expectation-Maximization algorithm...
Definition: ema.py:102
def __update_mean(self, rc, mc)
Definition: ema.py:712
def __maximization_step(self)
Definition: ema.py:673
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def process(self)
Run clustering process of the algorithm.
Definition: ema.py:523
def get_clusters(self)
Definition: ema.py:552
def __verify_arguments(self)
Verify input parameters for the algorithm and throw exception in case of incorrectness.
Definition: ema.py:741
def get_probabilities(self)
Returns 2-dimensional list with belong probability of each object from data to cluster correspondingl...
Definition: ema.py:579
Visualizer of EM algorithm's results.
Definition: ema.py:339
def __calculate_initial_covariances(self, initial_clusters)
Definition: ema.py:180
Enumeration of initialization types for Expectation-Maximization algorithm.
Definition: ema.py:84
def __log_likelihood(self)
Definition: ema.py:637
def __normalize_probability(self, index_point, probability)
Definition: ema.py:731
def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100)
Initializes Expectation-Maximization algorithm for cluster analysis.
Definition: ema.py:480
def __erase_empty_clusters(self)
Definition: ema.py:601