ema.py
1 """!
2 
3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4 @details Implementation based on paper @cite article::ema::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2019
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """
26 
27 
28 import numpy
29 import random
30 import warnings
31 
32 from pyclustering.cluster import cluster_visualizer
33 from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
34 from pyclustering.cluster.kmeans import kmeans
35 
36 from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
37 
38 from enum import IntEnum
39 
40 try:
41  import matplotlib.pyplot as plt
42  import matplotlib.animation as animation
43  from matplotlib import patches
44 except Exception as error_instance:
45  warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
46  "functionality is not available (details: '%s')." % str(error_instance))
47 
def gaussian(data, mean, covariance):
    """!
    @brief Evaluates a gaussian-shaped function at every point of a dataset for the given mean
            (mathematical expectation) and variance (covariance matrix for multi-dimensional data).
    @details The exponent is the usual quadratic form '-0.5 * (x - mean) * inv(covariance) * (x - mean)^T'.
              The normalizing factor is '1 / ((2 * pi) ^ (d / 2) * sqrt(norm(covariance)))', where 'norm' is
              'numpy.linalg.norm' with default arguments. NOTE(review): the textbook multivariate normal
              density uses the determinant of the covariance here, not its norm - behaviour is kept as is.

    @param[in] data (list): Points to evaluate; the dimension is taken from the length of the first point.
    @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
    @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.

    @return (list) Value of the function for each point, in the order of 'data'.

    """
    dimension = float(len(data[0]))

    # One-dimensional data only needs a reciprocal; otherwise a pseudo-inverse is used so that
    # singular covariance matrices do not raise.
    if dimension == 1.0:
        inv_variance = 1.0 / covariance
    else:
        inv_variance = numpy.linalg.pinv(covariance)

    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))

    # A zero divider (all-zero covariance) is mapped to an infinite factor instead of dividing by zero.
    right_const = 1.0 / divider if divider != 0.0 else float('inf')

    def density(point):
        offset = point - mean
        exponent = -0.5 * offset.dot(inv_variance).dot(numpy.transpose(offset))
        return right_const * numpy.exp(exponent)

    return [density(point) for point in data]
81 
82 
83 
class ema_init_type(IntEnum):
    """!
    @brief Enumeration of initialization types for Expectation-Maximization algorithm.
    @details The value is passed to 'ema_initializer.initialize()' to select how the initial means and
              covariances are produced.

    """

    # Means are points drawn at random from the input sample; covariances are then computed from the
    # points that are nearest to each chosen mean.
    RANDOM_INITIALIZATION = 0

    # Initial centers are produced by K-Means++, then K-Means is run from these centers; the obtained
    # centers are used as means and the obtained clusters are used to compute covariances.
    KMEANS_INITIALIZATION = 1
99 
100 
101 
class ema_initializer:
    """!
    @brief Provides services for preparing initial means and covariances for Expectation-Maximization algorithm.
    @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and
              kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy:

    @code
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FAMOUS_SAMPLES
        from pyclustering.cluster.ema import ema_initializer

        sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
        amount_clusters = 2

        initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize()
        print(initial_means)
        print(initial_covariance)
    @endcode

    """

    # Upper bound on re-draws when a randomly chosen mean duplicates an already chosen one.
    __MAX_GENERATION_ATTEMPTS = 10

    def __init__(self, sample, amount):
        """!
        @brief Constructs EM initializer.

        @param[in] sample (list): Data that will be used by the EM algorithm.
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.

        """
        self.__sample = sample
        self.__amount = amount


    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
        """!
        @brief Calculates initial parameters for EM algorithm: means and covariances using
                specified strategy.

        @param[in] init_type (ema_init_type): Strategy for initialization.

        @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data).

        @exception NameError if 'init_type' is neither of the supported strategies.

        """
        if init_type == ema_init_type.KMEANS_INITIALIZATION:
            return self.__initialize_kmeans()

        if init_type == ema_init_type.RANDOM_INITIALIZATION:
            return self.__initialize_random()

        raise NameError("Unknown type of EM algorithm initialization is specified.")


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Assigns every point of the sample to its nearest center (squared Euclidean distance).
        @details When several centers are equally close the one with the smallest index wins.

        @param[in] centers (list): Centers that capture points.

        @return (list) Clusters, one per center; each cluster contains indexes of objects from data.

        """
        clusters = [[] for _ in range(len(centers))]
        for index_point, point in enumerate(self.__sample):
            index_optim, dist_optim = -1, 0.0

            for index, center in enumerate(centers):
                dist = euclidean_distance_square(point, center)

                # The first center is always accepted to seed the running minimum.
                if (index == 0) or (dist < dist_optim):
                    index_optim, dist_optim = index, dist

            clusters[index_optim].append(index_point)

        return clusters


    def __calculate_initial_covariances(self, initial_clusters):
        """!
        @brief Calculates a covariance for each cluster.
        @details A cluster with more than one point gets 'numpy.cov' of its points (columns are variables).
                  Smaller clusters get a matrix whose every entry is the same random value from [0, 0.1),
                  because a sample covariance cannot be estimated from a single point.

        @param[in] initial_clusters (list): Clusters as lists of point indexes.

        @return (list) Covariance for each cluster in the order of 'initial_clusters'.

        """
        dimension = len(self.__sample[0])

        covariances = []
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar = False))
            else:
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return covariances


    def __pick_random_point(self):
        """!
        @brief Returns a point of the sample chosen uniformly at random.

        """
        return self.__sample[random.randrange(len(self.__sample))]


    def __initialize_random(self):
        """!
        @brief Random initialization: means are random points of the sample, covariances are calculated
                from the points nearest to each mean.
        @details A drawn point that is already used as a mean is re-drawn up to '__MAX_GENERATION_ATTEMPTS'
                  times; when the attempts are exhausted the last drawn point is perturbed by up to 10% of
                  each coordinate instead.

        @return (list, list) Initial means and covariances.

        """
        initial_means = []

        for _ in range(self.__amount):
            mean = self.__pick_random_point()
            attempts = 0
            while (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
                mean = self.__pick_random_point()
                attempts += 1

            if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
                mean = [value + (random.random() - 0.5) * value * 0.2 for value in mean]

            initial_means.append(mean)

        initial_clusters = self.__calculate_initial_clusters(initial_means)
        initial_covariance = self.__calculate_initial_covariances(initial_clusters)

        return initial_means, initial_covariance


    def __initialize_kmeans(self):
        """!
        @brief K-Means initialization: K-Means++ centers are refined by K-Means; resulting centers are used
                as means and resulting clusters are used for covariances.

        @return (list, list) Initial means and covariances.

        """
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True)
        kmeans_instance.process()

        means = kmeans_instance.get_centers()
        covariances = self.__calculate_initial_covariances(kmeans_instance.get_clusters())

        return means, covariances
232 
233 
234 
class ema_observer:
    """!
    @brief Observer of EM algorithm for collecting algorithm state on each step.
    @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters,
              means and covariances are stored in observer on each step. Here an example of usage:

    @code
        from pyclustering.cluster.ema import ema, ema_observer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read data from text file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Create EM observer.
        observer = ema_observer()

        # Create EM algorithm to allocate four clusters and pass observer to it.
        ema_instance = ema(sample, 4, observer=observer)

        # Run clustering process.
        ema_instance.process()

        # Print amount of steps that were done by the algorithm.
        print("EMA steps:", observer.get_iterations())

        # Print evolution of means and covariances.
        print("Means evolution:", observer.get_evolution_means())
        print("Covariances evolution:", observer.get_evolution_covariances())

        # Print evolution of clusters.
        print("Clusters evolution:", observer.get_evolution_clusters())

        # Print final clusters.
        print("Allocated clusters:", observer.get_evolution_clusters()[-1])
    @endcode

    """
    def __init__(self):
        """!
        @brief Initializes EM observer with empty history.

        """
        self.__means_evolution = []
        self.__covariances_evolution = []
        self.__clusters_evolution = []


    def __len__(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_iterations(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_evolution_means(self):
        """!
        @return (list) Mean of each cluster on each step of clustering.

        """
        return self.__means_evolution


    def get_evolution_covariances(self):
        """!
        @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.

        """
        return self.__covariances_evolution


    def get_evolution_clusters(self):
        """!
        @return (list) Allocated clusters on each step of clustering.

        """
        return self.__clusters_evolution


    def notify(self, means, covariances, clusters):
        """!
        @brief This method is used by the algorithm to notify observer about changes where the algorithm
                should provide new values: means, covariances and allocated clusters.
        @details The passed objects are appended to the history as they are (no copy is made).

        @param[in] means (list): Mean of each cluster on current step.
        @param[in] covariances (list): Covariances of each cluster on current step.
        @param[in] clusters (list): Allocated cluster on current step.

        """
        self.__means_evolution.append(means)
        self.__covariances_evolution.append(covariances)
        self.__clusters_evolution.append(clusters)
336 
337 
338 
class ema_visualizer:
    """!
    @brief Visualizer of EM algorithm's results.
    @details Provides services for visualization of particular features of the algorithm, for example,
              in case of two-dimensional dataset it shows covariance ellipses.

    """

    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
        """!
        @brief Draws clusters and in case of two-dimensional dataset draws their ellipses.

        @param[in] clusters (list): Clusters that were allocated by the algorithm.
        @param[in] sample (list): Dataset that were used for clustering.
        @param[in] covariances (list): Covariances of the clusters.
        @param[in] means (list): Means of the clusters.
        @param[in] figure (figure): If 'None' then new figure is created, otherwise specified figure is used
                    for visualization.
        @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
                    shown manually using matplotlib function 'plt.show()'.

        @return (figure) Figure where clusters were drawn.

        """

        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)

        if figure is None:
            figure = visualizer.show(display = False)
        else:
            visualizer.show(figure = figure, display = False)

        # Ellipses are drawn for two-dimensional data only (dimension is taken from the first point).
        if len(sample[0]) == 2:
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)

        if display is True:
            plt.show()

        return figure


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
        """!
        @brief Animates clustering process that is performed by EM algorithm.
        @details One frame is generated per iteration stored in the observer.

        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
        @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.

        """

        figure = plt.figure()

        def init_frame():
            return frame_generation(0)

        def frame_generation(index_iteration):
            # Every frame is redrawn from scratch on a cleared figure.
            figure.clf()

            figure.suptitle("EM algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold')

            clusters = observer.get_evolution_clusters()[index_iteration]
            covariances = observer.get_evolution_covariances()[index_iteration]
            means = observer.get_evolution_means()[index_iteration]

            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False)
            figure.subplots_adjust(top = 0.85)

            return [ figure.gca() ]

        iterations = len(observer)
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)

        if save_movie is not None:
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500)
        else:
            plt.show()


    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        """!
        @brief Draws one covariance ellipse per cluster on the first axes of the figure.

        """
        ax = figure.get_axes()[0]

        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index])
            color = visualizer.get_cluster_color(index, 0)

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        """!
        @brief Draws a center marker and a semi-transparent ellipse; nothing is drawn when width or
                height is not positive.

        """
        if (width > 0.0) and (height > 0.0):
            ax.plot(x, y, color=color, marker='x', markersize=6)
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=True, zorder=2, color=color)
            ax.add_patch(ellipse)
440 
441 
442 
class ema:
    """!
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
    @details The algorithm provides only clustering services (unsupervised learning).
              Here an example of data clustering process:
    @code
        from pyclustering.cluster.ema import ema, ema_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read data from text file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create EM algorithm to allocate three clusters.
        ema_instance = ema(sample, 3)

        # Run clustering process.
        ema_instance.process()

        # Get clustering results.
        clusters = ema_instance.get_clusters()
        covariances = ema_instance.get_covariances()
        means = ema_instance.get_centers()

        # Visualize obtained clustering results.
        ema_visualizer.show_clusters(clusters, sample, covariances, means)
    @endcode

    Here is clustering results of the Expectation-Maximization clustering algorithm where popular sample 'OldFaithful' was used.
    Initial random means and covariances were used in the example. The first step is presented on the left side of the figure and
    final result (the last step) is on the right side:
    @image html ema_old_faithful_clustering.png

    @see ema_visualizer
    @see ema_observer

    """
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
        """!
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.

        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates.
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
        @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
                    If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
                    covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
                    K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] observer (ema_observer): Observer for gathering information about clustering process.
        @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and
                    previous log-likelihood estimation is less than 'tolerance' then clustering is over).
        @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be
                    performed by the algorithm during clustering process.

        """

        self.__data = numpy.array(data)
        self.__tolerance = tolerance
        self.__iterations = iterations
        self.__observer = observer

        self.__means = means
        self.__variances = variances

        # If either of the initial parameters is missing, both are (re)generated by the initializer.
        if (means is None) or (variances is None):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION)

        # The amount of means is authoritative: every per-cluster container below is sized by it so that
        # the containers stay consistent even when it differs from 'amount_clusters'.
        self.__amount_clusters = len(self.__means)

        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ]
        self.__pic = [1.0] * self.__amount_clusters
        self.__clusters = []
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ]
        self.__stop = False


    def process(self):
        """!
        @brief Run clustering process of the algorithm.
        @details This method should be called before call 'get_clusters()'. Iterations stop when the change of
                  log-likelihood is not greater than tolerance, when the maximum amount of iterations is reached,
                  or when any covariance has zero norm.

        """

        # Sentinel values that only guarantee that the first tolerance check passes.
        previous_likelihood = -200000
        current_likelihood = -100000

        current_iteration = 0
        while (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations):
            self.__expectation_step()
            self.__maximization_step()

            current_iteration += 1

            self.__extract_clusters()
            self.__notify()

            previous_likelihood = current_likelihood
            current_likelihood = self.__log_likelihood()
            self.__stop = self.__get_stop_condition()

        # Make membership probabilities of every point sum to one before they are exposed to the user.
        self.__normalize_probabilities()


    def get_clusters(self):
        """!
        @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset,
                        for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]].

        """
        return self.__clusters


    def get_centers(self):
        """!
        @return (list) Corresponding centers (means) of clusters.

        """

        return self.__means


    def get_covariances(self):
        """!
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.

        """

        return self.__variances


    def get_probabilities(self):
        """!
        @brief Returns 2-dimensional list with belong probability of each object from data to cluster correspondingly,
                where that first index is for cluster and the second is for point.

        @code
            # Get belong probabilities
            probabilities = ema_instance.get_probabilities();

            # Show probability of the fifth element in the first and in the second cluster
            index_point = 5;
            print("Probability in the first cluster:", probabilities[0][index_point]);
            print("Probability in the second cluster:", probabilities[1][index_point]);
        @endcode

        @return (list) 2-dimensional list with belong probability of each object from data to cluster.

        """

        return self.__rc


    def __erase_empty_clusters(self):
        """!
        @brief Removes clusters without points together with their means, variances, weights, gaussians and
                probabilities so that all per-cluster containers keep the same indexing.

        """
        kept = [ index for index, cluster in enumerate(self.__clusters) if len(cluster) > 0 ]
        if len(kept) == len(self.__clusters):
            return

        self.__clusters = [ self.__clusters[index] for index in kept ]
        self.__means = [ self.__means[index] for index in kept ]
        self.__variances = [ self.__variances[index] for index in kept ]
        self.__pic = [ self.__pic[index] for index in kept ]
        self.__gaussians = [ self.__gaussians[index] for index in kept ]
        self.__rc = [ self.__rc[index] for index in kept ]
        self.__amount_clusters = len(self.__clusters)


    def __notify(self):
        """!
        @brief Passes current means, variances and clusters to the observer if it is specified.

        """
        if self.__observer is not None:
            self.__observer.notify(self.__means, self.__variances, self.__clusters)


    def __extract_clusters(self):
        """!
        @brief Assigns every point to the cluster with the highest membership probability (the lowest cluster
                index wins on ties) and then removes clusters that did not capture any point.

        """
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ]
        for index_point in range(len(self.__data)):
            index_winner = max(range(self.__amount_clusters), key = lambda index_cluster: self.__rc[index_cluster][index_point])
            self.__clusters[index_winner].append(index_point)

        self.__erase_empty_clusters()


    def __log_likelihood(self):
        """!
        @brief Calculates log-likelihood of the data for the current mixture.
        @details Points whose mixture value is not positive are skipped to avoid logarithm of zero.

        @return (float) Log-likelihood estimation.

        """
        likelihood = 0.0

        for index_point in range(len(self.__data)):
            particle = 0.0
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point]

            if particle > 0.0:
                likelihood += numpy.log(particle)

        return likelihood


    def __probabilities(self, index_cluster, index_point):
        """!
        @brief Calculates membership probability of the point to the cluster.

        @return (float) Weighted gaussian of the cluster divided by the sum of weighted gaussians of all clusters;
                         1.0 when that sum is zero or infinite.

        """
        divider = 0.0
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point]

        if (divider != 0.0) and (divider != float('inf')):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider

        return 1.0


    def __expectation_step(self):
        """!
        @brief E-step: evaluates gaussians of every cluster for every point and updates membership probabilities.

        """
        self.__gaussians = [ gaussian(self.__data, self.__means[index], self.__variances[index])
                             for index in range(self.__amount_clusters) ]

        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ]
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point)


    def __maximization_step(self):
        """!
        @brief M-step: updates weight, mean and covariance of every cluster using membership probabilities.
        @details A cluster whose total membership is zero is dropped. Its rows are removed from the
                  probabilities and gaussians as well, so that index 'i' refers to the same cluster in every
                  per-cluster container.

        """
        pic, means, variances, rc, gaussians = [], [], [], [], []

        for index_cluster in range(self.__amount_clusters):
            responsibilities = self.__rc[index_cluster]
            mc = numpy.sum(responsibilities)

            if mc == 0.0:
                continue

            mean = self.__update_mean(responsibilities, mc)

            pic.append( mc / len(self.__data) )
            means.append(mean)
            variances.append( self.__update_covariance(mean, responsibilities, mc) )
            rc.append(responsibilities)
            gaussians.append(self.__gaussians[index_cluster])

        self.__pic, self.__means, self.__variances = pic, means, variances
        self.__rc, self.__gaussians = rc, gaussians
        self.__amount_clusters = len(means)


    def __get_stop_condition(self):
        """!
        @return (bool) True if any covariance has zero norm, otherwise False.

        """
        for covariance in self.__variances:
            if numpy.linalg.norm(covariance) == 0.0:
                return True

        return False


    def __update_covariance(self, means, rc, mc):
        """!
        @brief Calculates probability-weighted covariance of the data around the specified mean.

        @param[in] means (numpy.array): Mean of the cluster.
        @param[in] rc (list): Membership probability of each point to the cluster.
        @param[in] mc (float): Sum of 'rc'.

        @return (numpy.array) Weighted covariance matrix.

        """
        covariance = 0.0
        for weight, point in zip(rc, self.__data):
            # Row vector (1 x d) so that 'deviation.T.dot(deviation)' is the d x d outer product.
            deviation = numpy.array( [ point - means ] )
            covariance += weight * deviation.T.dot(deviation)

        covariance = covariance / mc
        return covariance


    def __update_mean(self, rc, mc):
        """!
        @brief Calculates probability-weighted mean of the data.

        @param[in] rc (list): Membership probability of each point to the cluster.
        @param[in] mc (float): Sum of 'rc'.

        @return (numpy.array) Weighted mean.

        """
        mean = 0.0
        for weight, point in zip(rc, self.__data):
            mean += weight * point

        mean = mean / mc
        return mean


    def __normalize_probabilities(self):
        """!
        @brief Rescales membership probabilities of every point whose sum over clusters differs from 1.0
                by more than 0.000001.

        """
        for index_point in range(len(self.__data)):
            probability = 0.0
            for index_cluster in range(len(self.__clusters)):
                probability += self.__rc[index_cluster][index_point]

            if abs(probability - 1.0) > 0.000001:
                self.__normalize_probability(index_point, probability)


    def __normalize_probability(self, index_point, probability):
        """!
        @brief Divides membership probabilities of the point by their sum; a zero sum is left untouched.

        """
        if probability == 0.0:
            return

        normalization = 1.0 / probability

        for index_cluster in range(len(self.__clusters)):
            self.__rc[index_cluster][index_point] *= normalization
Common visualizer of clusters on 1D, 2D or 3D surface.
Definition: __init__.py:359
pyclustering module for cluster analysis.
Definition: __init__.py:1
def __init__(self, sample, amount)
Constructs EM initializer.
Definition: ema.py:125
Cluster analysis algorithm: K-Means.
Definition: kmeans.py:1
def __update_covariance(self, means, rc, mc)
Definition: ema.py:698
def __probabilities(self, index_cluster, index_point)
Definition: ema.py:647
Utils that are used by modules of pyclustering.
Definition: __init__.py:1
def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None)
Animates clustering process that is performed by EM algorithm.
Definition: ema.py:383
def __extract_clusters(self)
Definition: ema.py:620
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
Definition: ema.py:443
def get_covariances(self)
Definition: ema.py:566
def __get_stop_condition(self)
Definition: ema.py:690
def gaussian(data, mean, covariance)
Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covar...
Definition: ema.py:48
def notify(self, means, covariances, clusters)
This method is used by the algorithm to notify observer about changes where the algorithm should prov...
Definition: ema.py:323
def __expectation_step(self)
Definition: ema.py:658
Observer of EM algorithm for collecting algorithm state on each step.
Definition: ema.py:235
Class represents K-Means clustering algorithm.
Definition: kmeans.py:272
def __calculate_initial_clusters(self, centers)
Calculate Euclidean distance to each point from the each cluster.
Definition: ema.py:156
def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION)
Calculates initial parameters for EM algorithm: means and covariances using specified strategy...
Definition: ema.py:137
def __init__(self)
Initializes EM observer.
Definition: ema.py:273
def show_clusters(clusters, sample, covariances, means, figure=None, display=True)
Draws clusters and in case of two-dimensional dataset draws their ellipses.
Definition: ema.py:348
def __normalize_probabilities(self)
Definition: ema.py:717
Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm...
Definition: ema.py:102
def __update_mean(self, rc, mc)
Definition: ema.py:708
def __maximization_step(self)
Definition: ema.py:669
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def process(self)
Run clustering process of the algorithm.
Definition: ema.py:521
def get_clusters(self)
Definition: ema.py:548
def get_probabilities(self)
Returns 2-dimensional list with belong probability of each object from data to cluster correspondingl...
Definition: ema.py:575
Visualizer of EM algorithm&#39;s results.
Definition: ema.py:339
def __calculate_initial_covariances(self, initial_clusters)
Definition: ema.py:180
Enumeration of initialization types for Expectation-Maximization algorithm.
Definition: ema.py:84
def __log_likelihood(self)
Definition: ema.py:633
def __normalize_probability(self, index_point, probability)
Definition: ema.py:727
def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100)
Initializes Expectation-Maximization algorithm for cluster analysis.
Definition: ema.py:480
def __erase_empty_clusters(self)
Definition: ema.py:597