1 """!
2 
3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4 @details Implementation based on paper @cite article::ema::1.
5 
6 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @date 2014-2018
8 @copyright GNU Public License
9 
10 @cond GNU_PUBLIC_LICENSE
11  PyClustering is free software: you can redistribute it and/or modify
12  it under the terms of the GNU General Public License as published by
13  the Free Software Foundation, either version 3 of the License, or
14  (at your option) any later version.
15 
16  PyClustering is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with this program. If not, see <http://www.gnu.org/licenses/>.
23 @endcond
24 
25 """


import numpy
import random
import warnings

from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans

from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square

from enum import IntEnum

try:
    import matplotlib.pyplot as plt
    import matplotlib.animation as animation
    from matplotlib import patches
except Exception as error_instance:
    warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
                  "functionality is not available (details: '%s')." % str(error_instance))

def gaussian(data, mean, covariance):
    """!
    @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or
            covariance in case of multi-dimensional data.

    @param[in] data (list): Data that is used for gaussian calculation.
    @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
    @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.

    @return (list) Value of gaussian function for each point in dataset.

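    Example of the gaussian calculation for two-dimensional data (a minimal usage sketch; the dataset,
    mean and covariance below are illustrative values only):

    @code
        import numpy
        from pyclustering.cluster.ema import gaussian

        data = numpy.array([[0.1, 0.2], [0.3, 0.4], [2.5, 2.7]])
        mean = numpy.array([0.2, 0.3])
        covariance = numpy.array([[0.04, 0.0], [0.0, 0.04]])

        # one density value is returned for each point of the dataset
        densities = gaussian(data, mean, covariance)
    @endcode
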
59  """
60  dimension = float(len(data[0]))
61 
62  if dimension != 1.0:
63  inv_variance = numpy.linalg.pinv(covariance)
64  else:
65  inv_variance = 1.0 / covariance
66 
67  divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))
68  if divider != 0.0:
69  right_const = 1.0 / divider
70  else:
71  right_const = float('inf')
72 
73  result = []
74 
75  for point in data:
76  mean_delta = point - mean
77  point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) )
78  result.append(point_gaussian)
79 
80  return result
81 
82 
83 
class ema_init_type(IntEnum):
    """!
    @brief Enumeration of initialization types for Expectation-Maximization algorithm.

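    Example of strategy selection (a minimal sketch, assuming that a two-dimensional dataset 'sample'
    is already loaded):

    @code
        from pyclustering.cluster.ema import ema_initializer, ema_init_type

        # prepare initial means and covariances using the random strategy
        initial_means, initial_covariances = ema_initializer(sample, 3).initialize(ema_init_type.RANDOM_INITIALIZATION)
    @endcode
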
88  """
89 
90 
92  RANDOM_INITIALIZATION = 0
93 
94 
98  KMEANS_INITIALIZATION = 1
99 
100 
101 
103  """!
104  @brief Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm.
105  @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and
106  kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy:
107 
108  @code
109  from pyclustering.utils import read_sample;
110  from pyclustering.samples.definitions import FAMOUS_SAMPLES;
111  from pyclustering.cluster.ema import ema_initializer;
112 
113  sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL);
114  amount_clusters = 2;
115 
116  initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize(initializer);
117  print(initial_means);
118  print(initial_covariance);
119  @endcode
120 
121  """

    __MAX_GENERATION_ATTEMPTS = 10

    def __init__(self, sample, amount):
        """!
        @brief Constructs EM initializer.

        @param[in] sample (list): Data that will be used by the EM algorithm.
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.

        """
        self.__sample = sample
        self.__amount = amount


    def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION):
        """!
        @brief Calculates initial parameters for EM algorithm: means and covariances using the specified strategy.

        @param[in] init_type (ema_init_type): Strategy for initialization.

        @return (float|list, float|numpy.array) Initial means and variances (covariance matrices in case of multi-dimensional data).

        """
        if init_type == ema_init_type.KMEANS_INITIALIZATION:
            return self.__initialize_kmeans()

        elif init_type == ema_init_type.RANDOM_INITIALIZATION:
            return self.__initialize_random()

        raise NameError("Unknown type of EM algorithm initialization is specified.")


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Calculates the Euclidean distance from each point to each center and captures each point
                by the nearest center, updating the clusters as a result.

        @return (list) Updated clusters as a list of clusters, where each cluster contains indexes of objects from the data.

        """

        clusters = [[] for _ in range(len(centers))]
        for index_point in range(len(self.__sample)):
            index_optim, dist_optim = -1, 0.0

            for index in range(len(centers)):
                dist = euclidean_distance_square(self.__sample[index_point], centers[index])

                if (dist < dist_optim) or (index == 0):
                    index_optim, dist_optim = index, dist

            clusters[index_optim].append(index_point)

        return clusters


    def __calculate_initial_covariances(self, initial_clusters):
        covariances = []
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar=False))
            else:
                dimension = len(self.__sample[0])
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return covariances


    def __initialize_random(self):
        initial_means = []

        for _ in range(self.__amount):
            mean = self.__sample[random.randint(0, len(self.__sample) - 1)]
            attempts = 0
            while (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
                mean = self.__sample[random.randint(0, len(self.__sample) - 1)]
                attempts += 1

            if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
                mean = [value + (random.random() - 0.5) * value * 0.2 for value in mean]

            initial_means.append(mean)

        initial_clusters = self.__calculate_initial_clusters(initial_means)
        initial_covariance = self.__calculate_initial_covariances(initial_clusters)

        return initial_means, initial_covariance


    def __initialize_kmeans(self):
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore=True)
        kmeans_instance.process()

        means = kmeans_instance.get_centers()

        covariances = []
        initial_clusters = kmeans_instance.get_clusters()
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar=False))
            else:
                dimension = len(self.__sample[0])
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return means, covariances



class ema_observer:
236  """!
237  @brief Observer of EM algorithm for collecting algorithm state on each step.
238  @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters,
239  means and covariances are stored in observer on each step. Here an example of usage:
240 
241  @code
242  from pyclustering.cluster.ema import ema, ema_observer;
243  from pyclustering.utils import read_sample;
244  from pyclustering.samples.definitions import SIMPLE_SAMPLES;
245 
246  # Read data from text file
247  sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
248 
249  # Create EM observer
250  observer = ema_observer();
251 
252  # Create EM algorithm to allocated four clusters and pass observer to it
253  ema_instance = ema(sample, 4, observer=observer);
254 
255  # Run clustering process
256  ema_instance.process();
257 
258  # Print amount of steps that were done by the algorithm
259  print("EMA steps:", observer.get_iterations());
260 
261  # Print evolution of means and covariances
262  print("Means evolution:", observer.get_evolution_means());
263  print("Covariances evolution:", observer.get_evolution_covariances());
264 
265  # Print evolution of clusters
266  print("Clusters evolution:", observer.get_evolution_clusters());
267 
268  # Print final clusters
269  print("Allocated clusters:", observer.get_evolution_clusters()[-1]);
270  @endcode
271 
272  """
    def __init__(self):
        """!
        @brief Initializes EM observer.

        """
        self.__means_evolution = []
        self.__covariances_evolution = []
        self.__clusters_evolution = []


    def __len__(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_iterations(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)


    def get_evolution_means(self):
300  """!
301  @return (list) Mean of each cluster on each step of clustering.
302 
303  """
304  return self.__means_evolution
305 
306 
308  """!
309  @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.
310 
311  """
312  return self.__covariances_evolution
313 
314 
316  """!
317  @return (list) Allocated clusters on each step of clustering.
318 
319  """
320  return self.__clusters_evolution
321 
322 
    def notify(self, means, covariances, clusters):
        """!
        @brief This method is used by the algorithm to notify the observer about changes: on each step
                the algorithm provides new values of means, covariances and allocated clusters.

        @param[in] means (list): Mean of each cluster on the current step.
        @param[in] covariances (list): Covariances of each cluster on the current step.
        @param[in] clusters (list): Allocated clusters on the current step.

        """
        self.__means_evolution.append(means)
        self.__covariances_evolution.append(covariances)
        self.__clusters_evolution.append(clusters)



class ema_visualizer:
340  """!
341  @brief Visualizer of EM algorithm's results.
342  @details Provides services for visualization of particular features of the algorithm, for example,
343  in case of two-dimensional dataset it shows covariance ellipses.
344 
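    Example of drawing clustering results (a minimal usage sketch):

    @code
        from pyclustering.cluster.ema import ema, ema_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FAMOUS_SAMPLES

        sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)

        ema_instance = ema(sample, 2)
        ema_instance.process()

        ema_visualizer.show_clusters(ema_instance.get_clusters(), sample,
                                     ema_instance.get_covariances(), ema_instance.get_centers())
    @endcode
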
345  """
346 
347  @staticmethod
348  def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
349  """!
350  @brief Draws clusters and in case of two-dimensional dataset draws their ellipses.
351 
352  @param[in] clusters (list): Clusters that were allocated by the algorithm.
353  @param[in] sample (list): Dataset that were used for clustering.
354  @param[in] covariances (list): Covariances of the clusters.
355  @param[in] means (list): Means of the clusters.
356  @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used
357  for visualization.
358  @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
359  shown manually using matplotlib function 'plt.show()'.
360 
361  @return (figure) Figure where clusters were drawn.
362 
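        Example of saving the drawn clusters to a file instead of displaying them (a minimal sketch,
        assuming that 'clusters', 'sample', 'covariances' and 'means' were obtained from a processed
        'ema' instance; the file name is illustrative):

        @code
            figure = ema_visualizer.show_clusters(clusters, sample, covariances, means, display=False)
            figure.savefig("ema_clusters.png")
        @endcode
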
363  """
364 
365  visualizer = cluster_visualizer()
366  visualizer.append_clusters(clusters, sample)
367 
368  if figure is None:
369  figure = visualizer.show(display = False)
370  else:
371  visualizer.show(figure = figure, display = False)
372 
373  if len(sample[0]) == 2:
374  ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)
375 
376  if display is True:
377  plt.show()
378 
379  return figure
380 
381 
    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None):
        """!
        @brief Animates clustering process that is performed by EM algorithm.

        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (ema_observer): EM observer that was used for collecting information about the clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering a movie only).
        @param[in] save_movie (string): If specified, the animation is stored to the file with the given name instead of being shown.

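        Example of rendering the animation to a movie file (a minimal sketch; saving requires the
        'ffmpeg' writer to be available, and the observer has to be filled during clustering; the
        output file name is illustrative):

        @code
            from pyclustering.cluster.ema import ema, ema_observer, ema_visualizer
            from pyclustering.utils import read_sample
            from pyclustering.samples.definitions import FAMOUS_SAMPLES

            sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)

            observer = ema_observer()
            ema_instance = ema(sample, 2, observer=observer)
            ema_instance.process()

            ema_visualizer.animate_cluster_allocation(sample, observer, save_movie="ema_process.mp4")
        @endcode
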
393  """
394 
395  figure = plt.figure()
396 
397  def init_frame():
398  return frame_generation(0)
399 
400  def frame_generation(index_iteration):
401  figure.clf()
402 
403  figure.suptitle("EM algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold')
404 
405  clusters = observer.get_evolution_clusters()[index_iteration]
406  covariances = observer.get_evolution_covariances()[index_iteration]
407  means = observer.get_evolution_means()[index_iteration]
408 
409  ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False)
410  figure.subplots_adjust(top = 0.85)
411 
412  return [ figure.gca() ]
413 
414  iterations = len(observer)
415  cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)
416 
417  if save_movie is not None:
418  cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500)
419  else:
420  plt.show()
421 
422 
    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        ax = figure.get_axes()[0]

        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index])
            color = visualizer.get_cluster_color(index, 0)

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        if (width > 0.0) and (height > 0.0):
            ax.plot(x, y, color=color, marker='x', markersize=6)
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=True, zorder=2, color=color)
            ax.add_patch(ellipse)



class ema:
    """!
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
    @details The algorithm provides only clustering services (unsupervised learning).
              Here is an example of data clustering:
    @code
        from pyclustering.cluster.ema import ema, ema_observer, ema_visualizer, ema_initializer, ema_init_type
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FAMOUS_SAMPLES

        # Read dataset from text file
        sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)

        # Amount of clusters that should be allocated
        amount = 2

        # Prepare initial means and covariances using the K-Means initializer
        initializer = ema_init_type.KMEANS_INITIALIZATION
        initial_means, initial_covariance = ema_initializer(sample, amount).initialize(initializer)

        # Let's create an observer to see the clustering process
        observer = ema_observer()

        # Create instance of the EM algorithm
        ema_instance = ema(sample, amount, initial_means, initial_covariance, observer=observer)

        # Run clustering process
        ema_instance.process()

        # Extract clusters
        clusters = ema_instance.get_clusters()
        print("Obtained clusters:", clusters)

        # Display allocated clusters using the visualizer
        covariances = ema_instance.get_covariances()
        means = ema_instance.get_centers()
        ema_visualizer.show_clusters(clusters, sample, covariances, means)

        # Show animation of the clustering process
        ema_visualizer.animate_cluster_allocation(sample, observer)
    @endcode

    Here are the clustering results of the Expectation-Maximization algorithm obtained on the popular sample
    'OldFaithful'. Initial random means and covariances were used in the example. The first step is presented
    on the left side of the figure and the final result (the last step) is on the right side:
    @image html ema_old_faithful_clustering.png

    @see ema_visualizer
    @see ema_observer

    """
    def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100):
        """!
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.

        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by a list of coordinates.
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
        @param[in] means (list): Initial means of clusters (the amount of means should be equal to the amount of clusters for allocation).
                    If this parameter is 'None' then the K-Means algorithm with the K-Means++ method is used for initialization by default.
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). The amount of
                    covariances should be equal to the amount of clusters that should be allocated. If this parameter is 'None' then
                    the K-Means algorithm with the K-Means++ method is used for initialization by default.
        @param[in] observer (ema_observer): Observer for gathering information about the clustering process.
        @param[in] tolerance (float): Defines the stop condition of the algorithm (when the difference between the current and
                    the previous log-likelihood estimation is less than 'tolerance', clustering is over).
        @param[in] iterations (uint): Additional stop condition parameter that defines the maximum number of steps that can be
                    performed by the algorithm during the clustering process.

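        Example of algorithm creation with explicitly prepared initial parameters (a minimal sketch,
        assuming that dataset 'sample' is already loaded; tolerance and iterations values are illustrative):

        @code
            from pyclustering.cluster.ema import ema, ema_initializer, ema_init_type

            initial_means, initial_covariances = ema_initializer(sample, 2).initialize(ema_init_type.RANDOM_INITIALIZATION)
            ema_instance = ema(sample, 2, initial_means, initial_covariances, tolerance=0.0001, iterations=50)
        @endcode
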
508  """
509 
510  self.__data = numpy.array(data)
511  self.__amount_clusters = amount_clusters
512  self.__tolerance = tolerance
513  self.__iterations = iterations
514  self.__observer = observer
515 
516  self.__means = means
517  self.__variances = variances
518 
519  if (means is None) or (variances is None):
520  self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION)
521 
522  if len(self.__means) != amount_clusters:
523  self.__amount_clusters = len(self.__means)
524 
525  self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ]
526  self.__pic = [1.0] * amount_clusters
527  self.__clusters = []
528  self.__gaussians = [ [] for _ in range(amount_clusters) ]
529  self.__stop = False
530 
531 
    def process(self):
        """!
        @brief Runs the clustering process of the algorithm.
        @details This method should be called before calling 'get_clusters()'.

        """

        previous_likelihood = -200000
        current_likelihood = -100000

        current_iteration = 0
        while (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations):
            self.__expectation_step()
            self.__maximization_step()

            current_iteration += 1

            self.__extract_clusters()
            self.__notify()

            previous_likelihood = current_likelihood
            current_likelihood = self.__log_likelihood()
            self.__stop = self.__get_stop_condition()

        self.__normalize_probabilities()


    def get_clusters(self):
        """!
        @return (list) Allocated clusters where each cluster is represented by a list of indexes of points from the dataset,
                for example, two clusters may have the following representation: [[0, 1, 4], [2, 3, 5, 6]].

        """
        return self.__clusters


    def get_centers(self):
        """!
        @return (list) Corresponding centers (means) of clusters.

        """

        return self.__means


    def get_covariances(self):
        """!
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.

        """

        return self.__variances


    def get_probabilities(self):
        """!
        @brief Returns 2-dimensional list with membership probability of each object from data to cluster correspondingly,
                where the first index is for clusters and the second is for points.

        @code
            # Get membership probabilities
            probabilities = ema_instance.get_probabilities()

            # Show probability of the fifth element in the first and in the second cluster
            index_point = 5
            print("Probability in the first cluster:", probabilities[0][index_point])
            print("Probability in the second cluster:", probabilities[1][index_point])
        @endcode

        @return (list) 2-dimensional list with membership probability of each object from data to cluster.

        """

        return self.__rc


    def __erase_empty_clusters(self):
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []

        for index_cluster in range(len(self.__clusters)):
            if len(self.__clusters[index_cluster]) > 0:
                clusters.append(self.__clusters[index_cluster])
                means.append(self.__means[index_cluster])
                variances.append(self.__variances[index_cluster])
                pic.append(self.__pic[index_cluster])
                gaussians.append(self.__gaussians[index_cluster])
                rc.append(self.__rc[index_cluster])

        if len(self.__clusters) != len(clusters):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic
            self.__gaussians, self.__rc = gaussians, rc
            self.__amount_clusters = len(self.__clusters)


    def __notify(self):
        if self.__observer is not None:
            self.__observer.notify(self.__means, self.__variances, self.__clusters)


    def __extract_clusters(self):
        self.__clusters = [[] for _ in range(self.__amount_clusters)]
        for index_point in range(len(self.__data)):
            candidates = []
            for index_cluster in range(self.__amount_clusters):
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]))

            index_winner = max(candidates, key=lambda candidate: candidate[1])[0]
            self.__clusters[index_winner].append(index_point)

        self.__erase_empty_clusters()


    def __log_likelihood(self):
        # Log-likelihood of the mixture: sum over points of log(sum over clusters of pic[k] * N(x | mean_k, covariance_k)).
        likelihood = 0.0

        for index_point in range(len(self.__data)):
            particle = 0.0
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point]

            if particle > 0.0:
                likelihood += numpy.log(particle)

        return likelihood


    def __probabilities(self, index_cluster, index_point):
        # Responsibility of the cluster for the point: pic[k] * N(x | mean_k, covariance_k), normalized over all clusters.
        divider = 0.0
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point]

        if (divider != 0.0) and (divider != float('inf')):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider

        return 1.0


    def __expectation_step(self):
        # E-step: calculate gaussians for each cluster and update responsibilities of clusters for each point.
        self.__gaussians = [[] for _ in range(self.__amount_clusters)]
        for index in range(self.__amount_clusters):
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index])

        self.__rc = [[0.0] * len(self.__data) for _ in range(self.__amount_clusters)]
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point)


    def __maximization_step(self):
        # M-step: update mixture weights, means and covariances using the responsibilities from the E-step.
        self.__pic = []
        self.__means = []
        self.__variances = []

        amount_impossible_clusters = 0

        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster])

            if mc == 0.0:
                amount_impossible_clusters += 1
                continue

            self.__pic.append(mc / len(self.__data))
            self.__means.append(self.__update_mean(self.__rc[index_cluster], mc))
            self.__variances.append(self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc))

        self.__amount_clusters -= amount_impossible_clusters


    def __get_stop_condition(self):
        for covariance in self.__variances:
            if numpy.linalg.norm(covariance) == 0.0:
                return True

        return False


    def __update_covariance(self, means, rc, mc):
        # Weighted covariance: sum over points of rc[n] * (x_n - mean) * (x_n - mean)^T, divided by mc.
        covariance = 0.0
        for index_point in range(len(self.__data)):
            deviation = numpy.array([self.__data[index_point] - means])
            covariance += rc[index_point] * deviation.T.dot(deviation)

        covariance = covariance / mc
        return covariance


    def __update_mean(self, rc, mc):
        # Weighted mean: sum over points of rc[n] * x_n, divided by mc.
        mean = 0.0
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point]

        mean = mean / mc
        return mean


    def __normalize_probabilities(self):
        for index_point in range(len(self.__data)):
            probability = 0.0
            for index_cluster in range(len(self.__clusters)):
                probability += self.__rc[index_cluster][index_point]

            if abs(probability - 1.0) > 0.000001:
                self.__normalize_probability(index_point, probability)


    def __normalize_probability(self, index_point, probability):
        if probability == 0.0:
            return

        normalization = 1.0 / probability

        for index_cluster in range(len(self.__clusters)):
            self.__rc[index_cluster][index_point] *= normalization