3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4 @details Implementation based on paper @cite article::ema::1.
6 @authors Andrei Novikov (pyclustering@yandex.ru)
8 @copyright BSD-3-Clause
20 from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
22 from enum
import IntEnum
24 import matplotlib.pyplot
as plt
25 import matplotlib.animation
as animation
26 from matplotlib
import patches
31 @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covariance in case
32 multi-dimensional data.
34 @param[in] data (list): Data that is used for gaussian calculation.
35 @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
36 @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.
38 @return (list) Value of gaussian function for each point in dataset.
41 dimension = float(len(data[0]))
44 inv_variance = numpy.linalg.pinv(covariance)
46 inv_variance = 1.0 / covariance
48 divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))
50 right_const = 1.0 / divider
52 right_const = float(
'inf')
57 mean_delta = point - mean
58 point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) )
59 result.append(point_gaussian)
67 @brief Enumeration of initialization types for Expectation-Maximization algorithm.
73 RANDOM_INITIALIZATION = 0
79 KMEANS_INITIALIZATION = 1
85 @brief Provides services for preparing initial means and covariances for Expectation-Maximization algorithm.
86 @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and
87 kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy:
90 from pyclustering.utils import read_sample
91 from pyclustering.samples.definitions import FAMOUS_SAMPLES
92 from pyclustering.cluster.ema import ema_initializer
94 sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
97 initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize()
99 print(initial_covariance)
104 __MAX_GENERATION_ATTEMPTS = 10
108 @brief Constructs EM initializer.
110 @param[in] sample (list): Data that will be used by the EM algorithm.
111 @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.
118 def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
120 @brief Calculates initial parameters for EM algorithm: means and covariances using
123 @param[in] init_type (ema_init_type): Strategy for initialization.
125 @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data).
128 if init_type == ema_init_type.KMEANS_INITIALIZATION:
131 elif init_type == ema_init_type.RANDOM_INITIALIZATION:
134 raise NameError(
"Unknown type of EM algorithm initialization is specified.")
137 def __calculate_initial_clusters(self, centers):
139 @brief Calculate Euclidean distance to each point from the each cluster.
140 @brief Nearest points are captured by according clusters and as a result clusters are updated.
142 @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
146 clusters = [[]
for _
in range(len(centers))]
147 for index_point
in range(len(self.
__sample)):
148 index_optim, dist_optim = -1, 0.0
150 for index
in range(len(centers)):
151 dist = euclidean_distance_square(self.
__sample[index_point], centers[index])
153 if (dist < dist_optim)
or (index == 0):
154 index_optim, dist_optim = index, dist
156 clusters[index_optim].append(index_point)
161 def __calculate_initial_covariances(self, initial_clusters):
163 for initial_cluster
in initial_clusters:
164 if len(initial_cluster) > 1:
165 cluster_sample = [self.
__sample[index_point]
for index_point
in initial_cluster]
166 covariances.append(numpy.cov(cluster_sample, rowvar=
False))
169 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
174 def __initialize_random(self):
180 while (mean
in initial_means)
and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
184 if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
185 mean = [ value + (random.random() - 0.5) * value * 0.2
for value
in mean ]
187 initial_means.append(mean)
192 return initial_means, initial_covariance
195 def __initialize_kmeans(self):
197 kmeans_instance =
kmeans(self.
__sample, initial_centers, ccore =
True)
198 kmeans_instance.process()
200 means = kmeans_instance.get_centers()
203 initial_clusters = kmeans_instance.get_clusters()
204 for initial_cluster
in initial_clusters:
205 if len(initial_cluster) > 1:
206 cluster_sample = [ self.
__sample[index_point]
for index_point
in initial_cluster ]
207 covariances.append(numpy.cov(cluster_sample, rowvar=
False))
210 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
212 return means, covariances
218 @brief Observer of EM algorithm for collecting algorithm state on each step.
219 @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters,
220 means and covariances are stored in observer on each step. Here an example of usage:
223 from pyclustering.cluster.ema import ema, ema_observer
224 from pyclustering.utils import read_sample
225 from pyclustering.samples.definitions import SIMPLE_SAMPLES
227 # Read data from text file.
228 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
230 # Create EM observer.
231 observer = ema_observer()
233 # Create EM algorithm to allocated four clusters and pass observer to it.
234 ema_instance = ema(sample, 4, observer=observer)
236 # Run clustering process.
237 ema_instance.process()
239 # Print amount of steps that were done by the algorithm.
240 print("EMA steps:", observer.get_iterations())
242 # Print evolution of means and covariances.
243 print("Means evolution:", observer.get_evolution_means())
244 print("Covariances evolution:", observer.get_evolution_covariances())
246 # Print evolution of clusters.
247 print("Clusters evolution:", observer.get_evolution_clusters())
249 # Print final clusters.
250 print("Allocated clusters:", observer.get_evolution_clusters()[-1])
256 @brief Initializes EM observer.
266 @return (uint) Amount of iterations that were done by the EM algorithm.
274 @return (uint) Amount of iterations that were done by the EM algorithm.
282 @return (list) Mean of each cluster on each step of clustering.
290 @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.
298 @return (list) Allocated clusters on each step of clustering.
304 def notify(self, means, covariances, clusters):
306 @brief This method is used by the algorithm to notify observer about changes where the algorithm
307 should provide new values: means, covariances and allocated clusters.
309 @param[in] means (list): Mean of each cluster on currect step.
310 @param[in] covariances (list): Covariances of each cluster on current step.
311 @param[in] clusters (list): Allocated cluster on current step.
322 @brief Visualizer of EM algorithm's results.
323 @details Provides services for visualization of particular features of the algorithm, for example,
324 in case of two-dimensional dataset it shows covariance ellipses.
329 def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
331 @brief Draws clusters and in case of two-dimensional dataset draws their ellipses.
333 @param[in] clusters (list): Clusters that were allocated by the algorithm.
334 @param[in] sample (list): Dataset that were used for clustering.
335 @param[in] covariances (list): Covariances of the clusters.
336 @param[in] means (list): Means of the clusters.
337 @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used
339 @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
340 shown manually using matplotlib function 'plt.show()'.
342 @return (figure) Figure where clusters were drawn.
347 visualizer.append_clusters(clusters, sample)
350 figure = visualizer.show(display =
False)
352 visualizer.show(figure = figure, display =
False)
354 if len(sample[0]) == 2:
355 ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)
366 @brief Animates clustering process that is performed by EM algorithm.
368 @param[in] data (list): Dataset that is used for clustering.
369 @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process.
370 @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
371 @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
372 @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
376 figure = plt.figure()
379 return frame_generation(0)
381 def frame_generation(index_iteration):
384 figure.suptitle(
"EM algorithm (iteration: " + str(index_iteration) +
")", fontsize = 18, fontweight =
'bold')
386 clusters = observer.get_evolution_clusters()[index_iteration]
387 covariances = observer.get_evolution_covariances()[index_iteration]
388 means = observer.get_evolution_means()[index_iteration]
390 ema_visualizer.show_clusters(clusters, data, covariances, means, figure,
False)
391 figure.subplots_adjust(top = 0.85)
393 return [ figure.gca() ]
395 iterations = len(observer)
396 cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)
398 if save_movie
is not None:
399 cluster_animation.save(save_movie, writer =
'ffmpeg', fps = movie_fps, bitrate = 1500)
405 def __draw_ellipses(figure, visualizer, clusters, covariances, means):
406 ax = figure.get_axes()[0]
408 for index
in range(len(clusters)):
409 angle, width, height = calculate_ellipse_description(covariances[index])
410 color = visualizer.get_cluster_color(index, 0)
412 ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)
416 def __draw_ellipse(ax, x, y, angle, width, height, color):
417 if (width > 0.0)
and (height > 0.0):
418 ax.plot(x, y, color=color, marker=
'x', markersize=6)
419 ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=
True, zorder=2, color=color)
420 ax.add_patch(ellipse)
426 @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
427 @details The algorithm provides only clustering services (unsupervised learning).
428 Here an example of data clustering process:
430 from pyclustering.cluster.ema import ema, ema_visualizer
431 from pyclustering.utils import read_sample
432 from pyclustering.samples.definitions import FCPS_SAMPLES
434 # Read data from text file.
435 sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
437 # Create EM algorithm to allocated four clusters.
438 ema_instance = ema(sample, 3)
440 # Run clustering process.
441 ema_instance.process()
443 # Get clustering results.
444 clusters = ema_instance.get_clusters()
445 covariances = ema_instance.get_covariances()
446 means = ema_instance.get_centers()
448 # Visualize obtained clustering results.
449 ema_visualizer.show_clusters(clusters, sample, covariances, means)
452 Here is clustering results of the Expectation-Maximization clustering algorithm where popular sample 'OldFaithful' was used.
453 Initial random means and covariances were used in the example. The first step is presented on the left side of the figure and
454 final result (the last step) is on the right side:
455 @image html ema_old_faithful_clustering.png
461 def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100):
463 @brief Initializes Expectation-Maximization algorithm for cluster analysis.
465 @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates.
466 @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
467 @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
468 If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default.
469 @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
470 covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
471 K-Means algorithm with K-Means++ method will be used for initialization by default.
472 @param[in] observer (ema_observer): Observer for gathering information about clustering process.
473 @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and
474 previous log-likelihood estimation is less then 'tolerance' then clustering is over).
475 @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be
476 performed by the algorithm during clustering process.
480 self.
__data = numpy.array(data)
491 if (means
is None)
or (variances
is None):
494 if len(self.
__means) != amount_clusters:
497 self.
__rc = [ [0.0] * len(self.
__data)
for _
in range(amount_clusters) ]
498 self.
__pic = [1.0] * amount_clusters
500 self.
__gaussians = [ []
for _
in range(amount_clusters) ]
506 @brief Run clustering process of the algorithm.
508 @return (ema) Returns itself (EMA instance).
512 previous_likelihood = -200000
513 current_likelihood = -100000
515 current_iteration = 0
516 while(self.
__stop is False)
and (abs(previous_likelihood - current_likelihood) > self.
__tolerance)
and (current_iteration < self.
__iterations):
520 current_iteration += 1
525 previous_likelihood = current_likelihood
535 @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset,
536 for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]].
544 @return (list) Corresponding centers (means) of clusters.
553 @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.
562 @brief Returns 2-dimensional list with belong probability of each object from data to cluster correspondingly,
563 where that first index is for cluster and the second is for point.
566 # Get belong probablities
567 probabilities = ema_instance.get_probabilities();
569 # Show porbability of the fifth element in the first and in the second cluster
571 print("Probability in the first cluster:", probabilities[0][index_point]);
572 print("Probability in the first cluster:", probabilities[1][index_point]);
575 @return (list) 2-dimensional list with belong probability of each object from data to cluster.
582 def __erase_empty_clusters(self):
583 clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []
585 for index_cluster
in range(len(self.
__clusters)):
587 clusters.append(self.
__clusters[index_cluster])
588 means.append(self.
__means[index_cluster])
590 pic.append(self.
__pic[index_cluster])
592 rc.append(self.
__rc[index_cluster])
605 def __extract_clusters(self):
607 for index_point
in range(len(self.
__data)):
610 candidates.append((index_cluster, self.
__rc[index_cluster][index_point]))
612 index_winner = max(candidates, key=
lambda candidate: candidate[1])[0]
613 self.
__clusters[index_winner].append(index_point)
618 def __log_likelihood(self):
621 for index_point
in range(len(self.
__data)):
624 particle += self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point]
627 likelihood += numpy.log(particle)
632 def __probabilities(self, index_cluster, index_point):
637 if (divider != 0.0)
and (divider != float(
'inf')):
638 return self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point] / divider
643 def __expectation_step(self):
650 for index_point
in range(len(self.
__data)):
654 def __maximization_step(self):
659 amount_impossible_clusters = 0
662 mc = numpy.sum(self.
__rc[index_cluster])
665 amount_impossible_clusters += 1
675 def __get_stop_condition(self):
677 if numpy.linalg.norm(covariance) == 0.0:
683 def __update_covariance(self, means, rc, mc):
685 for index_point
in range(len(self.
__data)):
686 deviation = numpy.array([self.
__data[index_point] - means])
687 covariance += rc[index_point] * deviation.T.dot(deviation)
689 covariance = covariance / mc
693 def __update_mean(self, rc, mc):
695 for index_point
in range(len(self.
__data)):
696 mean += rc[index_point] * self.
__data[index_point]
702 def __normalize_probabilities(self):
703 for index_point
in range(len(self.
__data)):
705 for index_cluster
in range(len(self.
__clusters)):
706 probability += self.
__rc[index_cluster][index_point]
708 if abs(probability - 1.0) > 0.000001:
712 def __normalize_probability(self, index_point, probability):
713 if probability == 0.0:
716 normalization = 1.0 / probability
718 for index_cluster
in range(len(self.
__clusters)):
719 self.
__rc[index_cluster][index_point] *= normalization
722 def __verify_arguments(self):
724 @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.
728 raise ValueError(
"Input data is empty (size: '%d')." % len(self.
__data))
731 raise ValueError(
"Amount of clusters (current value '%d') should be greater or equal to 1." %