3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model. 4 @details Implementation based on paper @cite article::ema::1. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 36 from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
38 from enum
import IntEnum
41 import matplotlib.pyplot
as plt
42 import matplotlib.animation
as animation
43 from matplotlib
import patches
44 except Exception
as error_instance:
45 warnings.warn(
"Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization " 46 "functionality is not available (details: '%s')." % str(error_instance))
50 @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covariance in case 51 multi-dimensional data. 53 @param[in] data (list): Data that is used for gaussian calculation. 54 @param[in] mean (float|numpy.array): Mathematical expectation used for calculation. 55 @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation. 57 @return (list) Value of gaussian function for each point in dataset. 60 dimension = float(len(data[0]))
63 inv_variance = numpy.linalg.pinv(covariance)
65 inv_variance = 1.0 / covariance
67 divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))
69 right_const = 1.0 / divider
71 right_const = float(
'inf')
76 mean_delta = point - mean
77 point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) )
78 result.append(point_gaussian)
86 @brief Enumeration of initialization types for Expectation-Maximization algorithm. 92 RANDOM_INITIALIZATION = 0
98 KMEANS_INITIALIZATION = 1
104 @brief Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm. 105 @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and 106 kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy: 109 from pyclustering.utils import read_sample; 110 from pyclustering.samples.definitions import FAMOUS_SAMPLES; 111 from pyclustering.cluster.ema import ema_initializer; 113 sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL); 116 initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize(initializer); 117 print(initial_means); 118 print(initial_covariance); 123 __MAX_GENERATION_ATTEMPTS = 10
127 @brief Constructs EM initializer. 129 @param[in] sample (list): Data that will be used by the EM algorithm. 130 @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm. 137 def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
139 @brief Calculates initial parameters for EM algorithm: means and covariances using 142 @param[in] init_type (ema_init_type): Strategy for initialization. 144 @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data). 147 if init_type == ema_init_type.KMEANS_INITIALIZATION:
150 elif init_type == ema_init_type.RANDOM_INITIALIZATION:
153 raise NameError(
"Unknown type of EM algorithm initialization is specified.")
156 def __calculate_initial_clusters(self, centers):
158 @brief Calculate Euclidean distance to each point from the each cluster. 159 @brief Nearest points are captured by according clusters and as a result clusters are updated. 161 @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data. 165 clusters = [[]
for _
in range(len(centers))]
166 for index_point
in range(len(self.
__sample)):
167 index_optim, dist_optim = -1, 0.0
169 for index
in range(len(centers)):
170 dist = euclidean_distance_square(self.
__sample[index_point], centers[index])
172 if (dist < dist_optim)
or (index
is 0):
173 index_optim, dist_optim = index, dist
175 clusters[index_optim].append(index_point)
180 def __calculate_initial_covariances(self, initial_clusters):
182 for initial_cluster
in initial_clusters:
183 if len(initial_cluster) > 1:
184 cluster_sample = [ self.
__sample[index_point]
for index_point
in initial_cluster ]
185 covariances.append(numpy.cov(cluster_sample, rowvar =
False))
188 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
193 def __initialize_random(self):
199 while (mean
in initial_means)
and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
203 if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
204 mean = [ value + (random.random() - 0.5) * value * 0.2
for value
in mean ]
206 initial_means.append(mean)
211 return initial_means, initial_covariance
214 def __initialize_kmeans(self):
216 kmeans_instance =
kmeans(self.
__sample, initial_centers, ccore =
True)
217 kmeans_instance.process()
219 means = kmeans_instance.get_centers()
222 initial_clusters = kmeans_instance.get_clusters()
223 for initial_cluster
in initial_clusters:
224 if len(initial_cluster) > 1:
225 cluster_sample = [ self.
__sample[index_point]
for index_point
in initial_cluster ]
226 covariances.append(numpy.cov(cluster_sample, rowvar =
False))
229 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
231 return means, covariances
237 @brief Observer of EM algorithm for collecting algorithm state on each step. 238 @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters, 239 means and covariances are stored in observer on each step. Here an example of usage: 242 from pyclustering.cluster.ema import ema, ema_observer; 243 from pyclustering.utils import read_sample; 244 from pyclustering.samples.definitions import SIMPLE_SAMPLES; 246 # Read data from text file 247 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3); 250 observer = ema_observer(); 252 # Create EM algorithm to allocated four clusters and pass observer to it 253 ema_instance = ema(sample, 4, observer=observer); 255 # Run clustering process 256 ema_instance.process(); 258 # Print amount of steps that were done by the algorithm 259 print("EMA steps:", observer.get_iterations()); 261 # Print evolution of means and covariances 262 print("Means evolution:", observer.get_evolution_means()); 263 print("Covariances evolution:", observer.get_evolution_covariances()); 265 # Print evolution of clusters 266 print("Clusters evolution:", observer.get_evolution_clusters()); 268 # Print final clusters 269 print("Allocated clusters:", observer.get_evolution_clusters()[-1]); 275 @brief Initializes EM observer. 285 @return (uint) Amount of iterations that were done by the EM algorithm. 293 @return (uint) Amount of iterations that were done by the EM algorithm. 301 @return (list) Mean of each cluster on each step of clustering. 309 @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering. 317 @return (list) Allocated clusters on each step of clustering. 323 def notify(self, means, covariances, clusters):
325 @brief This method is used by the algorithm to notify observer about changes where the algorithm 326 should provide new values: means, covariances and allocated clusters. 328 @param[in] means (list): Mean of each cluster on currect step. 329 @param[in] covariances (list): Covariances of each cluster on current step. 330 @param[in] clusters (list): Allocated cluster on current step. 341 @brief Visualizer of EM algorithm's results. 342 @details Provides services for visualization of particular features of the algorithm, for example, 343 in case of two-dimensional dataset it shows covariance ellipses. 348 def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
350 @brief Draws clusters and in case of two-dimensional dataset draws their ellipses. 352 @param[in] clusters (list): Clusters that were allocated by the algorithm. 353 @param[in] sample (list): Dataset that were used for clustering. 354 @param[in] covariances (list): Covariances of the clusters. 355 @param[in] means (list): Means of the clusters. 356 @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used 358 @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be 359 shown manually using matplotlib function 'plt.show()'. 361 @return (figure) Figure where clusters were drawn. 366 visualizer.append_clusters(clusters, sample)
369 figure = visualizer.show(display =
False)
371 visualizer.show(figure = figure, display =
False)
373 if len(sample[0]) == 2:
374 ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)
385 @brief Animates clustering process that is performed by EM algorithm. 387 @param[in] data (list): Dataset that is used for clustering. 388 @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process. 389 @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only). 390 @param[in] movie_fps (uint): Defines frames per second (for rendering movie only). 391 @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter. 395 figure = plt.figure()
398 return frame_generation(0)
400 def frame_generation(index_iteration):
403 figure.suptitle(
"EM algorithm (iteration: " + str(index_iteration) +
")", fontsize = 18, fontweight =
'bold')
405 clusters = observer.get_evolution_clusters()[index_iteration]
406 covariances = observer.get_evolution_covariances()[index_iteration]
407 means = observer.get_evolution_means()[index_iteration]
409 ema_visualizer.show_clusters(clusters, data, covariances, means, figure,
False)
410 figure.subplots_adjust(top = 0.85)
412 return [ figure.gca() ]
414 iterations = len(observer)
415 cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)
417 if save_movie
is not None:
418 cluster_animation.save(save_movie, writer =
'ffmpeg', fps = movie_fps, bitrate = 1500)
424 def __draw_ellipses(figure, visualizer, clusters, covariances, means):
425 ax = figure.get_axes()[0]
427 for index
in range(len(clusters)):
428 angle, width, height = calculate_ellipse_description(covariances[index])
429 color = visualizer.get_cluster_color(index, 0)
431 ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)
435 def __draw_ellipse(ax, x, y, angle, width, height, color):
436 if (width > 0.0)
and (height > 0.0):
437 ax.plot(x, y, color=color, marker=
'x', markersize=6)
438 ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=
True, zorder=2, color=color)
439 ax.add_patch(ellipse)
445 @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM). 446 @details The algorithm provides only clustering services (unsupervised learning). 447 Here an example of data clustering process: 449 # Read dataset from text file 450 sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL); 452 # Amount of cluster that should be allocated 455 # Prepare initial means and covariances using K-Means initializer 456 initializer = ema_init_type.KMEANS_INITIALIZATION; 457 initial_means, initial_covariance = ema_initializer(sample, amount).initialize(initializer); 459 # Lets create observer to see clustering process 460 observer = ema_observer(); 462 # Create instance of the EM algorithm 463 ema_instance = ema(sample, amount, initial_means, initial_covariance, observer=observer); 465 # Run clustering process 466 ema_instance.process(); 469 clusters = ema_instance.get_clusters(); 470 print("Obtained clusters:", clusters); 472 # Display allocated clusters using visualizer 473 covariances = ema_instance.get_covariances(); 474 means = ema_instance.get_centers(); 475 ema_visualizer.show_clusters(clusters, sample, covariances, means); 477 # Show animation process 478 ema_visualizer.animate_cluster_allocation(sample, observer); 482 Here is clustering results of the Expectation-Maximization clustering algorithm where popular sample 'OldFaithful' was used. 483 Initial random means and covariances were used in the example. The first step is presented on the left side of the figure and 484 final result (the last step) is on the right side: 485 @image html ema_old_faithful_clustering.png 491 def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
493 @brief Initializes Expectation-Maximization algorithm for cluster analysis. 495 @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates. 496 @param[in] amount_clusters (uint): Amount of clusters that should be allocated. 497 @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation). 498 If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default. 499 @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of 500 covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then 501 K-Means algorithm with K-Means++ method will be used for initialization by default. 502 @param[in] observer (ema_observer): Observer for gathering information about clustering process. 503 @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and 504 previous log-likelihood estimation is less then 'tolerance' then clustering is over). 505 @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be 506 performed by the algorithm during clustering process. 510 self.
__data = numpy.array(data)
519 if (means
is None)
or (variances
is None):
522 if len(self.
__means) != amount_clusters:
525 self.
__rc = [ [0.0] * len(self.
__data)
for _
in range(amount_clusters) ]
526 self.
__pic = [1.0] * amount_clusters
528 self.
__gaussians = [ []
for _
in range(amount_clusters) ]
534 @brief Run clustering process of the algorithm. 535 @details This method should be called before call 'get_clusters()'. 539 previous_likelihood = -200000
540 current_likelihood = -100000
542 current_iteration = 0
543 while(self.
__stop is False)
and (abs(previous_likelihood - current_likelihood) > self.
__tolerance)
and (current_iteration < self.
__iterations):
547 current_iteration += 1
552 previous_likelihood = current_likelihood
561 @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset, 562 for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]]. 570 @return (list) Corresponding centers (means) of clusters. 579 @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters. 588 @brief Returns 2-dimensional list with belong probability of each object from data to cluster correspondingly, 589 where that first index is for cluster and the second is for point. 592 # Get belong probablities 593 probabilities = ema_instance.get_probabilities(); 595 # Show porbability of the fifth element in the first and in the second cluster 597 print("Probability in the first cluster:", probabilities[0][index_point]); 598 print("Probability in the first cluster:", probabilities[1][index_point]); 601 @return (list) 2-dimensional list with belong probability of each object from data to cluster. 608 def __erase_empty_clusters(self):
609 clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []
611 for index_cluster
in range(len(self.
__clusters)):
613 clusters.append(self.
__clusters[index_cluster])
614 means.append(self.
__means[index_cluster])
616 pic.append(self.
__pic[index_cluster])
618 rc.append(self.
__rc[index_cluster])
631 def __extract_clusters(self):
633 for index_point
in range(len(self.
__data)):
636 candidates.append((index_cluster, self.
__rc[index_cluster][index_point]))
638 index_winner = max(candidates, key =
lambda candidate : candidate[1])[0]
639 self.
__clusters[index_winner].append(index_point)
644 def __log_likelihood(self):
647 for index_point
in range(len(self.
__data)):
650 particle += self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point]
653 likelihood += numpy.log(particle)
658 def __probabilities(self, index_cluster, index_point):
663 if (divider != 0.0)
and (divider != float(
'inf')):
664 return self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point] / divider
669 def __expectation_step(self):
676 for index_point
in range(len(self.
__data)):
680 def __maximization_step(self):
685 amount_impossible_clusters = 0
688 mc = numpy.sum(self.
__rc[index_cluster])
691 amount_impossible_clusters += 1
701 def __get_stop_condition(self):
703 if numpy.linalg.norm(covariance) == 0.0:
709 def __update_covariance(self, means, rc, mc):
711 for index_point
in range(len(self.
__data)):
712 deviation = numpy.array( [ self.
__data[index_point] - means ])
713 covariance += rc[index_point] * deviation.T.dot(deviation)
715 covariance = covariance / mc
719 def __update_mean(self, rc, mc):
721 for index_point
in range(len(self.
__data)):
722 mean += rc[index_point] * self.
__data[index_point]
728 def __normalize_probabilities(self):
729 for index_point
in range(len(self.
__data)):
731 for index_cluster
in range(len(self.
__clusters)):
732 probability += self.
__rc[index_cluster][index_point]
734 if abs(probability - 1.0) > 0.000001:
738 def __normalize_probability(self, index_point, probability):
739 if probability == 0.0:
742 normalization = 1.0 / probability
744 for index_cluster
in range(len(self.
__clusters)):
745 self.
__rc[index_cluster][index_point] *= normalization
Common visualizer of clusters on 1D, 2D or 3D surface.
pyclustering module for cluster analysis.
def __init__(self, sample, amount)
Constructs EM initializer.
Cluster analysis algorithm: K-Means.
def __update_covariance(self, means, rc, mc)
def __probabilities(self, index_cluster, index_point)
def __initialize_kmeans(self)
Utils that are used by modules of pyclustering.
def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None)
Animates clustering process that is performed by EM algorithm.
def __extract_clusters(self)
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def get_evolution_means(self)
Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
def get_covariances(self)
def __get_stop_condition(self)
def gaussian(data, mean, covariance)
Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covar...
def notify(self, means, covariances, clusters)
This method is used by the algorithm to notify observer about changes where the algorithm should prov...
def __expectation_step(self)
Observer of EM algorithm for collecting algorithm state on each step.
Class represents K-Means clustering algorithm.
def __calculate_initial_clusters(self, centers)
Calculate Euclidean distance to each point from the each cluster.
def get_evolution_covariances(self)
def get_evolution_clusters(self)
def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION)
Calculates initial parameters for EM algorithm: means and covariances using specified strategy...
def __init__(self)
Initializes EM observer.
def show_clusters(clusters, sample, covariances, means, figure=None, display=True)
Draws clusters and in case of two-dimensional dataset draws their ellipses.
def __normalize_probabilities(self)
Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm...
def __update_mean(self, rc, mc)
def __maximization_step(self)
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def process(self)
Run clustering process of the algorithm.
def get_probabilities(self)
Returns 2-dimensional list with belong probability of each object from data to cluster correspondingl...
def __initialize_random(self)
Visualizer of EM algorithm's results.
def __calculate_initial_covariances(self, initial_clusters)
Enumeration of initialization types for Expectation-Maximization algorithm.
def __log_likelihood(self)
def __normalize_probability(self, index_point, probability)
def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100)
Initializes Expectation-Maximization algorithm for cluster analysis.
def __erase_empty_clusters(self)