"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
@details Implementation based on paper @cite article::ema::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy
import random
import warnings

from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans

from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
from enum import IntEnum
try:
    import matplotlib.pyplot as plt
    import matplotlib.animation as animation
    from matplotlib import patches
except Exception as error_instance:
    warnings.warn("Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization "
                  "functionality is not available (details: '%s')." % str(error_instance))

def gaussian(data, mean, covariance):
    """!
    @brief Calculates the Gaussian density for a dataset using the specified mean (mathematical expectation) and
            variance (or covariance matrix in case of multi-dimensional data).

    @param[in] data (list): Data that is used for gaussian calculation.
    @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
    @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.

    @return (list) Value of gaussian function for each point in dataset.

    """
    dimension = float(len(data[0]))

    if dimension != 1.0:
        inv_variance = numpy.linalg.pinv(covariance)
        determinant = numpy.linalg.det(covariance)
    else:
        inv_variance = 1.0 / covariance
        determinant = covariance

    # Normalization constant of the normal density: (2 * pi)^(d / 2) * sqrt(det(Sigma)).
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.abs(determinant))
    if divider != 0.0:
        right_const = 1.0 / divider
    else:
        right_const = float('inf')

    result = []

    for point in data:
        mean_delta = point - mean
        point_gaussian = right_const * numpy.exp(-0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)))
        result.append(point_gaussian)

    return result
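
# A minimal usage sketch for gaussian() (illustrative addition, not from the original module):
# evaluating the density of two points under a two-dimensional standard normal distribution.
#
#   points = numpy.array([[0.0, 0.0], [1.0, 1.0]])
#   values = gaussian(points, numpy.array([0.0, 0.0]), numpy.identity(2))
#   # values[0] is the density at the mean: 1 / (2 * pi), roughly 0.159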

class ema_init_type(IntEnum):
    """!
    @brief Enumeration of initialization types for Expectation-Maximization algorithm.

    """

    ## Means are taken randomly from the input dataset; variances (covariances) are calculated from the clusters captured by those means.
    RANDOM_INITIALIZATION = 0

    ## Means and variances (covariances) are calculated from the clusters produced by K-Means with K-Means++ seeding.
    KMEANS_INITIALIZATION = 1

class ema_initializer():
    """!
    @brief Provides services for preparing initial means and covariances for Expectation-Maximization algorithm.
    @details Initialization strategy is defined by the enumerator 'ema_init_type': random initialization or
              K-Means with K-Means++ initialization. Here is an example of initialization using the K-Means strategy:

    @code
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FAMOUS_SAMPLES
        from pyclustering.cluster.ema import ema_initializer

        sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL)
        amount_clusters = 2

        initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize()
        print(initial_means)
        print(initial_covariance)
    @endcode

    """

    __MAX_GENERATION_ATTEMPTS = 10

    def __init__(self, sample, amount):
        """!
        @brief Constructs EM initializer.

        @param[in] sample (list): Data that will be used by the EM algorithm.
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.

        """
        self.__sample = sample
        self.__amount = amount

    def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION):
        """!
        @brief Calculates initial parameters for EM algorithm: means and covariances using the specified strategy.

        @param[in] init_type (ema_init_type): Strategy for initialization.

        @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case of multi-dimensional data).

        """
        if init_type == ema_init_type.KMEANS_INITIALIZATION:
            return self.__initialize_kmeans()

        elif init_type == ema_init_type.RANDOM_INITIALIZATION:
            return self.__initialize_random()

        raise NameError("Unknown type of EM algorithm initialization is specified.")

    def __calculate_initial_clusters(self, centers):
        """!
        @brief Calculates the Euclidean distance from each point to each center and captures every point
                by its nearest cluster, producing updated clusters.

        @return (list) Updated clusters as a list of clusters, where each cluster contains indexes of objects from the data.

        """
        clusters = [[] for _ in range(len(centers))]
        for index_point in range(len(self.__sample)):
            index_optim, dist_optim = -1, 0.0

            for index in range(len(centers)):
                dist = euclidean_distance_square(self.__sample[index_point], centers[index])

                if (dist < dist_optim) or (index == 0):
                    index_optim, dist_optim = index, dist

            clusters[index_optim].append(index_point)

        return clusters

    def __calculate_initial_covariances(self, initial_clusters):
        covariances = []
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar=False))
            else:
                # A cluster with a single point has no sample covariance - use a small random constant matrix instead.
                dimension = len(self.__sample[0])
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return covariances

    def __initialize_random(self):
        initial_means = []

        for _ in range(self.__amount):
            mean = self.__sample[random.randint(0, len(self.__sample) - 1)]
            attempts = 0
            while (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
                mean = self.__sample[random.randint(0, len(self.__sample) - 1)]
                attempts += 1

            if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
                # No unique point was found - jitter the last candidate to avoid duplicated means.
                mean = [value + (random.random() - 0.5) * value * 0.2 for value in mean]

            initial_means.append(mean)

        initial_clusters = self.__calculate_initial_clusters(initial_means)
        initial_covariance = self.__calculate_initial_covariances(initial_clusters)

        return initial_means, initial_covariance

    def __initialize_kmeans(self):
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore=True)
        kmeans_instance.process()

        means = kmeans_instance.get_centers()

        covariances = []
        initial_clusters = kmeans_instance.get_clusters()
        for initial_cluster in initial_clusters:
            if len(initial_cluster) > 1:
                cluster_sample = [self.__sample[index_point] for index_point in initial_cluster]
                covariances.append(numpy.cov(cluster_sample, rowvar=False))
            else:
                dimension = len(self.__sample[0])
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

        return means, covariances
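
# A short usage sketch (illustrative addition, not from the original module): the random strategy
# from 'ema_init_type' can be requested explicitly instead of the default K-Means strategy.
#
#   initializer = ema_initializer(sample, 2)
#   initial_means, initial_covariances = initializer.initialize(ema_init_type.RANDOM_INITIALIZATION)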

class ema_observer:
    """!
    @brief Observer of EM algorithm for collecting algorithm state on each step.
    @details It can be used to obtain the whole picture of the clustering process of the EM algorithm. Allocated clusters,
              means and covariances are stored in the observer on each step. Here is an example of usage:

    @code
        from pyclustering.cluster.ema import ema, ema_observer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import SIMPLE_SAMPLES

        # Read data from text file.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        # Create EM observer.
        observer = ema_observer()

        # Create EM algorithm to allocate four clusters and pass the observer to it.
        ema_instance = ema(sample, 4, observer=observer)

        # Run clustering process.
        ema_instance.process()

        # Print amount of steps that were done by the algorithm.
        print("EMA steps:", observer.get_iterations())

        # Print evolution of means and covariances.
        print("Means evolution:", observer.get_evolution_means())
        print("Covariances evolution:", observer.get_evolution_covariances())

        # Print evolution of clusters.
        print("Clusters evolution:", observer.get_evolution_clusters())

        # Print final clusters.
        print("Allocated clusters:", observer.get_evolution_clusters()[-1])
    @endcode

    """

    def __init__(self):
        """!
        @brief Initializes EM observer.

        """
        self.__means_evolution = []
        self.__covariances_evolution = []
        self.__clusters_evolution = []

    def __len__(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)

    def get_iterations(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.

        """
        return len(self.__means_evolution)

    def get_evolution_means(self):
        """!
        @return (list) Mean of each cluster on each step of clustering.

        """
        return self.__means_evolution

    def get_evolution_covariances(self):
        """!
        @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.

        """
        return self.__covariances_evolution

    def get_evolution_clusters(self):
        """!
        @return (list) Allocated clusters on each step of clustering.

        """
        return self.__clusters_evolution

    def notify(self, means, covariances, clusters):
        """!
        @brief This method is used by the algorithm to notify the observer about changes: on each step the algorithm
                provides new values of means, covariances and allocated clusters.

        @param[in] means (list): Mean of each cluster on the current step.
        @param[in] covariances (list): Covariances of each cluster on the current step.
        @param[in] clusters (list): Allocated clusters on the current step.

        """
        self.__means_evolution.append(means)
        self.__covariances_evolution.append(covariances)
        self.__clusters_evolution.append(clusters)


class ema_visualizer:
    """!
    @brief Visualizer of EM algorithm's results.
    @details Provides services for visualization of particular features of the algorithm, for example,
              in case of two-dimensional dataset it shows covariance ellipses.

    """

    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure=None, display=True):
        """!
        @brief Draws clusters and, in case of two-dimensional dataset, their covariance ellipses.

        @param[in] clusters (list): Clusters that were allocated by the algorithm.
        @param[in] sample (list): Dataset that was used for clustering.
        @param[in] covariances (list): Covariances of the clusters.
        @param[in] means (list): Means of the clusters.
        @param[in] figure (figure): If 'None' then a new figure is created, otherwise the specified figure is used
                    for visualization.
        @param[in] display (bool): If 'True' then the figure will be shown by the method, otherwise it should be
                    shown manually using matplotlib function 'plt.show()'.

        @return (figure) Figure where clusters were drawn.

        """
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)

        if figure is None:
            figure = visualizer.show(display=False)
        else:
            visualizer.show(figure=figure, display=False)

        if len(sample[0]) == 2:
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)

        if display is True:
            plt.show()

        return figure

    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None):
        """!
        @brief Animates clustering process that is performed by EM algorithm.

        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (ema_observer): EM observer that was used for collecting information about the clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
        @param[in] save_movie (string): If it is specified then the animation will be stored to the file whose name is given by this parameter.

        """
        figure = plt.figure()

        def init_frame():
            return frame_generation(0)

        def frame_generation(index_iteration):
            figure.clf()

            figure.suptitle("EM algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')

            clusters = observer.get_evolution_clusters()[index_iteration]
            covariances = observer.get_evolution_covariances()[index_iteration]
            means = observer.get_evolution_means()[index_iteration]

            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False)
            figure.subplots_adjust(top=0.85)

            return [figure.gca()]

        iterations = len(observer)
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity, init_func=init_frame, repeat_delay=5000)

        if save_movie is not None:
            cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=1500)
        else:
            plt.show()

    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        ax = figure.get_axes()[0]

        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index])
            color = visualizer.get_cluster_color(index, 0)

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)

    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        if (width > 0.0) and (height > 0.0):
            ax.plot(x, y, color=color, marker='x', markersize=6)
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=True, zorder=2, color=color)
            ax.add_patch(ellipse)
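
# A short usage sketch (illustrative addition, not from the original module): animating a clustering
# run that was recorded by an 'ema_observer' instance; 'sample' is assumed to be a loaded 2-D dataset.
#
#   observer = ema_observer()
#   ema(sample, 2, observer=observer).process()
#   ema_visualizer.animate_cluster_allocation(sample, observer)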

class ema:
    """!
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
    @details The algorithm provides only clustering services (unsupervised learning).
              Here is an example of the data clustering process:

    @code
        from pyclustering.cluster.ema import ema, ema_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Read data from text file.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

        # Create EM algorithm to allocate three clusters.
        ema_instance = ema(sample, 3)

        # Run clustering process.
        ema_instance.process()

        # Get clustering results.
        clusters = ema_instance.get_clusters()
        covariances = ema_instance.get_covariances()
        means = ema_instance.get_centers()

        # Visualize obtained clustering results.
        ema_visualizer.show_clusters(clusters, sample, covariances, means)
    @endcode

    Here are the clustering results of the Expectation-Maximization algorithm for the popular sample 'OldFaithful',
    where random initial means and covariances were used. The first step is presented on the left side of the figure and
    the final result (the last step) is on the right side:
    @image html ema_old_faithful_clustering.png

    """

    def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100):
        """!
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.

        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by a list of coordinates.
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
        @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
                    If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
                    covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
                    K-Means algorithm with K-Means++ method will be used for initialization by default.
        @param[in] observer (ema_observer): Observer for gathering information about the clustering process.
        @param[in] tolerance (float): Defines stop condition of the algorithm (when the difference between the current and
                    previous log-likelihood estimations is less than 'tolerance', clustering is over).
        @param[in] iterations (uint): Additional stop condition parameter that defines the maximum number of steps that can be
                    performed by the algorithm during the clustering process.

        """
        self.__data = numpy.array(data)
        self.__amount_clusters = amount_clusters
        self.__tolerance = tolerance
        self.__iterations = iterations
        self.__observer = observer

        self.__verify_arguments()

        self.__means = means
        self.__variances = variances

        if (means is None) or (variances is None):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION)

            if len(self.__means) != amount_clusters:
                self.__amount_clusters = len(self.__means)

        self.__rc = [[0.0] * len(self.__data) for _ in range(amount_clusters)]
        self.__pic = [1.0] * amount_clusters
        self.__clusters = []
        self.__gaussians = [[] for _ in range(amount_clusters)]
        self.__stop = False

    def process(self):
        """!
        @brief Runs the clustering process of the algorithm.

        @return (ema) Returns itself (EMA instance).

        """
        previous_likelihood = -200000
        current_likelihood = -100000

        current_iteration = 0
        while (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations):
            self.__expectation_step()
            self.__maximization_step()

            current_iteration += 1

            self.__extract_clusters()
            if self.__observer is not None:
                self.__observer.notify(self.__means, self.__variances, self.__clusters)

            previous_likelihood = current_likelihood
            current_likelihood = self.__log_likelihood()
            self.__stop = self.__get_stop_condition()

        self.__normalize_probabilities()
        return self

    def get_clusters(self):
        """!
        @return (list) Allocated clusters where each cluster is represented by a list of indexes of points from the dataset;
                for example, two clusters may have the following representation: [[0, 1, 4], [2, 3, 5, 6]].

        """
        return self.__clusters

    def get_centers(self):
        """!
        @return (list) Corresponding centers (means) of clusters.

        """
        return self.__means

    def get_covariances(self):
        """!
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.

        """
        return self.__variances

    def get_probabilities(self):
        """!
        @brief Returns a 2-dimensional list with the belong probability of each object from the data to each cluster,
                where the first index is for the cluster and the second is for the point.

        @code
            # Get belong probabilities
            probabilities = ema_instance.get_probabilities()

            # Show probability of the fifth element in the first and in the second cluster
            index_point = 5
            print("Probability in the first cluster:", probabilities[0][index_point])
            print("Probability in the second cluster:", probabilities[1][index_point])
        @endcode

        @return (list) 2-dimensional list with the belong probability of each object from the data to each cluster.

        """
        return self.__rc

    def __erase_empty_clusters(self):
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []

        for index_cluster in range(len(self.__clusters)):
            if len(self.__clusters[index_cluster]) > 0:
                clusters.append(self.__clusters[index_cluster])
                means.append(self.__means[index_cluster])
                variances.append(self.__variances[index_cluster])
                pic.append(self.__pic[index_cluster])
                gaussians.append(self.__gaussians[index_cluster])
                rc.append(self.__rc[index_cluster])

        if len(self.__clusters) != len(clusters):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic
            self.__gaussians, self.__rc = gaussians, rc
            self.__amount_clusters = len(self.__clusters)

    def __extract_clusters(self):
        self.__clusters = [[] for _ in range(self.__amount_clusters)]
        for index_point in range(len(self.__data)):
            candidates = []
            for index_cluster in range(self.__amount_clusters):
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]))

            # The point is captured by the cluster with the highest responsibility.
            index_winner = max(candidates, key=lambda candidate: candidate[1])[0]
            self.__clusters[index_winner].append(index_point)

        self.__erase_empty_clusters()

    def __log_likelihood(self):
        likelihood = 0.0

        for index_point in range(len(self.__data)):
            particle = 0.0
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point]

            likelihood += numpy.log(particle)

        return likelihood
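
    # For reference (standard GMM notation, an editorial addition): the value computed above is
    #
    #   L = sum_i log( sum_c pi_c * N(x_i | mu_c, Sigma_c) )
    #
    # i.e. the log-likelihood of the dataset under the current mixture parameters.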

    def __probabilities(self, index_cluster, index_point):
        divider = 0.0
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point]

        if (divider != 0.0) and (divider != float('inf')):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider

        return 1.0
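
    # For reference (standard GMM notation, an editorial addition): __probabilities() returns the
    # E-step responsibility
    #
    #   r_ic = pi_c * N(x_i | mu_c, Sigma_c) / sum_k( pi_k * N(x_i | mu_k, Sigma_k) )
    #
    # where pi_c is the cluster weight ('__pic') and N(...) is the Gaussian density ('__gaussians').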

    def __expectation_step(self):
        self.__gaussians = [[] for _ in range(self.__amount_clusters)]
        for index in range(self.__amount_clusters):
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index])

        self.__rc = [[0.0] * len(self.__data) for _ in range(self.__amount_clusters)]
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point)

    def __maximization_step(self):
        self.__pic = []
        self.__means = []
        self.__variances = []

        amount_impossible_clusters = 0

        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster])

            if mc == 0.0:
                # No point is assigned to the cluster - it cannot be updated and is dropped.
                amount_impossible_clusters += 1
                continue

            self.__pic.append(mc / len(self.__data))
            self.__means.append(self.__update_mean(self.__rc[index_cluster], mc))
            self.__variances.append(self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc))

        self.__amount_clusters -= amount_impossible_clusters
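
    # For reference (standard GMM notation, an editorial addition): with m_c = sum_i r_ic,
    # the M-step updates implemented above are
    #
    #   pi_c    = m_c / N
    #   mu_c    = (1 / m_c) * sum_i( r_ic * x_i )
    #   Sigma_c = (1 / m_c) * sum_i( r_ic * (x_i - mu_c) * (x_i - mu_c)^T )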

    def __get_stop_condition(self):
        for covariance in self.__variances:
            if numpy.linalg.norm(covariance) == 0.0:
                return True

        return False

    def __update_covariance(self, means, rc, mc):
        covariance = 0.0
        for index_point in range(len(self.__data)):
            deviation = numpy.array([self.__data[index_point] - means])
            covariance += rc[index_point] * deviation.T.dot(deviation)

        covariance = covariance / mc
        return covariance

    def __update_mean(self, rc, mc):
        mean = 0.0
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point]

        mean = mean / mc
        return mean

    def __normalize_probabilities(self):
        for index_point in range(len(self.__data)):
            probability = 0.0
            for index_cluster in range(len(self.__clusters)):
                probability += self.__rc[index_cluster][index_point]

            if abs(probability - 1.0) > 0.000001:
                self.__normalize_probability(index_point, probability)

    def __normalize_probability(self, index_point, probability):
        if probability == 0.0:
            return

        normalization = 1.0 / probability

        for index_cluster in range(len(self.__clusters)):
            self.__rc[index_cluster][index_point] *= normalization

    def __verify_arguments(self):
        """!
        @brief Verifies input parameters of the algorithm and throws an exception in case of incorrectness.

        """
        if len(self.__data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self.__data))

        if self.__amount_clusters < 1:
            raise ValueError("Amount of clusters (current value '%d') should be greater or equal to 1." %
                             self.__amount_clusters)