3 @brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model. 4 @details Implementation based on paper @cite article::ema::1. 6 @authors Andrei Novikov (pyclustering@yandex.ru) 8 @copyright GNU Public License 10 @cond GNU_PUBLIC_LICENSE 11 PyClustering is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 16 PyClustering is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 21 You should have received a copy of the GNU General Public License 22 along with this program. If not, see <http://www.gnu.org/licenses/>. 36 from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_square
38 from enum
import IntEnum
41 import matplotlib.pyplot
as plt
42 import matplotlib.animation
as animation
43 from matplotlib
import patches
44 except Exception
as error_instance:
45 warnings.warn(
"Impossible to import matplotlib (please, install 'matplotlib'), pyclustering's visualization " 46 "functionality is not available (details: '%s')." % str(error_instance))
50 @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covariance in case 51 multi-dimensional data. 53 @param[in] data (list): Data that is used for gaussian calculation. 54 @param[in] mean (float|numpy.array): Mathematical expectation used for calculation. 55 @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation. 57 @return (list) Value of gaussian function for each point in dataset. 60 dimension = float(len(data[0]))
63 inv_variance = numpy.linalg.pinv(covariance)
65 inv_variance = 1.0 / covariance
67 divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance))
69 right_const = 1.0 / divider
71 right_const = float(
'inf')
76 mean_delta = point - mean
77 point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) )
78 result.append(point_gaussian)
86 @brief Enumeration of initialization types for Expectation-Maximization algorithm. 92 RANDOM_INITIALIZATION = 0
98 KMEANS_INITIALIZATION = 1
104 @brief Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm. 105 @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and 106 kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy: 109 from pyclustering.utils import read_sample 110 from pyclustering.samples.definitions import FAMOUS_SAMPLES 111 from pyclustering.cluster.ema import ema_initializer 113 sample = read_sample(FAMOUS_SAMPLES.SAMPLE_OLD_FAITHFUL) 116 initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize() 118 print(initial_covariance) 123 __MAX_GENERATION_ATTEMPTS = 10
127 @brief Constructs EM initializer. 129 @param[in] sample (list): Data that will be used by the EM algorithm. 130 @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm. 137 def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
139 @brief Calculates initial parameters for EM algorithm: means and covariances using 142 @param[in] init_type (ema_init_type): Strategy for initialization. 144 @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data). 147 if init_type == ema_init_type.KMEANS_INITIALIZATION:
150 elif init_type == ema_init_type.RANDOM_INITIALIZATION:
153 raise NameError(
"Unknown type of EM algorithm initialization is specified.")
156 def __calculate_initial_clusters(self, centers):
158 @brief Calculate Euclidean distance to each point from the each cluster. 159 @brief Nearest points are captured by according clusters and as a result clusters are updated. 161 @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data. 165 clusters = [[]
for _
in range(len(centers))]
166 for index_point
in range(len(self.
__sample)):
167 index_optim, dist_optim = -1, 0.0
169 for index
in range(len(centers)):
170 dist = euclidean_distance_square(self.
__sample[index_point], centers[index])
172 if (dist < dist_optim)
or (index
is 0):
173 index_optim, dist_optim = index, dist
175 clusters[index_optim].append(index_point)
180 def __calculate_initial_covariances(self, initial_clusters):
182 for initial_cluster
in initial_clusters:
183 if len(initial_cluster) > 1:
184 cluster_sample = [ self.
__sample[index_point]
for index_point
in initial_cluster ]
185 covariances.append(numpy.cov(cluster_sample, rowvar =
False))
188 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
193 def __initialize_random(self):
199 while (mean
in initial_means)
and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS):
203 if attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS:
204 mean = [ value + (random.random() - 0.5) * value * 0.2
for value
in mean ]
206 initial_means.append(mean)
211 return initial_means, initial_covariance
214 def __initialize_kmeans(self):
216 kmeans_instance =
kmeans(self.
__sample, initial_centers, ccore =
True)
217 kmeans_instance.process()
219 means = kmeans_instance.get_centers()
222 initial_clusters = kmeans_instance.get_clusters()
223 for initial_cluster
in initial_clusters:
224 if len(initial_cluster) > 1:
225 cluster_sample = [ self.
__sample[index_point]
for index_point
in initial_cluster ]
226 covariances.append(numpy.cov(cluster_sample, rowvar =
False))
229 covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
231 return means, covariances
237 @brief Observer of EM algorithm for collecting algorithm state on each step. 238 @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters, 239 means and covariances are stored in observer on each step. Here an example of usage: 242 from pyclustering.cluster.ema import ema, ema_observer 243 from pyclustering.utils import read_sample 244 from pyclustering.samples.definitions import SIMPLE_SAMPLES 246 # Read data from text file. 247 sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3) 249 # Create EM observer. 250 observer = ema_observer() 252 # Create EM algorithm to allocated four clusters and pass observer to it. 253 ema_instance = ema(sample, 4, observer=observer) 255 # Run clustering process. 256 ema_instance.process() 258 # Print amount of steps that were done by the algorithm. 259 print("EMA steps:", observer.get_iterations()) 261 # Print evolution of means and covariances. 262 print("Means evolution:", observer.get_evolution_means()) 263 print("Covariances evolution:", observer.get_evolution_covariances()) 265 # Print evolution of clusters. 266 print("Clusters evolution:", observer.get_evolution_clusters()) 268 # Print final clusters. 269 print("Allocated clusters:", observer.get_evolution_clusters()[-1]) 275 @brief Initializes EM observer. 285 @return (uint) Amount of iterations that were done by the EM algorithm. 293 @return (uint) Amount of iterations that were done by the EM algorithm. 301 @return (list) Mean of each cluster on each step of clustering. 309 @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering. 317 @return (list) Allocated clusters on each step of clustering. 323 def notify(self, means, covariances, clusters):
325 @brief This method is used by the algorithm to notify observer about changes where the algorithm 326 should provide new values: means, covariances and allocated clusters. 328 @param[in] means (list): Mean of each cluster on currect step. 329 @param[in] covariances (list): Covariances of each cluster on current step. 330 @param[in] clusters (list): Allocated cluster on current step. 341 @brief Visualizer of EM algorithm's results. 342 @details Provides services for visualization of particular features of the algorithm, for example, 343 in case of two-dimensional dataset it shows covariance ellipses. 348 def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
350 @brief Draws clusters and in case of two-dimensional dataset draws their ellipses. 352 @param[in] clusters (list): Clusters that were allocated by the algorithm. 353 @param[in] sample (list): Dataset that were used for clustering. 354 @param[in] covariances (list): Covariances of the clusters. 355 @param[in] means (list): Means of the clusters. 356 @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used 358 @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be 359 shown manually using matplotlib function 'plt.show()'. 361 @return (figure) Figure where clusters were drawn. 366 visualizer.append_clusters(clusters, sample)
369 figure = visualizer.show(display =
False)
371 visualizer.show(figure = figure, display =
False)
373 if len(sample[0]) == 2:
374 ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means)
385 @brief Animates clustering process that is performed by EM algorithm. 387 @param[in] data (list): Dataset that is used for clustering. 388 @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process. 389 @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only). 390 @param[in] movie_fps (uint): Defines frames per second (for rendering movie only). 391 @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter. 395 figure = plt.figure()
398 return frame_generation(0)
400 def frame_generation(index_iteration):
403 figure.suptitle(
"EM algorithm (iteration: " + str(index_iteration) +
")", fontsize = 18, fontweight =
'bold')
405 clusters = observer.get_evolution_clusters()[index_iteration]
406 covariances = observer.get_evolution_covariances()[index_iteration]
407 means = observer.get_evolution_means()[index_iteration]
409 ema_visualizer.show_clusters(clusters, data, covariances, means, figure,
False)
410 figure.subplots_adjust(top = 0.85)
412 return [ figure.gca() ]
414 iterations = len(observer)
415 cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000)
417 if save_movie
is not None:
418 cluster_animation.save(save_movie, writer =
'ffmpeg', fps = movie_fps, bitrate = 1500)
424 def __draw_ellipses(figure, visualizer, clusters, covariances, means):
425 ax = figure.get_axes()[0]
427 for index
in range(len(clusters)):
428 angle, width, height = calculate_ellipse_description(covariances[index])
429 color = visualizer.get_cluster_color(index, 0)
431 ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color)
435 def __draw_ellipse(ax, x, y, angle, width, height, color):
436 if (width > 0.0)
and (height > 0.0):
437 ax.plot(x, y, color=color, marker=
'x', markersize=6)
438 ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=-angle, linewidth=2, fill=
True, zorder=2, color=color)
439 ax.add_patch(ellipse)
445 @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM). 446 @details The algorithm provides only clustering services (unsupervised learning). 447 Here an example of data clustering process: 449 from pyclustering.cluster.ema import ema, ema_visualizer 450 from pyclustering.utils import read_sample 451 from pyclustering.samples.definitions import FCPS_SAMPLES 453 # Read data from text file. 454 sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN) 456 # Create EM algorithm to allocated four clusters. 457 ema_instance = ema(sample, 3) 459 # Run clustering process. 460 ema_instance.process() 462 # Get clustering results. 463 clusters = ema_instance.get_clusters() 464 covariances = ema_instance.get_covariances() 465 means = ema_instance.get_centers() 467 # Visualize obtained clustering results. 468 ema_visualizer.show_clusters(clusters, sample, covariances, means) 471 Here is clustering results of the Expectation-Maximization clustering algorithm where popular sample 'OldFaithful' was used. 472 Initial random means and covariances were used in the example. The first step is presented on the left side of the figure and 473 final result (the last step) is on the right side: 474 @image html ema_old_faithful_clustering.png 480 def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
482 @brief Initializes Expectation-Maximization algorithm for cluster analysis. 484 @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates. 485 @param[in] amount_clusters (uint): Amount of clusters that should be allocated. 486 @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation). 487 If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default. 488 @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of 489 covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then 490 K-Means algorithm with K-Means++ method will be used for initialization by default. 491 @param[in] observer (ema_observer): Observer for gathering information about clustering process. 492 @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and 493 previous log-likelihood estimation is less then 'tolerance' then clustering is over). 494 @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be 495 performed by the algorithm during clustering process. 499 self.
__data = numpy.array(data)
508 if (means
is None)
or (variances
is None):
511 if len(self.
__means) != amount_clusters:
514 self.
__rc = [ [0.0] * len(self.
__data)
for _
in range(amount_clusters) ]
515 self.
__pic = [1.0] * amount_clusters
517 self.
__gaussians = [ []
for _
in range(amount_clusters) ]
523 @brief Run clustering process of the algorithm. 524 @details This method should be called before call 'get_clusters()'. 528 previous_likelihood = -200000
529 current_likelihood = -100000
531 current_iteration = 0
532 while(self.
__stop is False)
and (abs(previous_likelihood - current_likelihood) > self.
__tolerance)
and (current_iteration < self.
__iterations):
536 current_iteration += 1
541 previous_likelihood = current_likelihood
550 @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset, 551 for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]]. 559 @return (list) Corresponding centers (means) of clusters. 568 @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters. 577 @brief Returns 2-dimensional list with belong probability of each object from data to cluster correspondingly, 578 where that first index is for cluster and the second is for point. 581 # Get belong probablities 582 probabilities = ema_instance.get_probabilities(); 584 # Show porbability of the fifth element in the first and in the second cluster 586 print("Probability in the first cluster:", probabilities[0][index_point]); 587 print("Probability in the first cluster:", probabilities[1][index_point]); 590 @return (list) 2-dimensional list with belong probability of each object from data to cluster. 597 def __erase_empty_clusters(self):
598 clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], []
600 for index_cluster
in range(len(self.
__clusters)):
602 clusters.append(self.
__clusters[index_cluster])
603 means.append(self.
__means[index_cluster])
605 pic.append(self.
__pic[index_cluster])
607 rc.append(self.
__rc[index_cluster])
620 def __extract_clusters(self):
622 for index_point
in range(len(self.
__data)):
625 candidates.append((index_cluster, self.
__rc[index_cluster][index_point]))
627 index_winner = max(candidates, key =
lambda candidate : candidate[1])[0]
628 self.
__clusters[index_winner].append(index_point)
633 def __log_likelihood(self):
636 for index_point
in range(len(self.
__data)):
639 particle += self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point]
642 likelihood += numpy.log(particle)
647 def __probabilities(self, index_cluster, index_point):
652 if (divider != 0.0)
and (divider != float(
'inf')):
653 return self.
__pic[index_cluster] * self.
__gaussians[index_cluster][index_point] / divider
658 def __expectation_step(self):
665 for index_point
in range(len(self.
__data)):
669 def __maximization_step(self):
674 amount_impossible_clusters = 0
677 mc = numpy.sum(self.
__rc[index_cluster])
680 amount_impossible_clusters += 1
690 def __get_stop_condition(self):
692 if numpy.linalg.norm(covariance) == 0.0:
698 def __update_covariance(self, means, rc, mc):
700 for index_point
in range(len(self.
__data)):
701 deviation = numpy.array( [ self.
__data[index_point] - means ])
702 covariance += rc[index_point] * deviation.T.dot(deviation)
704 covariance = covariance / mc
708 def __update_mean(self, rc, mc):
710 for index_point
in range(len(self.
__data)):
711 mean += rc[index_point] * self.
__data[index_point]
717 def __normalize_probabilities(self):
718 for index_point
in range(len(self.
__data)):
720 for index_cluster
in range(len(self.
__clusters)):
721 probability += self.
__rc[index_cluster][index_point]
723 if abs(probability - 1.0) > 0.000001:
727 def __normalize_probability(self, index_point, probability):
728 if probability == 0.0:
731 normalization = 1.0 / probability
733 for index_cluster
in range(len(self.
__clusters)):
734 self.
__rc[index_cluster][index_point] *= normalization
Common visualizer of clusters on 1D, 2D or 3D surface.
pyclustering module for cluster analysis.
def __init__(self, sample, amount)
Constructs EM initializer.
Cluster analysis algorithm: K-Means.
def __update_covariance(self, means, rc, mc)
def __probabilities(self, index_cluster, index_point)
def __initialize_kmeans(self)
Utils that are used by modules of pyclustering.
def animate_cluster_allocation(data, observer, animation_velocity=75, movie_fps=1, save_movie=None)
Animates clustering process that is performed by EM algorithm.
def __extract_clusters(self)
K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means...
def get_evolution_means(self)
Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
def get_covariances(self)
def __get_stop_condition(self)
def gaussian(data, mean, covariance)
Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covar...
def notify(self, means, covariances, clusters)
This method is used by the algorithm to notify observer about changes where the algorithm should prov...
def __expectation_step(self)
Observer of EM algorithm for collecting algorithm state on each step.
Class represents K-Means clustering algorithm.
def __calculate_initial_clusters(self, centers)
Calculate Euclidean distance to each point from the each cluster.
def get_evolution_covariances(self)
def get_evolution_clusters(self)
def initialize(self, init_type=ema_init_type.KMEANS_INITIALIZATION)
Calculates initial parameters for EM algorithm: means and covariances using specified strategy...
def __init__(self)
Initializes EM observer.
def show_clusters(clusters, sample, covariances, means, figure=None, display=True)
Draws clusters and in case of two-dimensional dataset draws their ellipses.
def __normalize_probabilities(self)
Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm...
def __update_mean(self, rc, mc)
def __maximization_step(self)
Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
def process(self)
Run clustering process of the algorithm.
def get_probabilities(self)
Returns 2-dimensional list with belong probability of each object from data to cluster correspondingl...
def __initialize_random(self)
Visualizer of EM algorithm's results.
def __calculate_initial_covariances(self, initial_clusters)
Enumeration of initialization types for Expectation-Maximization algorithm.
def __log_likelihood(self)
def __normalize_probability(self, index_point, probability)
def __init__(self, data, amount_clusters, means=None, variances=None, observer=None, tolerance=0.00001, iterations=100)
Initializes Expectation-Maximization algorithm for cluster analysis.
def __erase_empty_clusters(self)