import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
np.random.seed(13)
data = np.random.random(100)
plt.hist(data, bins=15, normed=True, color='black', alpha=0.5)
xs = np.linspace(0, 1, 100)
plt.title('Histogram of $U(0,1)$ samples')
<matplotlib.text.Text at 0xa7e560c>
def normal(x, mu, sigma):
"""Normal distribution PDF."""
return np.exp(-0.5 * ((x - mu) / sigma)**2) / (np.sqrt(2 * np.pi) * sigma)
def _fit_gmm(data, num_components, num_iters=100):
"""Fit a single GMM with EM with one random initialization."""
mu = np.random.choice(data, num_components, replace=False)
sigma = np.ones(num_components) * 0.1
prob_population = np.ones(num_components) / num_components
for i in xrange(num_iters):
# E-step.
probs = [normal(x, mu, sigma) for x in data]
probs = np.array([p / p.sum() for p in probs])
# M-step.
for k in xrange(num_components):
k_probs = probs[:, k]
mu[k] = (k_probs * data).sum() / k_probs.sum()
sigma[k] = np.sqrt((k_probs * (data - mu[k])**2).sum() / k_probs.sum())
prob_population[k] = k_probs.sum() / len(data)
log_likelihood = np.log(np.product([(normal(data[n], mu, sigma) * probs[n, :]).sum()
for n in xrange(len(data))]))
return mu, sigma, prob_population, log_likelihood
def fit_gmm(data, num_components, num_iters=10, num_random_inits=10):
"""Find the maximum likelihood GMM over several random initializations."""
best_results = None
best_ll = None
for attempt in xrange(num_random_inits):
mu, sigma, prob_population, log_likelihood = _fit_gmm(data, num_components, num_iters=num_iters)
if best_ll is None or log_likelihood > best_ll:
best_results = mu, sigma, prob_population, log_likelihood
best_ll = log_likelihood
return best_results
COLORS = 'bgrcmy'
def plot_gmm_fit(mu, sigma, prob_population, data):
"""Plot a GMM PDF along with the data it was estimated from."""
xs = np.linspace(0, 1, 100)
ys = [(normal(x, mu, sigma) * prob_population).sum() for x in xs]
plt.hist(data, normed=True, bins=15, color='lightgray', alpha=0.5, label='Real Data')
plt.plot(xs, ys, label="GMM ({0} components)".format(len(mu)), color='black', linewidth=5)
for k in xrange(len(mu)):
ys = [normal(x, mu[k], sigma[k]) * prob_population[k] for x in xs]
plt.plot(xs, ys, alpha=0.8, color=COLORS[k % len(COLORS)], linewidth=2)
plt.legend()
def fit_gmm_and_plot(data, num_components):
"""Fit and plot a GMM in one step."""
mu, sigma, prob_population, _ = fit_gmm(data, num_components, num_iters=1000, num_random_inits=1000)
plot_gmm_fit(mu, sigma, prob_population, data)
fit_gmm_and_plot(data, 1)
fit_gmm_and_plot(data, 2)
fit_gmm_and_plot(data, 3)
fit_gmm_and_plot(data, 4)
fit_gmm_and_plot(data, 5)
fit_gmm_and_plot(data, 10)
fit_gmm_and_plot(data, 25)