import numpy as np
from numpy import random

from scipy.optimize import minimize, show_options
from scipy.stats import norm
Пусть $s_1=\{s_{1i}\}$ и $s_2 = \{s_{2i}\}$ — две выборки из гауссовых распределений. Мы предполагаем, что $s_1$ и $s_2$ известны.
N = 1000
a = 0.3
s1 = normal(0, 0.08, size=N*a)
s2 = normal(0.6,0.12, size=N*(1-a))
s = concatenate([s1,s2])
hist(s, bins=20);
Модель - линейная комбинация двух гауссовых распределений: $$f(x|p) = \frac{\pi_1}{\sigma_1\sqrt{2\pi}}\exp\left\{-\frac{(x-\mu_1)^2}{2\sigma_1^2}\right\}+\frac{\pi_2}{\sigma_2\sqrt{2\pi}}\exp\left\{-\frac{(x-\mu_2)^2}{2\sigma_2^2}\right\},~\text{где}~p = [\mu_1, \sigma_1,\mu_2, \sigma_2,\pi_1],~ \pi_2 = 1-\pi_1.$$
Определим $f(x|p)$
def pdf_model(x, p):
mu1, sig1, mu2, sig2, pi_1 = p
return pi_1*normpdf(x, mu1, sig1) + (1-pi_1)*normpdf(x, mu2, sig2)
$$f_2(x|p) = \frac{1}{\sigma_2\sqrt{2\pi}}\exp\left\{-\frac{(x-\mu_2)^2}{2\sigma_2^2}\right\}$$.
Смесь $f(x|p) = \pi_1f_1(x|p)+\pi_2f_2(x|p)$
$E$-шаг: при известном $p$ вычисляем значение вектора скрытых переменных $\gamma(i,k)$:$$\gamma(i,k) = \frac{\pi_kf_k(s_i|p)}{f(s_i|p)},~\text{где}~f(s_i|p)=\sum_{j}\pi_jf_j(s_i|p)$$
$М$-шаг: переоценка вектора параметров, используя текущее значение вектора скрытых переменных:
$N_k = \sum_{i=1}^{N}\gamma(i,k),~\text{где}~k = 1,2.~\text{}~ N = N_1+N_2$
$EM$-алгоритм
max_iter = 100
# Initial guess of parameters and initializations
p0 = array([-0.2,0.2,0.8,0.2,0.5])
mu1, sig1, mu2, sig2, pi_1 = p0
mu = array([mu1, mu2])
sig = array([sig1, sig2])
pi_ = array([pi_1, 1-pi_1])
gamma = zeros((2, s.size))
N_ = zeros(2)
p_new = p0
# EM loop
counter = 0
converged = False
while not converged:
# Compute the responsibility func. and new parameters
for k in [0,1]:
gamma[k,:] = pi_[k]*normpdf(s, mu[k], sig[k])/pdf_model(s, p_new)
N_[k] = 1.*gamma[k].sum()
mu[k] = sum(gamma[k]*s)/N_[k]
sig[k] = sqrt( sum(gamma[k]*(s-mu[k])**2)/N_[k] )
pi_[k] = N_[k]/s.size
p_new = [mu[0], sig[0], mu[1], sig[1], pi_[0]]
assert abs(N_.sum() - N)/float(N) < 1e-6
assert abs(pi_.sum() - 1) < 1e-6
# Convergence check
counter += 1
converged = counter >= max_iter
print "Means: %6.3f %6.3f" % (p_new[0], p_new[2])
print "Std dev: %6.3f %6.3f" % (p_new[1], p_new[3])
print "Mix (1): %6.3f " % p_new[4]
Means: -0.003 0.597 Std dev: 0.076 0.117 Mix (1): 0.300
print pi_.sum(), N_.sum()
1.0 1000.0
Для обобщения $EM$-алгоритма на взвешенный случай: $$\gamma(i,k) = \frac{\pi_kf_k(s_i|p)}{f(s_i|p)}w_i$$
Требование: $\sum_iw_i = N$
$N_k, \mu_k^{\text{new}}, \sigma_k^{2,\text{new}}, \pi_k^{\text{new}}$ вычисляются как обычно, но при новом определении $\gamma(i,k)$
Функция для генерации данных.
from scipy.stats import expon
def sim_single_population(mu, N=1000, max_sigma=0.5, mean_sigma=0.08):
exp_min_size = 1./max_sigma**2
exp_mean_size = 1./mean_sigma**2
sigma = 1/sqrt(expon.rvs(loc=exp_min_size, scale=exp_mean_size, size=N))
return normal(mu, scale=sigma, size=N), sigma
N = 1000
a = 0.3
s1, sig1 = sim_single_population(0, N=N*a)
s2, sig2 = sim_single_population(0.5, N=N*(1-a))
s = concatenate([s1, s2])
sigma_tot = concatenate([sig1, sig2])
hist(s, bins=r_[-1:2:0.025], alpha=0.3, color='g', histtype='stepfilled');
ax = twinx(); ax.grid(False)
ax.plot(s, 0.1/sigma_tot, 'o', mew=0, ms=2, alpha=0.6, color='b')
xlim(-0.5, 1.5); title('Simulated sample (to be fitted)')
print "Means: %6.3f %6.3f" % (s1.mean(), s2.mean())
print "Std dev: %6.3f %6.3f" % (sqrt((sig1**2).mean()), sqrt((sig2**2).mean()))
print "Mix (1): %6.3f " % a
Means: -0.000 0.507 Std dev: 0.148 0.144 Mix (1): 0.300
Применим $EM$-алгоритм для этого случая и нарисуем результаты.
max_iter = 300
weights = 1./sigma_tot**2
# Renormalizing the weights so they sum to N
weights *= 1.*weights.size/weights.sum()
# No weights case
#weights = ones(s.size)
# Initial guess of parameters and initializations
p0 = array([-0.05,0.1,0.6,0.1,0.5])
mu1, sig1, mu2, sig2, pi_1 = p0
mu = array([mu1, mu2])
sig = array([sig1, sig2])
pi_ = array([pi_1, 1-pi_1])
gamma = zeros((2, s.size))
N_ = zeros(2)
p_new = p0
# EM loop
counter = 0
converged = False
while not converged:
# Compute the responsibility func. and new parameters
for k in [0,1]:
gamma[k,:] = weights*pi_[k]*normpdf(s, mu[k], sig[k])/pdf_model(s, p_new)
N_[k] = gamma[k,:].sum()
mu[k] = sum(gamma[k]*s)/N_[k]
sig[k] = sqrt( sum(gamma[k]*(s-mu[k])**2)/N_[k] )
pi_[k] = 1.*N_[k]/N
p_new = [mu[0], sig[0], mu[1], sig[1], pi_[0]]
assert abs(N_.sum() - N)/float(N) < 1e-6
assert abs(pi_.sum() - 1) < 1e-6
# Convergence check
counter += 1
converged = counter >= max_iter
print "Means: %7.4f %7.4f" % (p_new[0], p_new[2])
print "Std dev: %7.4f %7.4f" % (p_new[1], p_new[3])
print "Mix (1): %7.4f " % p_new[4]
Means: -0.0022 0.5053 Std dev: 0.0807 0.0781 Mix (1): 0.3070
x = r_[-1:2:0.01]
plot(x, pdf_model(x, p_new), color='k', lw=2); grid(True)
plot(s, 0.1/sigma_tot, 'o', mew=0, ms=2, alpha=0.5);