# Plotting style: switch matplotlib to seaborn's defaults and grab the
# default qualitative palette; colors[i] is used for the bar charts below.
import seaborn
seaborn.set()
colors = seaborn.color_palette()
import random

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
from numpy.matlib import repmat

import moss
# Generate synthetic data.
# Synthetic regression data: 4 Gaussian features plus an intercept column.
n_obs = 100
X = np.column_stack((np.random.randn(n_obs, 4), np.ones(n_obs)))
w = np.random.rand(5)  # true weights, drawn uniformly from [0, 1)
noise = np.random.randn(n_obs) * 5  # i.i.d. Gaussian noise, std = 5
y = np.dot(X, w) + noise
# Estimate the parameter vector by ordinary least squares.
ols_fit = lambda X, y: dot(dot(inv(dot(X.T, X)), X.T), y)
w_ols = ols_fit(X, y)
# Compare the true weights with the estimate.
# Side-by-side bars: true weights vs. the OLS estimate.
# (The original relied on %pylab for bare `bar`/`arange`; qualify them.)
plt.bar(np.arange(5) + .1, w, .4, label="actual weights")
plt.bar(np.arange(5) + .5, w_ols, .4, color=colors[1], label="estimated weights")
# Compare bootstrap parameter estimates for models with different noise levels.
# Design matrix with 1000 observations; the small-sample regimes reuse
# the first 100 rows.
X = np.column_stack((np.random.randn(1000, 4), np.ones(1000)))
w = np.random.rand(5)
# Large sample, high noise: N = 1000, noise std = 5
y_large_n = np.dot(X, w) + np.random.randn(1000) * 5
# Small sample, low noise: high signal-to-noise ratio
y_lownoise = np.dot(X[:100], w) + np.random.randn(100)
# Small sample, high noise
y_noisy = np.dot(X[:100], w) + np.random.randn(100) * 5
# Bootstrap the OLS fit in each regime.
# NOTE(review): moss.bootstrap is assumed to resample (X, y) rows with
# replacement and apply `func` to each resample, returning an
# (n_boot, n_params) array — confirm against the moss API.
n_boot = 1000
w_boot1 = moss.bootstrap(X, y_large_n, n_boot=n_boot, func=ols_fit)
w_boot2 = moss.bootstrap(X[:100], y_lownoise, n_boot=n_boot, func=ols_fit)
w_boot3 = moss.bootstrap(X[:100], y_noisy, n_boot=n_boot, func=ols_fit)
# Bootstrap point estimates: mean over the bootstrap distribution.
# (Bare `mean` was undefined without %pylab; use np.mean.)
w_model1 = np.mean(w_boot1, axis=0)
w_model2 = np.mean(w_boot2, axis=0)
w_model3 = np.mean(w_boot3, axis=0)
# The lower the noise and the larger the sample, the better the estimate.
# Bar chart of the true weights and the three bootstrap estimates with
# 95% percentile confidence intervals as error bars.
# (ci1/ci2/ci3 were undefined in the original — their computation cell was
# lost; recompute them from the bootstrap distributions. Also replace
# `[None] + map(...)`, which fails on Python 3 where map is lazy.)
barx = np.linspace(0, 1, 6)[:-1]
models = [w, w_model1, w_model2, w_model3]
boots = [w_boot1, w_boot2, w_boot3]
cis = [None] + [
    seaborn.utils.ci_to_errsize(np.percentile(b, [2.5, 97.5], axis=0), m)
    for b, m in zip(boots, models[1:])
]
for i, model in enumerate(models):
    plt.bar(barx + i, model, 0.2, yerr=cis[i], color=colors[i], ecolor="gray")
plt.xticks([.5, 1.5, 2.5, 3.5], ["model", "large N", "low noise", "noisy"])
# Empirical check: as the number of bootstrap resamples grows, the sample
# covariance of the bootstrapped OLS estimates should approach the
# theoretical covariance of the estimator, sigma^2 * (X'X)^{-1}
# (noise std for y_large_n is 5 above).
# Fixes to the original loop: `cov` was undefined; `w_est = w_ols` never
# refit on the resample (boot_X/boot_y were unused); the covariance was
# divided by the outer loop counter (starting at 0, giving /-1 and /0);
# `y` (length 100) was indexed with indices valid for the 1000-row X;
# `xrange` is Python-2-only; repmat is unnecessary given broadcasting.
noise_std = 5.0
cov = noise_std ** 2 * np.linalg.inv(np.dot(X.T, X))
n_obs_total = X.shape[0]
mas = []
for n_boots in range(2, 100):  # need >= 2 resamples for a covariance
    w_est_boot = []
    for _ in range(n_boots):
        # Resample observations with replacement and refit OLS.
        indices = np.random.randint(0, n_obs_total, n_obs_total)
        boot_X = X[indices, :]
        boot_y = y_large_n[indices]
        w_est = np.linalg.solve(np.dot(boot_X.T, boot_X),
                                np.dot(boot_X.T, boot_y))
        w_est_boot.append(w_est)
    w_est_boot = np.array(w_est_boot)
    # Unbiased sample covariance of the bootstrap estimates.
    centered = w_est_boot - w_est_boot.mean(axis=0)
    cov_estimation = np.dot(centered.T, centered) / (n_boots - 1)
    mas.append(np.linalg.norm(cov_estimation - cov))
# Plot how far the bootstrap covariance estimate is from the target as the
# number of resamples grows. (`difference_bootstrap` was undefined in the
# original — the accumulated list is `mas`; size the x axis from it so the
# lengths always match.)
plt.plot(range(2, 2 + len(mas)), mas, 'g-')
plt.legend(('||cov_boot - cov||',))
# Out: <matplotlib.legend.Legend at 0x105cab6d0>