import pandas as pd
import numpy as np
import patsy as pt
import statsmodels.api as sm
import pandas.rpy.common as com
from scipy.stats import gaussian_kde
Simulating data from a model
np.random.seed(44333)
x = np.random.normal(size=50)
e = np.random.normal(size=50)
e2 = np.random.standard_cauchy(size=50) # violating assumption
b0 = 1
b1 = 2
y = b0 + b1 * x + e
y2 = b0 + b1 * x + e2
y1, x1 = pt.dmatrices('y ~ x')
y2, x2 = pt.dmatrices('y2 ~ x')
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(sm.OLS(y1, x1).fit().fittedvalues, sm.OLS(y1, x1).fit().resid, 'ok')
ax2.plot(sm.OLS(y2, x2).fit().fittedvalues, sm.OLS(y2, x2).fit().resid, 'ok')
f.set_size_inches(8, 3)
f.tight_layout();
# repeated simulations
np.random.seed(44333)
betaNorm = []
betaCauch = []
for i in xrange(1000):
x = np.random.normal(size=50)
e = np.random.normal(size=50)
e2 = np.random.standard_cauchy(size=50) # violating assumption
b0 = 1
b1 = 2
y = b0 + b1 * x + e
y2 = b0 + b1 * x + e2
y1, x1 = pt.dmatrices('y ~ x')
y2, x2 = pt.dmatrices('y2 ~ x')
betaNorm.append(sm.OLS(y1, x1).fit().params[1])
betaCauch.append(sm.OLS(y2, x2).fit().params[1])
[pd.Series(betaNorm).quantile(p) for p in [0., .25, .5, .75, 1.]]
[1.5210990549672583, 1.9066088365880756, 2.0075332965346329, 2.1005155714050501, 2.4620828108875505]
[pd.Series(betaCauch).quantile(p) for p in [0., .25, .5, .75, 1.]]
[-67.911046558676702, 1.1673387367300572, 2.0004418485723363, 2.8119298983891468, 205.11312906283948]
boxplot([betaNorm, betaCauch])
ylim(-5, 5);
Simulation based on a data set
galton = com.load_data('galton', package='UsingR')
nobs = galton.shape[0]
f, (ax1, ax2) = subplots(ncols=2)
ax1.hist(galton['child'], bins=100)
ax2.hist(galton['parent'], bins=100)
f.set_size_inches(8, 3)
f.tight_layout();
y, x = pt.dmatrices('child ~ parent', galton)
lm1 = sm.OLS(y, x).fit()
parent0 = np.random.normal(loc=np.mean(galton['parent']), \
scale=np.std(galton['parent']), size=nobs)
child0 = lm1.params[0] + lm1.params[1] * parent0 + \
np.random.normal(scale=np.std(lm1.resid), size=nobs)
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(galton['parent'], galton['child'], 'ok')
ax2.plot(parent0, child0, 'o', alpha=.7)
f.set_size_inches(8, 3)
f.tight_layout();
Simulating more complicated scenarios
stamp = com.load_data('stamp', package='bootstrap')
nobs = stamp.shape[0]
dens = gaussian_kde(stamp['Thickness'])
x = np.linspace(.06, .14, num=200)
hist(stamp['Thickness'], bins=60, color='grey')
plot(x, dens(x), linewidth=3);
# a simulation that is too simple
plot(x, dens(x), 'k', linewidth=3)
x = np.linspace(.04, .14, num=200)
for i in xrange(10):
newThick = np.random.normal(loc=np.mean(stamp['Thickness']), \
scale=np.std(stamp['Thickness']), size=nobs)
ds = gaussian_kde(newThick)
plot(x, ds(x), color='grey', linewidth=1)
plot(x, dens(x), 'k', linewidth=3)
x = np.linspace(.04, .14, num=200)
for i in xrange(10):
newThick = np.random.normal(loc=stamp['Thickness'], \
scale=dens.factor * .01, \
size=nobs)
ds = gaussian_kde(newThick)
plot(x, ds(x), color='grey', linewidth=1)
plot(x, dens(x), 'k', linewidth=3)
x = np.linspace(.04, .14, num=200)
for i in xrange(10):
newThick = np.random.normal(loc=stamp['Thickness'], \
scale=dens.factor * .01 * 2, \
size=nobs)
ds = gaussian_kde(newThick)
plot(x, ds(x), color='grey', linewidth=1)