Simulation for model checking¶

In [117]:

import pandas as pd
import numpy as np
import patsy as pt
import statsmodels.api as sm
import pandas.rpy.common as com
from scipy.stats import gaussian_kde

Simulating data from a model

In [157]:

np.random.seed(44333)
x = np.random.normal(size=50)
e = np.random.normal(size=50)
e2 = np.random.standard_cauchy(size=50) # violating assumption

b0 = 1
b1 = 2

y = b0 + b1 * x + e
y2 = b0 + b1 * x + e2

In [158]:

y1, x1 = pt.dmatrices('y ~ x')
y2, x2 = pt.dmatrices('y2 ~ x')

f, (ax1, ax2) = subplots(ncols=2)

ax1.plot(sm.OLS(y1, x1).fit().fittedvalues, sm.OLS(y1, x1).fit().resid, 'ok')
ax2.plot(sm.OLS(y2, x2).fit().fittedvalues, sm.OLS(y2, x2).fit().resid, 'ok')

f.set_size_inches(8, 3)
f.tight_layout();

In [159]:

# repeated simulations
np.random.seed(44333)
betaNorm = []
betaCauch = []

for i in xrange(1000):
    x = np.random.normal(size=50)
    e = np.random.normal(size=50)
    e2 = np.random.standard_cauchy(size=50) # violating assumption

    b0 = 1
    b1 = 2

    y = b0 + b1 * x + e
    y2 = b0 + b1 * x + e2
    
    y1, x1 = pt.dmatrices('y ~ x')
    y2, x2 = pt.dmatrices('y2 ~ x')
    
    betaNorm.append(sm.OLS(y1, x1).fit().params[1])
    betaCauch.append(sm.OLS(y2, x2).fit().params[1])

In [160]:

[pd.Series(betaNorm).quantile(p) for p in [0., .25, .5, .75, 1.]]

Out[160]:

[1.5210990549672583,
 1.9066088365880756,
 2.0075332965346329,
 2.1005155714050501,
 2.4620828108875505]

In [161]:

[pd.Series(betaCauch).quantile(p) for p in [0., .25, .5, .75, 1.]]

Out[161]:

[-67.911046558676702,
 1.1673387367300572,
 2.0004418485723363,
 2.8119298983891468,
 205.11312906283948]

In [166]:

boxplot([betaNorm, betaCauch])
ylim(-5, 5);

Simulation based on a data set

In [167]:

galton = com.load_data('galton', package='UsingR')
nobs = galton.shape[0]

f, (ax1, ax2) = subplots(ncols=2)

ax1.hist(galton['child'], bins=100)
ax2.hist(galton['parent'], bins=100)

f.set_size_inches(8, 3)
f.tight_layout();

In [168]:

y, x = pt.dmatrices('child ~ parent', galton)
lm1 = sm.OLS(y, x).fit()

parent0 = np.random.normal(loc=np.mean(galton['parent']), \
                           scale=np.std(galton['parent']), size=nobs)

child0 = lm1.params[0] + lm1.params[1] * parent0 + \
         np.random.normal(scale=np.std(lm1.resid), size=nobs)

f, (ax1, ax2) = subplots(ncols=2)

ax1.plot(galton['parent'], galton['child'], 'ok')
ax2.plot(parent0, child0, 'o', alpha=.7)

f.set_size_inches(8, 3)
f.tight_layout();

Simulating more complicated scenarios

In [169]:

stamp = com.load_data('stamp', package='bootstrap')
nobs = stamp.shape[0]

dens = gaussian_kde(stamp['Thickness'])

x = np.linspace(.06, .14, num=200)
hist(stamp['Thickness'], bins=60, color='grey')
plot(x, dens(x), linewidth=3);

In [170]:

# a simulation that is too simple
plot(x, dens(x), 'k', linewidth=3)

x = np.linspace(.04, .14, num=200)
for i in xrange(10):
    newThick = np.random.normal(loc=np.mean(stamp['Thickness']), \
               scale=np.std(stamp['Thickness']), size=nobs)
    
    ds = gaussian_kde(newThick)
    plot(x, ds(x), color='grey', linewidth=1)

In [171]:

plot(x, dens(x), 'k', linewidth=3)

x = np.linspace(.04, .14, num=200)
for i in xrange(10):
    newThick = np.random.normal(loc=stamp['Thickness'], \
                                scale=dens.factor * .01, \
                                size=nobs)
    
    ds = gaussian_kde(newThick)
    plot(x, ds(x), color='grey', linewidth=1)

In [172]:

plot(x, dens(x), 'k', linewidth=3)

x = np.linspace(.04, .14, num=200)
for i in xrange(10):
    newThick = np.random.normal(loc=stamp['Thickness'], \
                                scale=dens.factor * .01 * 2, \
                                size=nobs)
    ds = gaussian_kde(newThick)
    plot(x, ds(x), color='grey', linewidth=1)

In [ ]: