import pandas as pd import numpy as np import patsy as pt import statsmodels.api as sm import pandas.rpy.common as com from scipy.stats import gaussian_kde np.random.seed(44333) x = np.random.normal(size=50) e = np.random.normal(size=50) e2 = np.random.standard_cauchy(size=50) # violating assumption b0 = 1 b1 = 2 y = b0 + b1 * x + e y2 = b0 + b1 * x + e2 y1, x1 = pt.dmatrices('y ~ x') y2, x2 = pt.dmatrices('y2 ~ x') f, (ax1, ax2) = subplots(ncols=2) ax1.plot(sm.OLS(y1, x1).fit().fittedvalues, sm.OLS(y1, x1).fit().resid, 'ok') ax2.plot(sm.OLS(y2, x2).fit().fittedvalues, sm.OLS(y2, x2).fit().resid, 'ok') f.set_size_inches(8, 3) f.tight_layout(); # repeated simulations np.random.seed(44333) betaNorm = [] betaCauch = [] for i in xrange(1000): x = np.random.normal(size=50) e = np.random.normal(size=50) e2 = np.random.standard_cauchy(size=50) # violating assumption b0 = 1 b1 = 2 y = b0 + b1 * x + e y2 = b0 + b1 * x + e2 y1, x1 = pt.dmatrices('y ~ x') y2, x2 = pt.dmatrices('y2 ~ x') betaNorm.append(sm.OLS(y1, x1).fit().params[1]) betaCauch.append(sm.OLS(y2, x2).fit().params[1]) [pd.Series(betaNorm).quantile(p) for p in [0., .25, .5, .75, 1.]] [pd.Series(betaCauch).quantile(p) for p in [0., .25, .5, .75, 1.]] boxplot([betaNorm, betaCauch]) ylim(-5, 5); galton = com.load_data('galton', package='UsingR') nobs = galton.shape[0] f, (ax1, ax2) = subplots(ncols=2) ax1.hist(galton['child'], bins=100) ax2.hist(galton['parent'], bins=100) f.set_size_inches(8, 3) f.tight_layout(); y, x = pt.dmatrices('child ~ parent', galton) lm1 = sm.OLS(y, x).fit() parent0 = np.random.normal(loc=np.mean(galton['parent']), \ scale=np.std(galton['parent']), size=nobs) child0 = lm1.params[0] + lm1.params[1] * parent0 + \ np.random.normal(scale=np.std(lm1.resid), size=nobs) f, (ax1, ax2) = subplots(ncols=2) ax1.plot(galton['parent'], galton['child'], 'ok') ax2.plot(parent0, child0, 'o', alpha=.7) f.set_size_inches(8, 3) f.tight_layout(); stamp = com.load_data('stamp', package='bootstrap') nobs = stamp.shape[0] dens = gaussian_kde(stamp['Thickness']) x = np.linspace(.06, .14, num=200) hist(stamp['Thickness'], bins=60, color='grey') plot(x, dens(x), linewidth=3); # a simulation that is too simple plot(x, dens(x), 'k', linewidth=3) x = np.linspace(.04, .14, num=200) for i in xrange(10): newThick = np.random.normal(loc=np.mean(stamp['Thickness']), \ scale=np.std(stamp['Thickness']), size=nobs) ds = gaussian_kde(newThick) plot(x, ds(x), color='grey', linewidth=1) plot(x, dens(x), 'k', linewidth=3) x = np.linspace(.04, .14, num=200) for i in xrange(10): newThick = np.random.normal(loc=stamp['Thickness'], \ scale=dens.factor * .01, \ size=nobs) ds = gaussian_kde(newThick) plot(x, ds(x), color='grey', linewidth=1) plot(x, dens(x), 'k', linewidth=3) x = np.linspace(.04, .14, num=200) for i in xrange(10): newThick = np.random.normal(loc=stamp['Thickness'], \ scale=dens.factor * .01 * 2, \ size=nobs) ds = gaussian_kde(newThick) plot(x, ds(x), color='grey', linewidth=1)