import pandas as pd import numpy as np import pandas.rpy.common as com from scipy import stats from statsmodels.formula.api import ols # emulates abline function, only possible to basic line styles and width def abline(a, intercept, gradient, *args, **kwargs): #a = gca() xlim = a.get_xlim() ylim = a.get_ylim() if args: sty = args[0] else: sty = 'r' if kwargs: lw = kwargs['linewidth'] else: lw = 5 a.plot(xlim, [intercept + gradient * x for x in xlim], sty, linewidth=lw) a.set_xlim(xlim) a.set_ylim(ylim) np.random.seed(333) x = np.random.normal(size=30) bootMean = [np.random.choice(x, size=30, replace=True).mean() for i in range(1000)] sampledMean = [np.random.normal(size=30).mean() for i in range(1000)] bootDensity = stats.kde.gaussian_kde(bootMean) sampledDensity = stats.kde.gaussian_kde(sampledMean) x = np.arange(-0.8, 0.8, .01) plot(x, bootDensity(x), 'k') plot(x, sampledDensity(x), 'r'); # this is a bit of a shame import imp bootstrap = imp.load_source('scikits.bootstrap', '/Users/erriza/scikits-bootstrap/src/package/scikits/bootstrap/bootstrap.py') np.random.seed(333) x = np.random.normal(size=30) sampledMean = [np.random.normal(size=30).mean() for i in range(1000)] # this computes the confidence interval bootstrap.ci(x, statfunction=np.mean, n_samples=100) nuclear = com.load_data('nuclear', package='boot') nuke_lm = ols('np.log(cost) ~ date', nuclear).fit() plot(nuclear['date'], np.log(nuclear['cost']), 'ok') abline(gca(), nuke_lm.params['Intercept'], nuke_lm.params['date'], 'r', linewidth=3) # bootsrapping explicitly f, ax = subplots(ncols=3) for axis in ax: nuclear0 = nuclear.ix[np.random.choice(range(1, len(nuclear)+1), size=len(nuclear), replace=True)] nuke_lm0 = ols('np.log(cost) ~ date', nuclear).fit() axis.plot(nuclear0['date'], np.log(nuclear0['cost']), 'ok') abline(axis, nuke_lm0.params['Intercept'], nuke_lm0.params['date'], 'r', linewidth=3) f.set_size_inches(11, 3) f.tight_layout();