# Set matplotlib to plot in the notebook %pylab inline from __future__ import division import utils import seaborn seaborn.set() colors = seaborn.color_palette() # Start by generating a dataset and calculating some statistics d = randn(100) m = d.mean() s = d.std() # Now plot a histogram of the data with 15 bins hist(d, 15) m_y = 2 # Add a plot of the mean and standard deviation range plot(m, m_y, "ko") plot([m - s, m + s], [m_y] * 2, "k--"); # Now let's find the medium and interquartile range med = median(d) iqr = utils.percentiles(d, [25, 75]) # Plot the data with this summary hist(d, 15) plot(med, m_y, "ko") plot(iqr, [m_y] * 2, "k--"); # Let's define a function to streamline the central tendency and error bar plots from above def ctend_plot(point, ci, y, color, label): plot(ci, [y, y], "-", color=color, linewidth=4, label=label) plot(point, y, "o", color=color, markersize=10) hist(d, 15) ctend_plot(m, [m - s, m + s], m_y, colors[1], "std dev") ci = utils.percentiles(d, [16, 84]) ctend_plot(med, ci, m_y - 1, colors[2], "68% CI") legend(); d1 = concatenate([d, [40, 41, 45]]) m1 = d1.mean() s1 = d1.std() med1 = median(d1) ci1 = percentile(d1, [16, 84]) hist(d1, 15) # Find some space near the top of the plot m_y = histogram(d1, 15)[0].max() + 5 ctend_plot(m1, [m1 - s1, m1 + s1], m_y, colors[1], "std dev") ctend_plot(med1, ci1, m_y - 2.5, colors[2], "68% CI") legend(); # First create a vector of x values spanning the support of the histogram x = linspace(d.min(), d.max(), 1000) # Let's define a function to calculate the gaussian function for a given x, mean, and std gauss = lambda x, m, s: (1 / (s * sqrt(2 * pi)) * exp(-0.5 * ((x - m) / s) ** 2)) # We could iterate through the x values and calculate a y value for each using this equation y0 = zeros_like(x) for i, x_i in enumerate(x): y0[i] = gauss(x_i, m, s) # You can also do it in one step with a vectorized computation. # In general, avoid for loops, as vectorized expressions are much faster. y = gauss(x, m, s) # Use a normed histogram to match the range of the probaiblity density function hist(d, 15, normed=True) plot(x, y); # Calculate the height of the gaussian for the outlier distribution and plot x1 = linspace(d1.min(), d1.max(), 10000) y1 = gauss(x1, m1, s1) hist(d1, 15, normed=True) plot(x1, y1); from scipy import stats x = linspace(-4, 4, 10001) hist(stats.norm(0, 1).rvs(1000), 30, normed=True, alpha=.6) plot(x, stats.norm(0, 1).pdf(x), linewidth=6) plot(x, gauss(x, 0, 1), linewidth=3); s = 10000 N = randn(s) sample_size = 100 n = N[randint(0, s, sample_size)] # Get the standard error of the mean and 95% CI for the sample sem1 = n.std() / sqrt(sample_size) m1 = n.mean() ci1 = m1 - 2 * sem1, m1 + 2 * sem1 n_samples = 10000 mu = array([N[randint(0, s, sample_size)].mean() for i in xrange(n_samples)]) sem2 = mu.std() ci2 = utils.percentiles(mu, [2.5, 97.5]) hist(n) ctend_plot(N.mean(), ci2, 3, colors[4], "poplation mean") ctend_plot(n.mean(), ci1, 2, colors[2], "sample mean") legend(loc="best"); sample_size = 20 s = 10000 n = N[randint(0, s, sample_size)] n_boots = 10000 # We can bootstrap the two different statistics me = zeros(n_boots) mn = zeros(n_boots) for i in xrange(n_boots): sample = n[randint(0, sample_size, sample_size)] me[i] = median(sample) mn[i] = mean(sample) # Compute the median and its confidence interval med = median(me) ci95med = utils.percentiles(me, [2.5, 97.5]) # Do the same for the mean xbar = mean(mn) ci95mean = utils.percentiles(mn, [2.5, 97.5]) # Define a function to streamline the central tendency and error bar plots from above def ctend_plot(point, ci, y, color, label): plot(ci, [y, y], "-", color=color, linewidth=4, label=label) plot(point, y, "o", color=color, markersize=10) hist(me, 20, alpha=0.5, normed=True) hist(mn, 20, alpha=0.5, normed=True) plot_height = ylim()[1] ctend_plot(med, ci95med, plot_height * .75, colors[0], "median") ctend_plot(xbar, ci95mean, plot_height * .70, colors[1], "mean") legend(loc="best");