%pylab inline from __future__ import division import utils import seaborn seaborn.set() colors = seaborn.color_palette() # Generate a population with mean 0 and sd of 1 k = 10000 N = randn(k) # Next we will simulate three samples of differnet sizes from the distribution N n = [8, 65, 512] # There are a variety of ways of getting random samples, we'll do it here # by permuting and taking the first k values in the resulting array. # Note that this method will sample *without* replacement, in contrast to # indexing with the return from randint(), as we did before. d_empirical = list() for k in n: sa = permutation(N)[:k] sb = permutation(N)[:k] d_empirical.append(sa.mean() - sb.mean()) print d_empirical n_samples = 100 for i, k in enumerate(n): sa = zeros(k) sb = zeros(k) sa, sb = [], [] for j in xrange(n_samples): sa.append(permutation(N)[:k]) sb.append(permutation(N)[:k]) # Compute the difference vector and plot a histogram # Represent the histogram as a PMF d = mean(sa, axis=1) - mean(sb, axis=1) bar(*utils.pmf_hist(d, 20), color=colors[i], alpha=0.5, label="sample size = %d" % k) # Add some description to the plot title("Probability of getting a spurious\ndifference between two random samples\nas function of sample size.") ylabel("Probability of occurrence") xlabel("Difference between random samples") # Additionally, plot the empirical diffences we obtained in the first step as vertical lines colors = seaborn.color_palette("deep") for i, d in enumerate(d_empirical): axvline(d, color=colors[i], linewidth=2) legend(loc="best"); k = 10000 N = concatenate([chisquare(6, k / 2), randn(k / 2)]) mu = mean(N) me = median(N) sd = std(N) hist(N, 100) axvline(me, label="median", color=colors[1], linewidth=3) axvline(mu, label="mean", color=colors[2], linewidth=3) sd_y = ylim()[1] * .5 plot([mu - sd, mu + sd], [sd_y] * 2, "--", color=colors[2], linewidth=3, label="std dev") title("Simulated true distribution (Chisquare, 6 DOF + Gaussian)") legend(); n = 4 m = 8 sa = permutation(N)[:n] sb = permutation(N)[:m] d_empirical = mean(sa) - mean(sb) sh0 = concatenate((sa, sb)) k = 10000 sa_rand = zeros((n, k)) sb_rand = zeros((m, k)) for i in xrange(k): iter_dist = permutation(sh0) sa_rand[:, i] = iter_dist[:n] sb_rand[:, i] = iter_dist[n:] d = sa_rand.mean(axis=0) - sb_rand.mean(axis=0) bar(*utils.pmf_hist(d, 100)) title("Distribution of the differences between the means of samples sa and sb") ylabel("Probability of occurance") xlabel("Difference (sa - sb)") # Plot the actual difference we got plot([d_empirical, d_empirical], ylim(), color=colors[4], linewidth=2.5) # Get the probability of such a result by counting the number of larger (absolute) differences p = sum(abs(d) > abs(d_empirical)) / k text(xlim()[0], ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p, size=12); # First aggregate the samples sh0 = concatenate((sa, sb)) # Resample 10,000 times with replacement k = 10000 sa_rand = zeros((n, k)) sb_rand = zeros((m, k)) for i in xrange(k): sa_rand[:, i] = sh0[randint(0, n + m, n)] sb_rand[:, i] = sh0[randint(0, n + m, m)] # Compute the difference and plot as before d = median(sa_rand, axis=0) - median(sb_rand, axis=0) bar(*utils.pmf_hist(d, 100), color=colors[3]) title('Distribution of the differences between the means of samples sa and sb') ylabel('Probability of occurrence') xlabel('Difference (sa - sb)') plot([d_empirical] * 2, ylim(), color=colors[1], linewidth=2.5) p = sum(abs(d) > abs(d_empirical)) / k text(xlim()[0], ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p); from scipy.stats import ttest_ind t, p = ttest_ind(sa, sb) print "t = %.2f; p = %.3f" % (t, p) n = 100 w = 0.6 s1 = randn(n) s2 = w * s1 + (1 - w) * randn(n) # First there's the longwinded arithmetic # (Note this isn't all that longwinded since it's a vectorized calculationg) r1 = mean(((s1 - s1.mean()) / s1.std()) * ((s2 - s2.mean()) / s2.std())) # We can also use functions from the scipy.stats package from scipy import stats r2 = mean(stats.zscore(s1) * stats.zscore(s2)) # This package even offers a one-step function # Note this returns a tuple (r, p) r3, p3 = stats.pearsonr(s1, s2) print "%.3f, %.3f, %.3f" % (r1, r2, r3) # Note that scatter(s1, s2) is equivalent to plot(s1, s2, "o") plot(s1, s2, "o") xlabel("Values in variable 1") ylabel("Values in variable 2") # We can display the simulated and computed correlations annot = "Correlations:\nSimulated: %.3f\nEstimated: %.3f" % (w, r1) text(xlim()[0] + .2, ylim()[1] - 1, annot, size=12); # Number of samples k = 1000 r_dist = zeros(k) # Do the bootstrap for i in xrange(k): idx = randint(0, n, n) r_dist[i] = stats.pearsonr(s1[idx], s2[idx])[0] # Compute the two-tailed 95% confidence intervals ci = utils.percentiles(r_dist, [2.5, 97.5]) # Plot the correlation with error bars bar(0.5, r1, width=1) plot([1, 1], ci, color=colors[2], linewidth=3) xlim(0, 2) ylim(0, 1) xticks(()) title("Estimated correlation coefficient with error") ylabel("Correlation between s1, s2"); k = 1000 r_dist = zeros(k) for i in xrange(k): s1_rand = s1[randint(0, n, n)] s2_rand = s2[randint(0, n, n)] r_dist[i] = stats.pearsonr(s1_rand, s2_rand)[0] bar(*utils.pmf_hist(r_dist, 20)) title('Distribution of correlations between s1 and s2') ylabel('Likelihood of occurrence') xlabel('Pearson correlation coefficient') # Now plot the empirical correlation value axvline(r1, color=colors[3], linewidth=3) p = sum(abs(r1) < abs(r_dist)) / k text(0.2, ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p); # Say our acutal measured correlation was 0.4 r_empirical = 0.4 # Simulate two IID normal variables k = 10000 # number of sims n = 30 # sample size s1_sim = randn(k, n) s2_sim = randn(k, n) # Compute the correlation coefficient for each sample r_dist = array(map(stats.pearsonr, s1_sim, s2_sim))[:, 0] # Take first column (r values) # Plot the distribution of coefficients obtained this way bar(*utils.pmf_hist(r_dist, 100), color=colors[4]) title('Distribution of the Montecarlo-simulated correlation coefficients') ylabel('Probability of occurrence') xlabel('Corr(s1,s2)') # Plot the observed correlation on this PMF axvline(r_empirical, color=colors[0], linewidth=3) # Obtain the probability of our observed value by counting how many # r's from our randomization test were more extreme p = sum(abs(r_empirical) > abs(r_dist)) / k xpos = xlim()[0] * .95 ypos = ylim()[1] * .95 text(xpos, ypos, "$H_0$ is false with probability %.3f" % p, size=12);