%pylab inline

from __future__ import division
import utils
import seaborn
seaborn.set()
colors = seaborn.color_palette()

# Generate a population with mean 0 and sd of 1
k = 10000
N = randn(k)

# Next we will simulate three samples of differnet sizes from the distribution N
n = [8, 65, 512]

# There are a variety of ways of getting random samples, we'll do it here
# by permuting and taking the first k values in the resulting array.
# Note that this method will sample *without* replacement, in contrast to
# indexing with the return from randint(), as we did before.
d_empirical = list()
for k in n:
    sa = permutation(N)[:k]
    sb = permutation(N)[:k]
    d_empirical.append(sa.mean() - sb.mean())

print d_empirical

n_samples = 100
for i, k in enumerate(n):
    sa = zeros(k)
    sb = zeros(k)
    
    sa, sb = [], []
    for j in xrange(n_samples):
        sa.append(permutation(N)[:k])
        sb.append(permutation(N)[:k])
    
    # Compute the difference vector and plot a histogram
    # Represent the histogram as a PMF
    
    d = mean(sa, axis=1) - mean(sb, axis=1)
    bar(*utils.pmf_hist(d, 20), color=colors[i], alpha=0.5, label="sample size = %d" % k)

# Add some description to the plot
title("Probability of getting a spurious\ndifference between two random samples\nas function of sample size.")
ylabel("Probability of occurrence")
xlabel("Difference between random samples")

# Additionally, plot the empirical diffences we obtained in the first step as vertical lines
colors = seaborn.color_palette("deep")
for i, d in enumerate(d_empirical):
    axvline(d, color=colors[i], linewidth=2)
    
legend(loc="best");

k = 10000
N = concatenate([chisquare(6, k / 2), randn(k / 2)])
mu = mean(N)
me = median(N)
sd = std(N)

hist(N, 100)
axvline(me, label="median", color=colors[1], linewidth=3)
axvline(mu, label="mean", color=colors[2], linewidth=3)
sd_y = ylim()[1] * .5
plot([mu - sd, mu + sd], [sd_y] * 2, "--", color=colors[2], linewidth=3, label="std dev")
title("Simulated true distribution (Chisquare, 6 DOF + Gaussian)")
legend();

n = 4
m = 8
sa = permutation(N)[:n]
sb = permutation(N)[:m]

d_empirical = mean(sa) - mean(sb)

sh0 = concatenate((sa, sb))

k = 10000
sa_rand = zeros((n, k))
sb_rand = zeros((m, k))
for i in xrange(k):
    iter_dist = permutation(sh0)
    sa_rand[:, i] = iter_dist[:n]
    sb_rand[:, i] = iter_dist[n:]

d = sa_rand.mean(axis=0) - sb_rand.mean(axis=0)
bar(*utils.pmf_hist(d, 100))
title("Distribution of the differences between the means of samples sa and sb")
ylabel("Probability of occurance")
xlabel("Difference (sa - sb)")

# Plot the actual difference we got
plot([d_empirical, d_empirical], ylim(), color=colors[4], linewidth=2.5)

# Get the probability of such a result by counting the number of larger (absolute) differences
p = sum(abs(d) > abs(d_empirical)) / k
text(xlim()[0], ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p, size=12);

# First aggregate the samples
sh0 = concatenate((sa, sb))

# Resample 10,000 times with replacement
k = 10000
sa_rand = zeros((n, k))
sb_rand = zeros((m, k))
for i in xrange(k):
    sa_rand[:, i] = sh0[randint(0, n + m, n)]
    sb_rand[:, i] = sh0[randint(0, n + m, m)]
    
# Compute the difference and plot as before
d = median(sa_rand, axis=0) - median(sb_rand, axis=0)

bar(*utils.pmf_hist(d, 100), color=colors[3])
title('Distribution of the differences between the means of samples sa and sb')
ylabel('Probability of occurrence')
xlabel('Difference (sa - sb)')

plot([d_empirical] * 2, ylim(), color=colors[1], linewidth=2.5)

p = sum(abs(d) > abs(d_empirical)) / k
text(xlim()[0], ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p);

from scipy.stats import ttest_ind
t, p = ttest_ind(sa, sb)
print "t = %.2f; p = %.3f" % (t, p)

n = 100
w = 0.6
s1 = randn(n)
s2 = w * s1 + (1 - w) * randn(n)

# First there's the longwinded arithmetic
# (Note this isn't all that longwinded since it's a vectorized calculationg)
r1 = mean(((s1 - s1.mean()) / s1.std()) * ((s2 - s2.mean()) / s2.std()))

# We can also use functions from the scipy.stats package
from scipy import stats
r2 = mean(stats.zscore(s1) * stats.zscore(s2))

# This package even offers a one-step function
# Note this returns a tuple (r, p)
r3, p3 = stats.pearsonr(s1, s2)

print "%.3f, %.3f, %.3f" % (r1, r2, r3)

# Note that scatter(s1, s2) is equivalent to plot(s1, s2, "o")
plot(s1, s2, "o")
xlabel("Values in variable 1")
ylabel("Values in variable 2")

# We can display the simulated and computed correlations
annot = "Correlations:\nSimulated: %.3f\nEstimated: %.3f" % (w, r1)
text(xlim()[0] + .2, ylim()[1] - 1, annot, size=12);

# Number of samples
k = 1000
r_dist = zeros(k)

# Do the bootstrap
for i in xrange(k):
    idx = randint(0, n, n)
    r_dist[i] = stats.pearsonr(s1[idx], s2[idx])[0]

# Compute the two-tailed 95% confidence intervals
ci = utils.percentiles(r_dist, [2.5, 97.5])

# Plot the correlation with error bars
bar(0.5, r1, width=1)
plot([1, 1], ci, color=colors[2], linewidth=3)
xlim(0, 2)
ylim(0, 1)
xticks(())
title("Estimated correlation coefficient with error")
ylabel("Correlation between s1, s2");

k = 1000
r_dist = zeros(k)
for i in xrange(k):
    s1_rand = s1[randint(0, n, n)]
    s2_rand = s2[randint(0, n, n)]
    r_dist[i] = stats.pearsonr(s1_rand, s2_rand)[0]

bar(*utils.pmf_hist(r_dist, 20))
title('Distribution of correlations between s1 and s2')
ylabel('Likelihood of occurrence')
xlabel('Pearson correlation coefficient')

# Now plot the empirical correlation value
axvline(r1, color=colors[3], linewidth=3)
p = sum(abs(r1) < abs(r_dist)) / k
text(0.2, ylim()[1] * .95, "$H_0$ is true with %.2f probability" % p);

# Say our acutal measured correlation was 0.4
r_empirical = 0.4

# Simulate two IID normal variables
k = 10000 # number of sims
n = 30 # sample size
s1_sim = randn(k, n)
s2_sim = randn(k, n)

# Compute the correlation coefficient for each sample
r_dist = array(map(stats.pearsonr, s1_sim, s2_sim))[:, 0] # Take first column (r values)

# Plot the distribution of coefficients obtained this way
bar(*utils.pmf_hist(r_dist, 100), color=colors[4])
title('Distribution of the Montecarlo-simulated correlation coefficients')
ylabel('Probability of occurrence')
xlabel('Corr(s1,s2)')

# Plot the observed correlation on this PMF
axvline(r_empirical, color=colors[0], linewidth=3)

# Obtain the probability of our observed value by counting how many
# r's from our randomization test were more extreme
p = sum(abs(r_empirical) > abs(r_dist)) / k
xpos = xlim()[0] * .95
ypos = ylim()[1] * .95
text(xpos, ypos, "$H_0$ is false with probability %.3f" % p, size=12);