# Set matplotlib to plot in the notebook
%pylab inline

from __future__ import division
import utils
import seaborn
seaborn.set()
colors = seaborn.color_palette()

# Start by generating a dataset and calculating some statistics
d = randn(100)
m = d.mean()
s = d.std()

# Now plot a histogram of the data with 15 bins
hist(d, 15)

m_y = 2

# Add a plot of the mean and standard deviation range
plot(m, m_y, "ko")
plot([m - s, m + s], [m_y] * 2, "k--");

# Now let's find the medium and interquartile range
med = median(d)
iqr = utils.percentiles(d, [25, 75])

# Plot the data with this summary
hist(d, 15)
plot(med, m_y, "ko")
plot(iqr, [m_y] * 2, "k--");

# Let's define a function to streamline the central tendency and error bar plots from above
def ctend_plot(point, ci, y, color, label):
    plot(ci, [y, y], "-", color=color, linewidth=4, label=label)
    plot(point, y, "o", color=color, markersize=10)

hist(d, 15)
ctend_plot(m, [m - s, m + s], m_y, colors[1], "std dev")

ci = utils.percentiles(d, [16, 84])
ctend_plot(med, ci,  m_y - 1, colors[2], "68% CI")
legend();

d1 = concatenate([d, [40, 41, 45]])
m1 = d1.mean()
s1 = d1.std()
med1 = median(d1)
ci1 = percentile(d1, [16, 84])

hist(d1, 15)
# Find some space near the top of the plot
m_y = histogram(d1, 15)[0].max() + 5
ctend_plot(m1, [m1 - s1, m1 + s1], m_y, colors[1], "std dev")
ctend_plot(med1, ci1, m_y - 2.5, colors[2], "68% CI")
legend();

# First create a vector of x values spanning the support of the histogram
x = linspace(d.min(), d.max(), 1000)

# Let's define a function to calculate the gaussian function for a given x, mean, and std
gauss = lambda x, m, s: (1 / (s * sqrt(2 * pi)) * exp(-0.5 * ((x - m) / s) ** 2))

# We could iterate through the x values and calculate a y value for each using this equation
y0 = zeros_like(x)
for i, x_i in enumerate(x):
    y0[i] = gauss(x_i, m, s)

# You can also do it in one step with a vectorized computation.
# In general, avoid for loops, as vectorized expressions are much faster.
y = gauss(x, m, s)

# Use a normed histogram to match the range of the probaiblity density function
hist(d, 15, normed=True)
plot(x, y);

# Calculate the height of the gaussian for the outlier distribution and plot
x1 = linspace(d1.min(), d1.max(), 10000)
y1 = gauss(x1, m1, s1)
hist(d1, 15, normed=True)
plot(x1, y1);

from scipy import stats
x = linspace(-4, 4, 10001)
hist(stats.norm(0, 1).rvs(1000), 30, normed=True, alpha=.6)
plot(x, stats.norm(0, 1).pdf(x), linewidth=6)
plot(x, gauss(x, 0, 1), linewidth=3);

s = 10000
N = randn(s)

sample_size = 100
n = N[randint(0, s, sample_size)]

# Get the standard error of the mean and 95% CI for the sample
sem1 = n.std() / sqrt(sample_size)
m1 = n.mean()
ci1 = m1 - 2 * sem1, m1 + 2 * sem1

n_samples = 10000
mu = array([N[randint(0, s, sample_size)].mean() for i in xrange(n_samples)])
sem2 = mu.std()
ci2 = utils.percentiles(mu, [2.5, 97.5])

hist(n)
ctend_plot(N.mean(), ci2, 3, colors[4], "poplation mean")
ctend_plot(n.mean(), ci1, 2, colors[2], "sample mean")
legend(loc="best");

sample_size = 20
s = 10000
n = N[randint(0, s, sample_size)]

n_boots = 10000

# We can bootstrap the two different statistics
me = zeros(n_boots)
mn = zeros(n_boots)
for i in xrange(n_boots):
    sample = n[randint(0, sample_size, sample_size)]
    me[i] = median(sample)
    mn[i] = mean(sample)

# Compute the median and its confidence interval
med = median(me)
ci95med = utils.percentiles(me, [2.5, 97.5])

# Do the same for the mean
xbar = mean(mn)
ci95mean = utils.percentiles(mn, [2.5, 97.5])

# Define a function to streamline the central tendency and error bar plots from above
def ctend_plot(point, ci, y, color, label):
    plot(ci, [y, y], "-", color=color, linewidth=4, label=label)
    plot(point, y, "o", color=color, markersize=10)

hist(me, 20, alpha=0.5, normed=True)
hist(mn, 20, alpha=0.5, normed=True)
plot_height = ylim()[1]
ctend_plot(med, ci95med, plot_height * .75, colors[0], "median")
ctend_plot(xbar, ci95mean, plot_height * .70, colors[1], "mean")
legend(loc="best");