#Welcome to Monetate for the March DataPhilly meetup

import numpy as np
import math
import time
from scipy import stats 

#Introducing the normal distribution

# Draw one million samples from the standard normal distribution
# (np.random.randn samples have mean 0 and variance 1).
a = np.random.randn(1000000)
# Bare expression: presumably displayed the array in the interactive session
# this was exported from; it has no effect when run as a plain script.
a

# Histogram of the samples using 100 bins; assigning to _ discards the return value.
# NOTE(review): `hist` is not imported in this file -- presumably provided by an
# IPython/pylab session; confirm before running as a standalone script.
_ = hist(a, bins=100)

a.mean()

np.std(a)

#standard deviation = sqrt(variance)
#variance = mean of squares minus square of mean

#mean of squares
# a.dot(a) is the sum of the squared samples; dividing by len(a) gives their mean.
mean_of_squares = a.dot(a)/len(a)

#square of mean
square_of_mean = (a.mean()**2)

variance = mean_of_squares - square_of_mean
print 'variance', variance

# Should agree with np.std(a) computed above.
print 'standard deviation', math.sqrt(variance)

len(a)

# Time the sum of squares computed with NumPy's dot product.
start = time.time()
a.dot(a)
print time.time()-start

# Time the same sum of squares computed element by element with a Python
# generator expression, for comparison with the dot product timing above.
start = time.time()
sum(i*i for i in a)
print time.time()-start


# First ten samples before any transformation.
a[:10]

sigma = 5

# Scale the samples in place; the printed std dev should now be close to sigma.
a*=sigma
print a[:10]
print 'mean', a.mean()
print 'std dev', np.std(a)

_ = hist(a, bins=100)

mu = 100

# Shift the samples in place; the printed mean should now be close to mu while
# the std dev is unchanged by the shift.
a+= mu
print 'mean', a.mean()
print 'std dev', np.std(a)

_ = hist(a, bins=100)

def gen_norms(n, mu, sigma):
    """Return n normally distributed samples with mean mu and std dev sigma.

    Standard-normal draws from np.random.randn are scaled by sigma and then
    shifted by mu.
    """
    standard_draws = np.random.randn(n)
    scaled_draws = standard_draws * sigma
    return scaled_draws + mu

# Histogram (10 bins) of 10000 samples from gen_norms with mu=200, sigma=5.
_ = hist(gen_norms(10000, 200, 5), bins=10)

# The same kind of samples drawn through scipy.stats.norm.rvs instead.
# With no loc/scale given, scipy uses loc=0 and scale=1 (standard normal).
_ = hist(stats.norm.rvs(size=1000))

# loc sets the mean of the distribution.
_ = hist(stats.norm.rvs(loc=10, size=1000))

# scale sets the standard deviation of the distribution.
_ = hist(stats.norm.rvs(scale=10, size=1000))

# Both together: mean 100, standard deviation 10.
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))

#Introducing confidence interval

# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals, z_score=1.96):
    """Return a (lower, upper) confidence interval for the mean of vals.

    The interval is mean +/- z_score * sigma / sqrt(n), where sigma is the
    standard deviation of vals as computed by np.std with its default ddof=0
    and n is the number of values.

    vals    : non-empty array-like of numbers (lists and tuples are accepted
              as well as numpy arrays)
    z_score : critical value of the standard normal distribution; the
              default 1.96 gives a two-sided 95% interval (p = 0.05)

    Raises ValueError if vals is empty.
    """
    # Convert up front so plain Python sequences work, not just ndarrays.
    vals = np.asarray(vals, dtype=float)
    n = vals.size
    if n == 0:
        # Without this check the result would be (nan, nan) plus runtime warnings.
        raise ValueError('confidence_interval requires at least one value')
    mean = vals.mean()
    sigma = np.std(vals)
    # standard error of the mean
    ssd = sigma / math.sqrt(n)
    margin = z_score * ssd
    return mean - margin, mean + margin


# One interval computed from 100 draws of a normal with mean 100 and std dev 10.
confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))


# Repeat the experiment 100 times and count how often the computed interval
# fails to contain the true mean.
# NOTE: the loop rebinds the module-level names `a` and `b` to the interval
# bounds, replacing the sample array previously stored in `a`.
mean = 100
wrong = 0
for i in range(100):
    a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
    if not (a<mean<b):
        wrong += 1
print wrong

def confidence_wrongness(mean=100, scale=10, size=100, trials=100):
    """Count how many confidence intervals fail to contain the true mean.

    For each trial, `size` samples are drawn from a normal distribution with
    the given mean and scale (standard deviation), an interval is computed
    with confidence_interval, and the trial counts as wrong when the true
    mean is not strictly inside that interval.

    mean   : true mean of the sampled distribution
    scale  : standard deviation of the sampled distribution
    size   : number of samples drawn per trial
    trials : number of intervals to compute

    Returns the number of trials whose interval missed the mean. The
    defaults reproduce the original fixed experiment (100 trials of 100
    draws from a normal with mean 100 and std dev 10).
    """
    wrong = 0
    for _ in range(trials):
        sample = stats.norm.rvs(loc=mean, scale=scale, size=size)
        lower, upper = confidence_interval(sample)
        if not (lower < mean < upper):
            wrong += 1
    return wrong

# Histogram of the miss count over 100 independent runs of confidence_wrongness().
hist([confidence_wrongness() for i in range(100)])

#hooray! math works.

#Introducing one-way ANOVA

# Two samples of 100 draws each, with different means (5 vs 10) and different
# standard deviations (10 vs 20).
a = stats.norm.rvs(loc=5, scale=10, size=100)
b = stats.norm.rvs(loc=10, scale=20, size=100)

#The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
stats.f_oneway(a,b)

# from http://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib
# Shared bin edges for both histograms: 50 evenly spaced points over [-20, 30].
bins = np.linspace(-20, 30, 50)

# Overlay the two histograms; alpha=0.5 makes them semi-transparent.
# NOTE(review): `pyplot` is not imported in this file -- presumably provided by
# an IPython/pylab session; confirm before running as a standalone script.
pyplot.hist(a, bins, alpha=0.5)
pyplot.hist(b, bins, alpha=0.5)
pyplot.show()

# Sample mean and standard deviation of each group.
print a.mean(), np.std(a)
print b.mean(), np.std(b)

#Introducing Linear Regression

# Histogram of 1000 uniform draws; with scale=10 and the default loc=0 the
# samples lie in [0, 10].
_ = hist(stats.uniform.rvs(scale=10, size=1000))

#y = norm + x
# y starts as normal noise around 5; x is uniform on [0, 10].
y = stats.norm.rvs(loc=5, scale=1, size=1000)
x = stats.uniform.rvs(scale=10, size=1000)
print list(zip(x[:3],y[:3]))
# Add x to y in place, so that y = x + noise (expected slope near 1 and
# intercept near 5).
y+=x
print list(zip(x[:3],y[:3]))

# NOTE(review): `scatter` is not imported in this file -- presumably provided
# by an IPython/pylab session; confirm.
scatter(x,y)

# Least-squares linear regression of y on x.
slope, intercept, r_value, p_value, stderr = stats.linregress(x,y)

print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value
print 'p_value', p_value
print 'stderr', stderr

#slope : float
#    slope of the regression line
#intercept : float
#    intercept of the regression line
#r-value : float
#    correlation coefficient
#p-value : float
#    two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
#stderr : float
#    Standard error of the estimated slope

#from http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
#plot a 2d histogram
# NOTE: this rebinds the module-level name `a` to the return value of hist2d.
a = hist2d(x,y, bins=100) # a = suppresses output

#plot the computed fit line
# Two end points are enough to draw a straight line.
fit_xs = [0,10]
fit_ys = [fit_x*slope + intercept for fit_x in fit_xs]
plot(fit_xs,fit_ys, 'k',linewidth=2)

# Same fit line again, this time evaluated with a vectorized NumPy expression
# at 51 evenly spaced x values from 0 to 10.
fit_xs = np.linspace(0, 10, 51)

fit_ys = fit_xs * slope + intercept
_ = hist2d(x,y, bins=100)

plot(fit_xs,fit_ys, 'k',linewidth=2)
# First three points on the fit line.
print fit_xs[0],fit_ys[0]
print fit_xs[1],fit_ys[1]
print fit_xs[2],fit_ys[2]

# http://engineering.monetate.com/2012/11/02/red-states-pinterest-blue-states-macs/