#Welcome to Monetate for the March DataPhilly meetup
import numpy as np
import math
import time
from scipy import stats
#Introducing the normal distribution
a = np.random.randn(1000000)
a
array([-1.63750585, 0.74482092, -0.04815637, ..., -1.5315872 , -0.45943841, -0.25171966])
_ = hist(a, bins=100)
a.mean()
0.022361050204139612
np.std(a)
1.0021686012238014
#standard deviation = sqrt(variance)
#variance = mean of square minus square of mean
#mean of squares
mean_of_squares = a.dot(a)/len(a)
#square of mean
square_of_mean = (a.mean()**2)
variance = mean_of_squares - square_of_mean
print 'variance', variance
print 'standard deviation', math.sqrt(variance)
variance 1.00434190528 standard deviation 1.00216860122
len(a)
1000000
start = time.time()
a.dot(a)
print time.time()-start
0.00362396240234
start = time.time()
sum(i*i for i in a)
print time.time()-start
1.50220799446
414.52085526329444
a[:10]
array([ 1.54337579, -0.22298286, 1.09256106, 0.83276554, 0.04672008, -0.92547545, 1.47941986, 0.03762187, 0.18366413, -0.79001251])
sigma = 5
a*=sigma
print a[:10]
print 'mean', a.mean()
print 'std dev', np.std(a)
[-8.18752926 3.72410459 -0.24078185 -4.66832525 0.54021312 1.75742153 -5.98583905 0.86952862 -1.30490708 -0.79470159] mean 0.00556193052795 std dev 4.99427902019
_ = hist(a, bins=100)
mu = 100
a+= mu
print 'mean', a.mean()
print 'std dev', np.std(a)
mean 100.005561931 std dev 4.99427902019
_ = hist(a, bins=100)
def gen_norms(n, mu, sigma):
    """Draw `n` samples from a normal distribution with mean `mu` and std dev `sigma`.

    Scales/shifts standard normals: X = mu + sigma * Z, Z ~ N(0, 1).
    """
    return mu + sigma * np.random.randn(n)
_ = hist(gen_norms(10000, 200, 5), bins=10)
_ = hist(stats.norm.rvs(size=1000))
_ = hist(stats.norm.rvs(loc=10, size=1000))
_ = hist(stats.norm.rvs(scale=10, size=1000))
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))
#Introducing confidence interval
# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals):
    """Two-sided 95% confidence interval for the mean of `vals`.

    Normal approximation: mean +/- 1.96 * (std / sqrt(n)).
    NOTE: np.std uses the population std (ddof=0), matching the demo above.
    Returns a (low, high) tuple.
    """
    n = len(vals)
    center = vals.mean()
    std_err = np.std(vals) / math.sqrt(n)
    # 1.96 is the z-score for two-sided 95% coverage (alpha = 0.05)
    half_width = 1.96 * std_err
    return center - half_width, center + half_width
confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))
(97.502924102225123, 101.50639835800867)
mean = 100
wrong = 0
for i in range(100):
a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
if not (a<mean<b):
wrong += 1
print wrong
6
def confidence_wrongness():
    """Simulate 100 experiments; count how many 95% CIs miss the true mean.

    Each trial draws 100 samples from N(100, 10) and checks whether the
    computed confidence interval contains the true mean. With a 95% CI we
    expect roughly 5 misses per 100 trials.
    """
    true_mean = 100
    misses = 0
    for _ in range(100):
        lo, hi = confidence_interval(stats.norm.rvs(loc=true_mean, scale=10, size=100))
        if not lo < true_mean < hi:
            misses += 1
    return misses
hist([confidence_wrongness() for i in range(100)])
(array([ 8, 12, 9, 18, 19, 24, 5, 4, 0, 1]), array([ 1. , 2.2, 3.4, 4.6, 5.8, 7. , 8.2, 9.4, 10.6, 11.8, 13. ]), <a list of 10 Patch objects>)
#hooray! math works.
#Introducing one-way ANOVA
a = stats.norm.rvs(loc=5, scale=10, size=100)
b = stats.norm.rvs(loc=10, scale=20, size=100)
#The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
stats.f_oneway(a,b)
(2.4906409666614393, 0.11612135274608126)
# from http://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib
bins = np.linspace(-20, 30, 50)
pyplot.hist(a, bins, alpha=0.5)
pyplot.hist(b, bins, alpha=0.5)
pyplot.show()
print a.mean(), np.std(a)
print b.mean(), np.std(b)
4.64916410873 10.9514365695 8.30930618331 20.3117320034
#Introducing Linear Regression
_ = hist(stats.uniform.rvs(scale=10, size=1000))
#y = norm + x
y = stats.norm.rvs(loc=5, scale=1, size=1000)
x = stats.uniform.rvs(scale=10, size=1000)
print list(zip(x[:3],y[:3]))
y+=x
print list(zip(x[:3],y[:3]))
[(5.9080163574090774, 6.5572216419803731), (7.7466873643872871, 5.4299635923047918), (4.1182370314828578, 5.9690268176702057)] [(5.9080163574090774, 12.465237999389451), (7.7466873643872871, 13.176650956692079), (4.1182370314828578, 10.087263849153064)]
scatter(x,y)
<matplotlib.collections.PathCollection at 0x144fa30c>
slope, intercept, r_value, p_value, stderr = stats.linregress(x,y)
print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value
print 'p_value', p_value
print 'stderr', stderr
#slope : float
# slope of the regression line
#intercept : float
# intercept of the regression line
#r-value : float
# correlation coefficient
#p-value : float
# two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
#stderr : float
# Standard error of the estimate
slope 0.992016829072 intercept 5.05417778483 r_value 0.939887787031 p_value 0.0 stderr 0.0114089959349
#from http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
#plot the a 2d histogram
a = hist2d(x,y, bins=100) # a = supresses output
#plot the computed fit line
fit_xs = [0,10]
fit_ys = [fit_x*slope + intercept for fit_x in fit_xs]
plot(fit_xs,fit_ys, 'k',linewidth=2)
[<matplotlib.lines.Line2D at 0x1496932c>]
fit_xs = np.linspace(0, 10, 51)
fit_ys = fit_xs * slope + intercept
_ = hist2d(x,y, bins=100)
plot(fit_xs,fit_ys, 'k',linewidth=2)
print fit_xs[0],fit_ys[0]
print fit_xs[1],fit_ys[1]
print fit_xs[2],fit_ys[2]
0.0 5.05417778483 0.2 5.25258115064 0.4 5.45098451646
# http://engineering.monetate.com/2012/11/02/red-states-pinterest-blue-states-macs/