In []:
#Welcome to Monetate for the March DataPhilly meetup
In [460]:
import numpy as np
import math
import time
from scipy import stats
# plotting names used below (hist, scatter, plot, hist2d, pyplot) assume
# the notebook was launched in pylab mode, e.g. with %pylab inline
In []:
#Introducing the normal distribution
In [492]:
a = np.random.randn(1000000)
a
Out[492]:
array([-1.63750585,  0.74482092, -0.04815637, ..., -1.5315872 ,
       -0.45943841, -0.25171966])
In [493]:
_ = hist(a, bins=100)
In [477]:
a.mean()
Out[477]:
0.022361050204139612
In [478]:
np.std(a)
Out[478]:
1.0021686012238014
In [479]:
#standard deviation = sqrt(variance)
#variance = mean of squares minus square of mean

#mean of squares
mean_of_squares = a.dot(a)/len(a)

#square of mean
square_of_mean = (a.mean()**2)

variance = mean_of_squares - square_of_mean
print 'variance', variance

print 'standard deviation', math.sqrt(variance)
variance 1.00434190528
standard deviation 1.00216860122

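In []:
#Sanity check (a sketch, reusing the `a` defined above): NumPy computes
#the same population variance directly via ndarray.var(), so it should
#agree with the manual calculation up to floating-point error.
print 'variance (numpy)', a.var()
print 'std dev (numpy) ', a.std()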
In [484]:
len(a)
Out[484]:
1000000
In [485]:
start = time.time()
a.dot(a)
print time.time()-start
0.00362396240234

In [486]:
start = time.time()
sum(i*i for i in a)
print time.time()-start
1.50220799446

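In []:
#A middle ground between the two timings above (a sketch): squaring
#elementwise and summing with np.sum stays vectorized but allocates a
#temporary array, so expect it to land between dot() and the Python loop.
start = time.time()
np.sum(a*a)
print time.time()-start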
In [488]:
a[:10]
Out[488]:
array([ 1.54337579, -0.22298286,  1.09256106,  0.83276554,  0.04672008,
       -0.92547545,  1.47941986,  0.03762187,  0.18366413, -0.79001251])
In [494]:
sigma = 5

a*=sigma
print a[:10]
print 'mean', a.mean()
print 'std dev', np.std(a)
[-8.18752926  3.72410459 -0.24078185 -4.66832525  0.54021312  1.75742153
 -5.98583905  0.86952862 -1.30490708 -0.79470159]
mean 0.00556193052795
std dev 4.99427902019

In [495]:
_ = hist(a, bins=100)
In [496]:
mu = 100

a+= mu
print 'mean', a.mean()
print 'std dev', np.std(a)
mean 100.005561931
std dev 4.99427902019

In [497]:
_ = hist(a, bins=100)
In [498]:
def gen_norms(n, mu, sigma):
    return np.random.randn(n) * sigma + mu
In [500]:
_ = hist(gen_norms(10000, 200, 5), bins=10)
In [501]:
_ = hist(stats.norm.rvs(size=1000))
In [502]:
_ = hist(stats.norm.rvs(loc=10, size=1000))
In [503]:
_ = hist(stats.norm.rvs(scale=10, size=1000))
In [504]:
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))
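In []:
#loc and scale are just the location-scale transform from gen_norms above,
#so the two samplers should be statistically identical (a sketch comparing
#summary statistics on fresh draws; u and v are throwaway names):
u = stats.norm.rvs(loc=100, scale=10, size=100000)
v = gen_norms(100000, 100, 10)
print u.mean(), np.std(u)
print v.mean(), np.std(v)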
In []:
#Introducing confidence interval
In [544]:
# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals):
    n = len(vals)
    mean = vals.mean()
    sigma = np.std(vals)
    ssd = sigma/math.sqrt(n) # standard error of the mean
    z_score = 1.96 # two-tailed critical value for p=0.05 (95% confidence)
    return mean - z_score*ssd, mean+z_score*ssd


confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))
Out[544]:
(97.502924102225123, 101.50639835800867)
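In []:
#Where 1.96 comes from (a sketch): it is the two-tailed 95% critical value
#of the standard normal, i.e. the point leaving 2.5% in the upper tail.
print stats.norm.ppf(0.975)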
In [545]:
mean = 100
wrong = 0
for i in range(100):
    a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
    if not (a<mean<b):
        wrong += 1
print wrong
6

In [546]:
def confidence_wrongness():
    mean = 100
    wrong = 0
    for i in range(100):
        a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
        if not (a<mean<b):
            wrong += 1
    return wrong
In [547]:
hist([confidence_wrongness() for i in range(100)])
Out[547]:
(array([ 8, 12,  9, 18, 19, 24,  5,  4,  0,  1]),
 array([  1. ,   2.2,   3.4,   4.6,   5.8,   7. ,   8.2,   9.4,  10.6,
        11.8,  13. ]),
 <a list of 10 Patch objects>)
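In []:
#Each run misses with probability ~0.05, so the counts above should look
#roughly Binomial(100, 0.05) (a sketch of the expected mean and spread,
#for comparison with the histogram):
print 'expected mean   ', stats.binom.mean(100, 0.05)
print 'expected std dev', stats.binom.std(100, 0.05)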
In [435]:
#hooray! math works.
In []:
#Introducing one-way ANOVA
In [530]:
a = stats.norm.rvs(loc=5, scale=10, size=100)
b = stats.norm.rvs(loc=10, scale=20, size=100)

#The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
stats.f_oneway(a,b)
Out[530]:
(2.4906409666614393, 0.11612135274608126)
In [531]:
# from http://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib
bins = np.linspace(-20, 30, 50)

pyplot.hist(a, bins, alpha=0.5)
pyplot.hist(b, bins, alpha=0.5)
pyplot.show()
In [532]:
print a.mean(), np.std(a)
print b.mean(), np.std(b)
4.64916410873 10.9514365695
8.30930618331 20.3117320034

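In []:
#With exactly two groups, one-way ANOVA reduces to the two-sample t-test:
#F equals t squared and the p-values match (a sketch reusing a and b):
t, p = stats.ttest_ind(a, b)
print 't**2', t**2
print 'p   ', p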
In []:
#Introducing Linear Regression
In [394]:
_ = hist(stats.uniform.rvs(scale=10, size=1000))
In [539]:
#y = norm + x
y = stats.norm.rvs(loc=5, scale=1, size=1000)
x = stats.uniform.rvs(scale=10, size=1000)
print list(zip(x[:3],y[:3]))
y+=x
print list(zip(x[:3],y[:3]))
[(5.9080163574090774, 6.5572216419803731), (7.7466873643872871, 5.4299635923047918), (4.1182370314828578, 5.9690268176702057)]
[(5.9080163574090774, 12.465237999389451), (7.7466873643872871, 13.176650956692079), (4.1182370314828578, 10.087263849153064)]

In [540]:
scatter(x,y)
Out[540]:
<matplotlib.collections.PathCollection at 0x144fa30c>
In [541]:
slope, intercept, r_value, p_value, stderr = stats.linregress(x,y)

print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value
print 'p_value', p_value
print 'stderr', stderr

#slope : float
#    slope of the regression line
#intercept : float
#    intercept of the regression line
#r-value : float
#    correlation coefficient
#p-value : float
#    two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
#stderr : float
#    Standard error of the estimate
slope 0.992016829072
intercept 5.05417778483
r_value 0.939887787031
p_value 0.0
stderr 0.0114089959349

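In []:
#Cross-check (a sketch): np.polyfit with degree 1 fits the same
#least-squares line, so it should return the same slope and intercept.
print np.polyfit(x, y, 1)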
In [542]:
#from http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
#plot a 2d histogram
a = hist2d(x,y, bins=100) # assigning suppresses the Out[] echo

#plot the computed fit line
fit_xs = [0,10]
fit_ys = [fit_x*slope + intercept for fit_x in fit_xs]
plot(fit_xs,fit_ys, 'k',linewidth=2)
Out[542]:
[<matplotlib.lines.Line2D at 0x1496932c>]
In [543]:
fit_xs = np.linspace(0, 10, 51)

fit_ys = fit_xs * slope + intercept
_ = hist2d(x,y, bins=100)

plot(fit_xs,fit_ys, 'k',linewidth=2)
print fit_xs[0],fit_ys[0]
print fit_xs[1],fit_ys[1]
print fit_xs[2],fit_ys[2]
0.0 5.05417778483
0.2 5.25258115064
0.4 5.45098451646

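In []:
#r_value squared is the usual R^2: the fraction of y's variance explained
#by the fitted line (a sketch reusing r_value from linregress above).
print 'R^2', r_value**2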
In []:
# http://engineering.monetate.com/2012/11/02/red-states-pinterest-blue-states-macs/