In []:
#Welcome to Monetate for the March DataPhilly meetup

In [460]:
import numpy as np
import math
import time
from scipy import stats

In []:
#Introducing the normal distribution

In [492]:
a = np.random.randn(1000000)
a

Out[492]:
array([-1.63750585,  0.74482092, -0.04815637, ..., -1.5315872 ,
-0.45943841, -0.25171966])

In [493]:
_ = hist(a, bins=100)

In [477]:
a.mean()

Out[477]:
0.022361050204139612

In [478]:
np.std(a)

Out[478]:
1.0021686012238014

In [479]:
# Recompute the standard deviation of `a` by hand, to check np.std:
#   standard deviation = sqrt(variance)
#   variance = mean of squares minus square of mean  (E[X^2] - E[X]^2)

# mean of squares: a.dot(a) is the sum of squares of a's elements
mean_of_squares = a.dot(a)/len(a)

# square of mean
square_of_mean = (a.mean()**2)

variance = mean_of_squares - square_of_mean
print 'variance', variance

# should agree with the np.std(a) value computed earlier
print 'standard deviation', math.sqrt(variance)

variance 1.00434190528
standard deviation 1.00216860122


In [484]:
len(a)

Out[484]:
1000000

In [485]:
# Time the vectorized sum of squares (a.dot(a)) over the 1M-element array.
start = time.time()
a.dot(a)
print time.time()-start

0.00362396240234


In [486]:
# The same sum of squares in a pure-Python generator expression --
# orders of magnitude slower than the vectorized a.dot(a) timed above.
start = time.time()
sum(i*i for i in a)
print time.time()-start

1.50220799446


In [487]:


Out[487]:
414.52085526329444

In [488]:
a[:10]

Out[488]:
array([ 1.54337579, -0.22298286,  1.09256106,  0.83276554,  0.04672008,
-0.92547545,  1.47941986,  0.03762187,  0.18366413, -0.79001251])

In [494]:
# Scale the standard-normal samples: multiplying by sigma makes the
# standard deviation ~sigma while leaving the mean ~0.
# NOTE: `a *= sigma` mutates `a` in place, so re-running this cell
# rescales the data again.
sigma = 5

a*=sigma
print a[:10]
print 'mean', a.mean()
print 'std dev', np.std(a)

[-8.18752926  3.72410459 -0.24078185 -4.66832525  0.54021312  1.75742153
-5.98583905  0.86952862 -1.30490708 -0.79470159]
mean 0.00556193052795
std dev 4.99427902019


In [495]:
_ = hist(a, bins=100)

In [496]:
# Shift the samples: adding mu moves the mean to ~mu without changing
# the standard deviation. (In-place mutation, like the scaling cell.)
mu = 100

a+= mu
print 'mean', a.mean()
print 'std dev', np.std(a)

mean 100.005561931
std dev 4.99427902019


In [497]:
_ = hist(a, bins=100)

In [498]:
def gen_norms(n, mu, sigma):
    """Draw n samples from a normal distribution with mean mu and std dev sigma."""
    standard_samples = np.random.randn(n)
    return mu + sigma * standard_samples

In [500]:
_ = hist(gen_norms(10000, 200, 5), bins=10)

In [501]:
_ = hist(stats.norm.rvs(size=1000))

In [502]:
_ = hist(stats.norm.rvs(loc=10, size=1000))

In [503]:
_ = hist(stats.norm.rvs(scale=10, size=1000))

In [504]:
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))

In []:
#Introducing confidence interval

In [544]:
# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals, z_score=1.96):
    """Return the (lower, upper) normal confidence interval for the mean of vals.

    Computes mean +/- z_score * sigma / sqrt(n).  The default z_score of
    1.96 gives a two-sided 95% interval (p=0.05).  [Original comment said
    "p=-0.05", a typo.]

    NOTE(review): np.std(vals) is the population std dev (ddof=0); for
    small samples the sample std dev (ddof=1) or a t-interval may be
    preferable -- fine here since it is applied to samples of size 100.
    """
    n = len(vals)
    mean = vals.mean()
    sigma = np.std(vals)
    ssd = sigma / math.sqrt(n)  # standard error of the mean
    return mean - z_score * ssd, mean + z_score * ssd

confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))

Out[544]:
(97.502924102225123, 101.50639835800867)

In [545]:
# Empirically check the 95% interval: out of 100 repetitions, roughly
# 5 intervals should fail to contain the true mean.
# (Body indentation was stripped by the notebook export.)
mean = 100
wrong = 0
for i in range(100):
a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
if not (a<mean<b):
wrong += 1
print wrong

6


In [546]:
def confidence_wrongness(mean=100, scale=10, sample_size=100, trials=100):
    """Count how many of `trials` confidence intervals miss the true mean.

    Draws `trials` independent samples of `sample_size` normal variates
    (true mean `mean`, std dev `scale`) and counts the intervals from
    confidence_interval() that fail to contain the true mean.  With a
    95% interval we expect roughly 5 misses per 100 trials.

    Defaults reproduce the original hard-coded behavior.
    """
    wrong = 0
    for _ in range(trials):
        lower, upper = confidence_interval(
            stats.norm.rvs(loc=mean, scale=scale, size=sample_size))
        if not (lower < mean < upper):
            wrong += 1
    return wrong

In [547]:
hist([confidence_wrongness() for i in range(100)])

Out[547]:
(array([ 8, 12,  9, 18, 19, 24,  5,  4,  0,  1]),
array([  1. ,   2.2,   3.4,   4.6,   5.8,   7. ,   8.2,   9.4,  10.6,
11.8,  13. ]),
<a list of 10 Patch objects>)

In [435]:
#hooray! math works.

In []:
#Introducing one-way ANOVA

In [530]:
# Two samples with different true means (5 vs 10) and spreads (10 vs 20).
a = stats.norm.rvs(loc=5, scale=10, size=100)
b = stats.norm.rvs(loc=10, scale=20, size=100)

#The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
# Returns (F statistic, p-value).
stats.f_oneway(a,b)

Out[530]:
(2.4906409666614393, 0.11612135274608126)

In [531]:
# from http://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib
# Overlay the two samples' histograms on shared bins (alpha makes overlap visible).
# NOTE(review): `pyplot` is presumably in scope via IPython's pylab mode;
# otherwise this cell needs `from matplotlib import pyplot`.
bins = np.linspace(-20, 30, 50)

pyplot.hist(a, bins, alpha=0.5)
pyplot.hist(b, bins, alpha=0.5)
pyplot.show()

In [532]:
# Sample means/std devs -- close to the true parameters (5, 10) and (10, 20).
print a.mean(), np.std(a)
print b.mean(), np.std(b)

4.64916410873 10.9514365695
8.30930618331 20.3117320034


In []:
#Introducing Linear Regression

In [394]:
_ = hist(stats.uniform.rvs(scale=10, size=1000))

In [539]:
#y = norm + x: normal noise (mean 5, std 1) around the line y = x,
# with x drawn uniformly from [0, 10).
y = stats.norm.rvs(loc=5, scale=1, size=1000)
x = stats.uniform.rvs(scale=10, size=1000)
print list(zip(x[:3],y[:3]))
# add the linear signal in place -- y now depends on x
y+=x
print list(zip(x[:3],y[:3]))

[(5.9080163574090774, 6.5572216419803731), (7.7466873643872871, 5.4299635923047918), (4.1182370314828578, 5.9690268176702057)]
[(5.9080163574090774, 12.465237999389451), (7.7466873643872871, 13.176650956692079), (4.1182370314828578, 10.087263849153064)]


In [540]:
scatter(x,y)

Out[540]:
<matplotlib.collections.PathCollection at 0x144fa30c>

In [541]:
# Least-squares fit of y = slope*x + intercept; given the data
# construction above we expect slope ~1 and intercept ~5.
slope, intercept, r_value, p_value, stderr = stats.linregress(x,y)

print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value
print 'p_value', p_value
print 'stderr', stderr

#slope : float
#    slope of the regression line
#intercept : float
#    intercept of the regression line
#r-value : float
#    correlation coefficient
#p-value : float
#    two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
#stderr : float
#    Standard error of the estimate

slope 0.992016829072
intercept 5.05417778483
r_value 0.939887787031
p_value 0.0
stderr 0.0114089959349


In [542]:
#from http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
#plot a 2d histogram (heatmap) of the scatter data
# NOTE: this rebinds `a` (previously the ANOVA sample) to the hist2d result.
a = hist2d(x,y, bins=100) # assigning to a variable suppresses the cell's text output

#plot the computed fit line across the x range; two endpoints suffice for a line
fit_xs = [0,10]
fit_ys = [fit_x*slope + intercept for fit_x in fit_xs]
plot(fit_xs,fit_ys, 'k',linewidth=2)

Out[542]:
[<matplotlib.lines.Line2D at 0x1496932c>]

In [543]:
# Same fit line, computed vectorized over 51 evenly spaced x values
# instead of a list comprehension over two endpoints.
fit_xs = np.linspace(0, 10, 51)

fit_ys = fit_xs * slope + intercept
_ = hist2d(x,y, bins=100)

plot(fit_xs,fit_ys, 'k',linewidth=2)
print fit_xs[0],fit_ys[0]
print fit_xs[1],fit_ys[1]
print fit_xs[2],fit_ys[2]

0.0 5.05417778483
0.2 5.25258115064
0.4 5.45098451646


In []:
# http://engineering.monetate.com/2012/11/02/red-states-pinterest-blue-states-macs/