#Welcome to Monetate for the March DataPhilly meetup
import numpy as np
import math
import time
from scipy import stats
#Introducing the normal distribution
a = np.random.randn(1000000)
a
array([-1.63750585, 0.74482092, -0.04815637, ..., -1.5315872 , -0.45943841, -0.25171966])
_ = hist(a, bins=100)
a.mean()
0.022361050204139612
np.std(a)
1.0021686012238014
#standard deviation = sqrt(variance)
#variance = mean of square minus square of mean
#mean of squares
mean_of_squares = a.dot(a)/len(a)
#square of mean
square_of_mean = (a.mean()**2)
variance = mean_of_squares - square_of_mean
print 'variance', variance
print 'standard deviation', math.sqrt(variance)
variance 1.00434190528 standard deviation 1.00216860122
len(a)
1000000
start = time.time()
a.dot(a)
print time.time()-start
0.00362396240234
start = time.time()
sum(i*i for i in a)
print time.time()-start
1.50220799446
414.52085526329444
a[:10]
array([ 1.54337579, -0.22298286, 1.09256106, 0.83276554, 0.04672008, -0.92547545, 1.47941986, 0.03762187, 0.18366413, -0.79001251])
sigma = 5
a*=sigma
print a[:10]
print 'mean', a.mean()
print 'std dev', np.std(a)
[-8.18752926 3.72410459 -0.24078185 -4.66832525 0.54021312 1.75742153 -5.98583905 0.86952862 -1.30490708 -0.79470159] mean 0.00556193052795 std dev 4.99427902019
_ = hist(a, bins=100)
mu = 100
a+= mu
print 'mean', a.mean()
print 'std dev', np.std(a)
mean 100.005561931 std dev 4.99427902019
_ = hist(a, bins=100)
def gen_norms(n, mu, sigma):
    """Draw `n` samples from a normal distribution with mean `mu` and std dev `sigma`.

    Scales/shifts standard normals: X = mu + sigma * Z, Z ~ N(0, 1).
    """
    return mu + sigma * np.random.randn(n)
_ = hist(gen_norms(10000, 200, 5), bins=10)
_ = hist(stats.norm.rvs(size=1000))
_ = hist(stats.norm.rvs(loc=10, size=1000))
_ = hist(stats.norm.rvs(scale=10, size=1000))
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))
#Introducing confidence interval
# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals):
    """Two-sided 95% confidence interval for the mean of `vals`.

    Normal approximation: mean +/- 1.96 * (std / sqrt(n)).
    NOTE: np.std uses the population std (ddof=0), matching the demo above.
    Returns a (low, high) tuple.
    """
    n = len(vals)
    center = vals.mean()
    std_err = np.std(vals) / math.sqrt(n)
    # 1.96 is the z-score for two-sided 95% coverage (alpha = 0.05)
    half_width = 1.96 * std_err
    return center - half_width, center + half_width
confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))
(97.502924102225123, 101.50639835800867)
mean = 100
wrong = 0
for i in range(100):
a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
if not (a<mean<b):
wrong += 1
print wrong
6
def confidence_wrongness():
    """Simulate 100 experiments; count how many 95% CIs miss the true mean.

    Each trial draws 100 samples from N(100, 10) and checks whether the
    computed confidence interval contains the true mean. With a 95% CI we
    expect roughly 5 misses per 100 trials.
    """
    true_mean = 100
    misses = 0
    for _ in range(100):
        lo, hi = confidence_interval(stats.norm.rvs(loc=true_mean, scale=10, size=100))
        if not lo < true_mean < hi:
            misses += 1
    return misses
hist([confidence_wrongness() for i in range(100)])
(array([ 8, 12, 9, 18, 19, 24, 5, 4, 0, 1]), array([ 1. , 2.2, 3.4, 4.6, 5.8, 7. , 8.2, 9.4, 10.6, 11.8, 13. ]), <a list of 10 Patch objects>)
#hooray! math works.
#Introducing one-way ANOVA
a = stats.norm.rvs(loc=5, scale=10, size=100)
b = stats.norm.rvs(loc=10, scale=20, size=100)
#The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
stats.f_oneway(a,b)
(2.4906409666614393, 0.11612135274608126)
# from http://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib
bins = np.linspace(-20, 30, 50)
pyplot.hist(a, bins, alpha=0.5)
pyplot.hist(b, bins, alpha=0.5)
pyplot.show()
print a.mean(), np.std(a)
print b.mean(), np.std(b)
4.64916410873 10.9514365695 8.30930618331 20.3117320034
#Introducing Linear Regression
_ = hist(stats.uniform.rvs(scale=10, size=1000))
#y = norm + x
y = stats.norm.rvs(loc=5, scale=1, size=1000)
x = stats.uniform.rvs(scale=10, size=1000)
print list(zip(x[:3],y[:3]))
y+=x
print list(zip(x[:3],y[:3]))
[(5.9080163574090774, 6.5572216419803731), (7.7466873643872871, 5.4299635923047918), (4.1182370314828578, 5.9690268176702057)] [(5.9080163574090774, 12.465237999389451), (7.7466873643872871, 13.176650956692079), (4.1182370314828578, 10.087263849153064)]
scatter(x,y)
<matplotlib.collections.PathCollection at 0x144fa30c>
slope, intercept, r_value, p_value, stderr = stats.linregress(x,y)
print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value
print 'p_value', p_value
print 'stderr', stderr
#slope : float
# slope of the regression line
#intercept : float
# intercept of the regression line
#r-value : float
# correlation coefficient
#p-value : float
# two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero.
#stderr : float
# Standard error of the estimate
slope 0.992016829072 intercept 5.05417778483 r_value 0.939887787031 p_value 0.0 stderr 0.0114089959349
#from http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
#plot the a 2d histogram
a = hist2d(x,y, bins=100) # a = supresses output
#plot the computed fit line
fit_xs = [0,10]
fit_ys = [fit_x*slope + intercept for fit_x in fit_xs]
plot(fit_xs,fit_ys, 'k',linewidth=2)
[<matplotlib.lines.Line2D at 0x1496932c>]
fit_xs = np.linspace(0, 10, 51)
fit_ys = fit_xs * slope + intercept
_ = hist2d(x,y, bins=100)
plot(fit_xs,fit_ys, 'k',linewidth=2)
print fit_xs[0],fit_ys[0]
print fit_xs[1],fit_ys[1]
print fit_xs[2],fit_ys[2]
0.0 5.05417778483 0.2 5.25258115064 0.4 5.45098451646
# http://engineering.monetate.com/2012/11/02/red-states-pinterest-blue-states-macs/