# Welcome to Monetate for the March DataPhilly meetup
#
# Walk-through script: properties of the normal distribution, then
# confidence intervals for a sample mean.
from __future__ import print_function

import math
import time

import numpy as np
from scipy import stats

# NOTE(review): `hist` is called throughout but is neither defined nor
# imported in the visible source -- presumably it is supplied by the
# interactive plotting namespace this was written in; confirm before running
# this as a standalone script.
# Bare expressions such as `a` or `a.mean()` only display a value in an
# interactive session; they have no effect when run as a script.


# Introducing the normal distribution
a = np.random.randn(1000000)
a
_ = hist(a, bins=100)
a.mean()
np.std(a)

# standard deviation = sqrt(variance)
# variance = mean of squares minus square of mean

# mean of squares
mean_of_squares = a.dot(a) / len(a)

# square of mean
square_of_mean = (a.mean() ** 2)

variance = mean_of_squares - square_of_mean
print('variance', variance)
print('standard deviation', math.sqrt(variance))

len(a)

# Time the vectorized dot product ...
start = time.time()
a.dot(a)
print(time.time() - start)

# ... against the equivalent element-by-element Python loop.
start = time.time()
sum(i * i for i in a)
print(time.time() - start)

a[:10]

# Multiply the samples by sigma in place and inspect the result.
sigma = 5
a *= sigma
print(a[:10])
print('mean', a.mean())
print('std dev', np.std(a))
_ = hist(a, bins=100)

# Add mu to the samples in place and inspect the result.
mu = 100
a += mu
print('mean', a.mean())
print('std dev', np.std(a))
_ = hist(a, bins=100)


def gen_norms(n, mu, sigma):
    """Return ``n`` standard-normal draws scaled by ``sigma`` and shifted by ``mu``.

    Args:
        n: number of samples to draw.
        mu: value added to every scaled draw.
        sigma: factor every standard-normal draw is multiplied by.

    Returns:
        The result of ``np.random.randn(n) * sigma + mu``.
    """
    return np.random.randn(n) * sigma + mu


_ = hist(gen_norms(10000, 200, 5), bins=10)

# The same kind of samples via scipy: `loc` and `scale` are passed to
# stats.norm.rvs instead of shifting/scaling by hand.
_ = hist(stats.norm.rvs(size=1000))
_ = hist(stats.norm.rvs(loc=10, size=1000))
_ = hist(stats.norm.rvs(scale=10, size=1000))
_ = hist(stats.norm.rvs(loc=100, scale=10, size=1000))


# Introducing confidence interval
# http://en.wikipedia.org/wiki/Confidence_interval
def confidence_interval(vals, z_score=1.96):
    """Return ``(low, high)`` bounds centred on the mean of ``vals``.

    The half-width is ``z_score * np.std(vals) / sqrt(len(vals))``.

    Args:
        vals: sample values; must support ``len()`` and ``.mean()`` and be
            accepted by ``np.std``.
        z_score: multiplier applied to the standard error. The default 1.96
            is the two-sided normal critical value for p = 0.05 (95%).

    Returns:
        Tuple ``(mean - z_score * ssd, mean + z_score * ssd)``.
    """
    n = len(vals)
    mean = vals.mean()
    # NOTE(review): np.std is called with its default arguments here; check
    # whether the population or sample standard deviation is intended.
    sigma = np.std(vals)
    ssd = sigma / math.sqrt(n)  # standard error of the mean
    return mean - z_score * ssd, mean + z_score * ssd


confidence_interval(stats.norm.rvs(loc=100, scale=10, size=100))

# Repeat the experiment 100 times against a known mean.
mean = 100
wrong = 0
for i in range(100):
    a, b = confidence_interval(stats.norm.rvs(loc=mean, scale=10, size=100))
    # NOTE(review): the visible source is cut off mid-statement at this point.
    # The original fragment is preserved verbatim on the next line and must be
    # completed from the full file rather than guessed:
    # if not (a