# special IPython command to prepare the notebook for matplotlib %matplotlib inline import numpy as np import scipy.stats as stats import matplotlib.pyplot as plt import pandas as pd import numpy as np # special matplotlib argument for improved plots from matplotlib import rcParams #colorbrewer2 Dark2 qualitative color table dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'white' rcParams['patch.facecolor'] = dark2_colors[0] rcParams['font.family'] = 'StixGeneral' # Let's look at the help file for the stats.binom function # stats.binom? n = 10 p = 0.3 k = np.arange(0,11) # plot all values between 0 and 10 y = stats.binom.pmf(k, n, p) print y plt.plot(k, y, 'o-') plt.title('Binomial: n=%i , p=%.2f' % (n,p),fontsize=15) plt.xlabel('Number of successes') plt.ylabel('Probability of successes', fontsize=15) plt.show() # your turn # Try changing p to p = 0.6. # Try plotting values values larger than 10. # stats.binom? data = stats.binom.rvs(n = 10, p = 0.3, size = 10000) print "Mean: %g" % np.mean(data) print "SD: %g" % np.std(data, ddof=1) plt.figure() plt.hist(data, bins = 10, normed = True) plt.xlabel("x") plt.ylabel("density") plt.show() # Let's look at the help file for the stats.binom function # stats.poisson? lam = 2 # rate parameter lambda n = np.arange(0, 11) y = stats.poisson.pmf(n, lam) print y plt.plot(n, y, 'o-') plt.title('Poisson: $\lambda$ =%i' % lam) plt.xlabel('Number of accidents') plt.ylabel('Probability of number of accidents') plt.show() # your turn # try changing the rate parameters to 1. # Try changing the rate parameter to 8. # stats.poisson? data = stats.poisson.rvs(mu = 2, loc = 0, size = 1000) print "Mean: %g" % np.mean(data) print "SD: %g" % np.std(data, ddof=1) plt.figure() plt.hist(data, bins = 9, normed = True) plt.xlim(0, 10) plt.xlabel("Number of accidents") plt.title("Simulating Poisson Random Variables") plt.show() # your turn # plot the pdf of a normal distribution with mean = 0 and # standard deviation = 1 mu = 0 # mean sigma = 1 # standard deviation x = np.arange(-5,5,0.1) y = stats.norm.pdf(x, 0, 1) plt.plot(x, y) plt.title('Normal: $\mu$=%.1f, $\sigma^2$=%.1f' % (mu, sigma)) plt.xlabel('x') plt.ylabel('Probability density') plt.show() a = 0.5 b = 0.5 x = np.arange(0.01, 1, 0.01) y = stats.beta.pdf(x, a, b) plt.plot(x, y) plt.title('Beta: a=%.1f, b=%.1f' % (a,b)) plt.xlabel('x') plt.ylabel('Probability density') plt.show() # your turn # Try changing alpha = 0.8, beta = 0.5 # Try changing alpha = 2, beta = 6 # Try changing alpha = 4, beta = 2 # Try changing alpha = 1, beta = 1 # What do you notice when alpha and beta are both less than 1 # compared to both bigger than 1? lam = 0.5 x = np.arange(0, 15, 0.1) y = lam * np.exp(-lam * x) # could also use stats.expon.pdf plt.plot(x,y) plt.title('Exponential: $\lambda$ =%.2f' % lam) plt.xlabel('x') plt.ylabel('Probability density') plt.show() data = stats.expon.rvs(scale = 2, size = 1000) print "Mean: %g" % np.mean(data) print "SD: %g" % np.std(data, ddof=1) plt.figure() plt.hist(data, bins = 20, normed = True) plt.xlim(0, 15) plt.title("Simulating Exponential Random Variables") plt.show() # Generate N = 1000 data sets of size 100 numbers drawn from an # exponential distribution (lambda = 0.5) data = stats.expon.rvs(scale=2, size = (1000, 100)) # Compute sample mean for each N datasets sample_means = np.mean(data, axis = 1) print 'The mean of the means is: ', np.mean(sample_means) print 'The standard deviation of the means is: ', np.std(sample_means, ddof=1) plt.figure() plt.boxplot(sample_means) plt.figure() plt.hist(sample_means, normed=True) plt.show() n = 20 for N in [100, 1000, 10000]: data = stats.expon.rvs(scale = 2, size=(N, n)) sample_means = np.mean(data, axis=1) mu = np.mean(sample_means) sig = np.std(sample_means, ddof=1) plt.figure() plt.hist(sample_means, bins=20, normed=True) x = np.linspace(0, 4, 100) y = stats.norm.pdf(x, loc=mu, scale=sig) plt.plot(x, y, color = 'r') plt.title('Distribution of sample means using N=%i datasets' % N) n = 100 for N in [100, 1000, 10000]: data = stats.expon.rvs(scale = 2, size=(N, n)) sample_means = np.mean(data, axis=1) mu = np.mean(sample_means) sig = np.std(sample_means, ddof=1) print 'Standard deviation of simulated distribution: %g' % sig print 'Standard deviation using CLT: %g' % np.sqrt( 4 / float(n)) print N = 1000 for n in [10, 100, 1000]: data = stats.expon.rvs(scale = 2, size=(N, n)) sample_means = np.mean(data, axis=1) mu = np.mean(sample_means) sig = np.std(sample_means, ddof=1) plt.figure() plt.hist(sample_means, bins=20, normed=True) x = np.linspace(0, 4, 100) y = stats.norm.pdf(x, loc=mu, scale=sig) plt.plot(x, y, color = 'r') plt.title('Distribution of sample means using N=%i datasets of size n=%i datasets' % (N, n)) N = 1000 for n in [10, 100, 1000]: data = stats.expon.rvs(scale = 2, size=(N, n)) sample_means = np.mean(data, axis=1) mu = np.mean(sample_means) sig = np.std(sample_means, ddof=1) print 'Standard deviation of simulated distribution: %g' % sig print 'Standard deviation using CLT: %g' % np.sqrt( 4 / float(n)) print