import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
import scipy.stats
# Collect a ton of samples (takes ~20 minutes to run!).
num_samples=100000000
results = np.zeros(num_samples)
for i in xrange(num_samples):
results[i] = np.random.random(12).sum() - 6
n, bins, _ = plt.hist(results, normed=True, histtype='step', bins=250)
plt.title("Histogram of sample data")
<matplotlib.text.Text at 0xe00d3cc>
# Smooth out the histogram data with interpolation.
x_old = bins[:-1] + (bins[1:] - bins[:-1]) / 2.0 # midpoints of the bins
y_old = n # number of points in the bins
x_new = np.linspace(-8, 8, num=100000)
y_new = scipy.interpolate.spline(x_old, y_old, x_new)
plt.title("Interpolated sample data")
plt.plot(x_new, y_new)
[<matplotlib.lines.Line2D at 0xe00782c>]
# Get the PDF of the normal distribution with mean 0 and variance 1.
pdf = scipy.stats.norm().pdf
y_norm = pdf(x_new)
# Plot the normal PDF minus the sample PDF. Values of y > 0 show where
# the normal distribution is higher than the sample distribution.
plt.plot(x_new, y_norm - y_new, label='N(0,1) - samples')
plt.legend()
<matplotlib.legend.Legend at 0xe2532cc>