How is he going to do it all?
PyMC is a Python module that implements Bayesian statistical models and fitting algorithms.
PyMC(2) was developed by Chris Fonnesbeck, David Huard, and Anand Patil.
Dose (log g/ml)   Number of rats   Number of deaths
----------------------------------------------------
-0.896            5                0
-0.296            5                1
-0.053            5                3
 0.727            5                5
Pixie dust (log g/ml)   Number of horses   Number of unicorns
--------------------------------------------------------------
-0.896                  5                  0
-0.296                  5                  1
-0.053                  5                  3
 0.727                  5                  5
$\mathrm{logit}(\theta_i) = \alpha + \beta x_i$
(which is the same as $\theta_i = \frac{1}{1 + e^{-(\alpha + \beta x_i)}}$)
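As a quick numerical illustration of the inverse-logit transform, here is a minimal sketch in plain numpy; the dose levels are taken from the table above, while alpha_ex and beta_ex are made-up values for this sketch only.

import numpy as np

def invlogit_demo(x):
    # inverse-logit (logistic) function: maps the real line to (0, 1)
    return 1.0 / (1.0 + np.exp(-x))

dose = np.array([-0.896, -0.296, -0.053, 0.727])
alpha_ex, beta_ex = 0.8, 7.6  # illustrative values only
print invlogit_demo(alpha_ex + beta_ex * dose)  # probabilities increase with dose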
%pylab inline
Populating the interactive namespace from numpy and matplotlib
from IPython.display import Image
from pymc import *
from numpy import ones, array
# alpha ~ Normal(mu=0, std=5)
alpha = Normal('alpha', mu=0, tau=1./5**2, value=0.)

# an alternative is to define a function that returns the log-probability:
@stochastic
def alpha(value=0., mu=0., tau=1./5**2):
    return normal_like(value, mu=mu, tau=tau)
# beta ~ U(-inf, inf)
@stochastic
def beta(value=10.):
    # constant log-probability, i.e. an improper flat prior over the real line
    return 0.
# logit(theta_i) = a + b*x_i
# theta_i = invlogit(a + b*x_i)
dust = array([-.86, -.3, -.05, .73])

@deterministic
def theta(a=alpha, b=beta, d=dust):
    """theta = invlogit(a + b*d)"""
    return invlogit(a + b*d)

n = 5 * ones(4, dtype=int)
data = array([0, 1, 3, 5], dtype=float)
# unicorns ~ Bin(N_i, theta_i)
unicorns = Binomial('unicorns', n=n, p=theta, value=data, observed=True)
from pymc import *
from numpy import ones, array

dust = array([-.86, -.3, -.05, .73])
n = 5 * ones(4, dtype=int)
data = array([0, 1, 3, 5], dtype=float)

# alpha ~ Normal(mu=0, std=5)
alpha = Normal('alpha', mu=0, tau=1./5**2, value=0.)

# beta ~ U(-inf, inf): an improper flat prior
@stochastic
def beta(value=10.):
    return 0.

# theta_i = invlogit(a + b*x_i)
@deterministic
def theta(a=alpha, b=beta, d=dust):
    """theta = invlogit(a + b*d)"""
    return invlogit(a + b*d)

# unicorns ~ Binomial(n_i, theta_i)
unicorns = Binomial('unicorns', n=n, p=theta, value=data, observed=True)
M = MCMC([alpha, beta, theta, unicorns])
# create a graph of the model and save it to a file (requires pydot/graphviz)
graph.graph(M, format='png', path='', name='unicorns_model')
import scipy.misc
plt.figure(figsize=(8, 8))
plt.imshow(scipy.misc.imread('unicorns_model.png'))
<matplotlib.image.AxesImage at 0x10739d3d0>
print "alpha's parents: ", alpha.parents
print "alpha's children: ", alpha.children
alpha's parents:  {'mu': 0, 'tau': 0.04}
alpha's children:  set([<pymc.PyMCObjects.Deterministic 'theta' at 0x10709ebd0>])
print "alpha's value:", alpha.value
print "beta's value:", beta.value
print "theta's value:", theta.value
print "unicorns's value:", unicorns.value
alpha's value: 0.0
beta's value: 10.0
theta's value: [ 1.84071905e-04  4.74258732e-02  3.77540669e-01  9.99324917e-01]
unicorns's value: [ 0.  1.  3.  5.]
beta.value = 11
print "beta's value:", beta.value
print "theta's value:", theta.value
beta's value: 11.0
theta's value: [ 7.79005221e-05  3.55711893e-02  3.65864409e-01  9.99674558e-01]
print "alpha's logp: ", alpha.logp
print "model's logp: ", M.logp
alpha's logp:  -2.52837644564
model's logp:  -6.02692913581
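Before running MCMC we can compute a point estimate: PyMC2's MAP class maximizes the model's joint log-probability with a numerical optimizer (scipy's fmin by default), leaving each stochastic set to its maximum a posteriori value.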
map = MAP([alpha, beta, theta, unicorns])
map.fit()
print "alpha's value:", alpha.value
print "beta's value:", beta.value
print "theta's value:", theta.value
print "unicorns's value:", unicorns.value
alpha's value: 0.813133194826
beta's value: 7.63530029532
theta's value: [ 0.0031625   0.18581185  0.60620033  0.99831937]
unicorns's value: [ 0.  1.  3.  5.]
A common quantity of interest in bioassay studies is the LD50:
the dose at which the probability of death (here, of turning into a unicorn) is 50%.
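Since $\mathrm{logit}(0.5) = 0$, the LD50 is the dose $x$ that solves $\alpha + \beta x = 0$, i.e. $\mathrm{LD50} = -\frac{\alpha}{\beta}$.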
LD50_using_MAP = -alpha.value / beta.value
print "LD50: ", LD50_using_MAP
LD50: -0.106496557224
How do we answer questions about uncertainty (e.g., a credible interval, or whether the LD50 is probably larger than 0)?
We need to estimate the posterior in order to evaluate the model properly:
$P(\alpha, \beta \mid X) = \,?$
Since an exact analytical solution is hard to obtain, we will approximate it.
We will generate samples from the posterior using a magical routine called Markov chain Monte Carlo (MCMC).
Image(filename='what_we_would_get.png')
At each step of the algorithm we generate a new sample given the current state of our model:
$X_t = S(X_{t-1})$
We then have a long chain of samples.
According to MCMC theory, after a burn-in of $T$ steps our samples are (approximately) drawn from the model's posterior, under some assumptions.
from IPython.display import YouTubeVideo
At each MCMC step, for each stochastic node in the model:
- propose a new value for the node
- accept or reject it; the acceptance probability depends on how probable the proposed value is relative to the current one
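To make this concrete, here is a minimal random-walk Metropolis sketch in plain numpy. This is an illustration, not PyMC's actual step method; the function name, the Gaussian proposal, and the parameter values are assumptions of this sketch.

import numpy as np

def metropolis(logp, x0, n_samples=10000, proposal_sd=1.0):
    """Random-walk Metropolis sketch: logp is the unnormalized log-posterior."""
    x = x0
    samples = np.empty(n_samples)
    for t in range(n_samples):
        x_new = x + proposal_sd * np.random.randn()  # propose around the current value
        # accept with probability min(1, p(x_new)/p(x)), computed in log space
        if np.log(np.random.rand()) < logp(x_new) - logp(x):
            x = x_new
        samples[t] = x  # on rejection the current value is repeated
    return samples

# example: sample from a standard normal, whose log-density is known up to a constant
trace = metropolis(lambda x: -0.5 * x**2, x0=0.)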
M = MCMC([alpha, beta, theta, unicorns])
M.sample(10000, burn=5000)
[****************100%******************] 10000 of 10000 complete
fig = plt.figure(figsize=(16,8))
Matplot.plot(alpha, new=False)
Plotting alpha
M.summary()
alpha:

    Mean             SD               MC Error        95% HPD interval
    ------------------------------------------------------------------
    1.211            1.049            0.049            [-0.736  3.376]

    Posterior quantiles:

    2.5             25              50              75             97.5
     |---------------|===============|===============|---------------|
    -0.694           0.497          1.153          1.83          3.474

beta:

    Mean             SD               MC Error        95% HPD interval
    ------------------------------------------------------------------
    11.172           5.184            0.25            [  3.074  21.702]

    Posterior quantiles:

    2.5             25              50              75             97.5
     |---------------|===============|===============|---------------|
    3.532            7.348          10.265         14.136        23.026

theta:

    Mean             SD               MC Error        95% HPD interval
    ------------------------------------------------------------------
    0.007            0.022            0.001           [ 0.     0.039]
    0.151            0.122            0.003           [ 0.     0.397]
    0.634            0.179            0.007           [ 0.288  0.955]
    0.993            0.026            0.001           [ 0.961  1.   ]

    Posterior quantiles:

    2.5             25              50              75             97.5
     |---------------|===============|===============|---------------|
    0.0              0.0            0.0            0.004         0.07
    0.007            0.056          0.122          0.219         0.464
    0.252            0.515          0.654          0.769         0.93
    0.93             0.998          1.0            1.0           1.0
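(The 95% HPD interval is the narrowest interval that contains 95% of the posterior mass; the MC error estimates how much Monte Carlo noise, due to the finite and autocorrelated chain, remains in the estimate of the mean.)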
M.trace('alpha')[:]
array([-0.10523308, -0.10523308, 1.04476538, ..., 1.09404241, 2.48917766, 2.48917766])
alpha.trace()
array([-0.10523308, -0.10523308, 1.04476538, ..., 1.09404241, 2.48917766, 2.48917766])
fig = plt.figure(figsize=(8,8))
scatter(alpha.trace(), beta.trace());
xlabel('alpha');
ylabel('beta');
plt.figure(figsize=(8,8))
hist(-alpha.trace()/beta.trace(), 20);
xlabel('LD50');
Another way to do it is to make the LD50 an explicit node in the model:
@deterministic
def ld50(a=alpha, b=beta):
    return -a/b
M2 = MCMC([alpha, beta, theta, unicorns, ld50])
M2.sample(10000, 5000)
[****************100%******************] 10000 of 10000 complete
fig = plt.figure(figsize=(16,8))
Matplot.plot(ld50, new=False)
Plotting ld50
print "LD50 using MAP: ", LD50_using_MAP
ld50.summary()
LD50 using MAP:  -0.106496557224

ld50:

    Mean             SD               MC Error        95% HPD interval
    ------------------------------------------------------------------
    -0.105           0.09             0.003           [-0.277  0.086]

    Posterior quantiles:

    2.5             25              50              75             97.5
     |---------------|===============|===============|---------------|
    -0.273           -0.161         -0.108         -0.056        0.097
print "P(LD50 > 0): ", np.sum(ld50.trace() > 0) / float(len(ld50.trace()))
P(LD50 > 0): 0.097
Let's go back to the alpha node:
fig = plt.figure(figsize=(16,8))
Matplot.plot(alpha, new=False)
Plotting alpha
Image(filename='ham-mcmc.png')
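The custom step method below implements slice sampling: to draw from $p(x)$, sample a vertical level $z$ uniformly under the density at the current value (equivalently, subtract an exponential draw from the current log-probability), find the horizontal "slice" $\{x : \log p(x) \geq z\}$ by stepping out from the current value, and draw the new value uniformly from that interval, shrinking it whenever a draw falls outside the slice.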
import pymc as pm
import numpy as np
from numpy.random import randn, rand
class SliceStep(pm.Gibbs):
    """
    Simple slice sampler.
    """
    def __init__(self, stochastic, width=2, maxiter=1000,
                 verbose=None, *args, **kwargs):
        """
        Input:
            stochastic - stochastic node
            width <float> - the initial width of the interval
            maxiter <int> - maximum number of iterations allowed for stepping out and shrinking
        """
        pm.Gibbs.__init__(self, stochastic, verbose=verbose, *args, **kwargs)
        self.width = width
        self.neval = 0
        self.maxiter = maxiter

    def step(self):
        stoch = self.stochastic
        value = stoch.value

        # sample a vertical level under the current log-probability
        z = self.logp_plus_loglike - np.random.exponential()

        if self.verbose > 2:
            print self._id + ' current value: %.3f' % value
            print self._id + ' sampled vertical level ' + repr(z)

        # position an interval at a random starting position around the current value
        r = self.width * np.random.rand()
        xr = value + r
        xl = xr - self.width

        if self.verbose > 2:
            print 'initial interval [%.3f, %.3f]' % (xl, xr)

        # step out to the left
        iter = 0
        stoch.value = xl
        while (self.get_logp() >= z) and (iter < self.maxiter):
            xl -= self.width
            stoch.value = xl
            iter += 1
        assert iter < self.maxiter, "Step-out procedure failed"
        self.neval += iter
        if self.verbose > 2:
            print 'after %d iterations interval is [%.3f, %.3f]' % (iter, xl, xr)

        # step out to the right
        iter = 0
        stoch.value = xr
        while (self.get_logp() >= z) and (iter < self.maxiter):
            xr += self.width
            stoch.value = xr
            iter += 1
        assert iter < self.maxiter, "Step-out procedure failed"
        self.neval += iter
        if self.verbose > 2:
            print 'after %d iterations interval is [%.3f, %.3f]' % (iter, xl, xr)

        # draw a new point from the interval [xl, xr]
        xp = rand() * (xr - xl) + xl
        stoch.value = xp

        # if the point is outside the slice, then shrink the interval and draw again
        iter = 0
        while (self.get_logp() < z) and (iter < self.maxiter):
            if xp > value:
                xr = xp
            else:
                xl = xp
            xp = rand() * (xr - xl) + xl  # draw again
            stoch.value = xp
            iter += 1
        assert iter < self.maxiter, "Shrink-in procedure failed."
        self.neval += iter
        if self.verbose > 2:
            print 'after %d iterations found new value: %.3f' % (iter, xp)

    def get_logp(self):
        try:
            return self.logp_plus_loglike
        except pm.ZeroProbability:
            return -np.inf
# Set up a new MCMC model
M_slice = MCMC([alpha, beta, theta, ld50])

# use the slice step method (http://en.wikipedia.org/wiki/Slice_sampling) for every stochastic
for node in M_slice.stochastics:
    M_slice.use_step_method(SliceStep, node)

# run MCMC
M_slice.sample(2000)
print
fig = plt.figure(figsize=(16,8))
Matplot.plot(alpha, new=False)
[****************100%******************] 2000 of 2000 complete
Plotting alpha
Image(filename='comparison.png', width=800)
https://github.com/isofer/Bayesian-data-analysis-with-PyMC2.git
http://pymc-devs.github.io/pymc/
http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/