Notebook

Implementation of Kruschke's BEST Test¶

Paper at http://www.indiana.edu/~kruschke/articles/Kruschke2013JEPG.pdf
Supporting website at http://www.indiana.edu/~kruschke/BEST/

In [1]:

import pylab as py
import emcee

Style for plotting¶

In [2]:

style_text="""
axes.titlesize : 24
axes.labelsize : 20
lines.linewidth : 3
lines.markersize : 5
xtick.labelsize : 20
ytick.labelsize : 20
figure.figsize : 10,8
axes.grid: True
grid.linestyle: -
grid.color: 0.75
font.size: 20
font.family: sans-serif
legend.fontsize: 20
legend.frameon: False
legend.numpoints: 1
legend.scatterpoints: 1
lines.solid_capstyle: round

text.color: .15
xtick.color: .15
ytick.color: .15
xtick.direction: out
ytick.direction: out

axes.axisbelow: True

# these are tableau20 colors
axes.color_cycle: 1f77b4, aec7e8,ff7f0e,ffbb78,2ca02c,98df8a,d62728,ff9896,9467bd,c5b0d5,8c564b,c49c94,e377c2,f7b6d2,7f7f7f,c7c7c7,bcbd22,dbdb8d,17becf,9edae5
"""

with open('mystyle.mplstyle','w') as fid:
    fid.write(style_text)
    
py.style.use('mystyle.mplstyle')    

In [3]:

def histogram(y,bins=50,plot=True):
    N,bins=np.histogram(y,bins)
    
    dx=bins[1]-bins[0]
    if dx==0.0:  #  all in 1 bin!
        val=bins[0]
        bins=np.linspace(val-abs(val),val+abs(val),50)
        N,bins=np.histogram(y,bins)
    
    dx=bins[1]-bins[0]
    x=bins[0:-1]+(bins[1]-bins[0])/2.0
    
    y=N*1.0/np.sum(N)/dx
    
    if plot:
        py.plot(x,y,'o-')
        yl=py.gca().get_ylim()
        py.gca().set_ylim([0,yl[1]])
        xl=py.gca().get_xlim()
        if xl[0]<=0 and xl[0]>=0:    
            py.plot([0,0],[0,yl[1]],'k--')

    return x,y

Implementation of the BEST model and support code¶

In [4]:

import time

def time2str(tm):
    
    frac=tm-int(tm)
    tm=int(tm)
    
    s=''
    sc=tm % 60
    tm=tm//60
    
    mn=tm % 60
    tm=tm//60
    
    hr=tm % 24
    tm=tm//24
    dy=tm

    if (dy>0):
        s=s+"%d d, " % dy

    if (hr>0):
        s=s+"%d h, " % hr

    if (mn>0):
        s=s+"%d m, " % mn


    s=s+"%.2f s" % (sc+frac)

    return s

def timeit(reset=False):
    global _timeit_data
    try:
        _timeit_data
    except NameError:
        _timeit_data=time.time()

    if reset:
        _timeit_data=time.time()

    else:
        return time2str(time.time()-_timeit_data)

In [119]:

from scipy.special import gammaln,gamma

def tpdf(x,df,mu,sd):
    t=(x-mu)/float(sd)
    return gamma((df+1)/2.0)/sqrt(df*pi)/gamma(df/2.0)/sd*(1+t**2/df)**(-(df+1)/2.0)
    
def logtpdf(x,df,mu,sd):
    try:
        N=len(x)
    except TypeError:
        N=1
    
    t=(x-mu)/float(sd)
    return N*(gammaln((df+1)/2.0)-0.5*log(df*np.pi)-gammaln(df/2.0)-np.log(sd))+(-(df+1)/2.0)*sum(np.log(1+t**2/df))

def loguniformpdf(x,mn,mx):
    if mn < x < mx:
        return np.log(1.0/(mx-mn))
    return -np.inf

def logjeffreyspdf(x):
    if x>0.0:
        return -np.log(x)
    return -np.inf

def lognormalpdf(x,mn,sig):
    # 1/sqrt(2*pi*sigma^2)*exp(-x^2/2/sigma^2)
    try:
        N=len(x)
    except TypeError:
        N=1
        
    return -0.5*log(2*np.pi*sig**2)*N - sum((x-mn)**2/sig**2/2.0)

def logexponpdf(x,scale):
    if x<=0:
        return -np.inf
    return -np.log(scale)-x/scale

class BESTModel(object):
    
    def __init__(self,y1,y2):
        self.data=array(y1,float),array(y2,float)
        pooled=np.concatenate((y1,y2))
        self.S=np.std(pooled)
        self.M=np.mean(pooled)
        
        self.names=['mu1','mu2','sigma1','sigma2','nu']
        self.params=[]
        self.params_dict={}
        
    def initial_value(self):
        return array([self.M,self.M,self.S,self.S,10])
    
    def prior(self,theta):
        mu1,mu2,sd1,sd2,nu=theta
        value=0.0
        value+=lognormalpdf(mu1,self.M,1000*self.S)
        value+=lognormalpdf(mu2,self.M,1000*self.S)
        
        mn=0.001*self.S
        mx=1000*self.S
        value+=loguniformpdf(sd1,mn,mx-mn)
        value+=loguniformpdf(sd2,mn,mx-mn)

        value+=logexponpdf(nu-1,scale=29)
        return value
        
    def run_mcmc(self,iterations=1000,burn=0.1):
        # Set up the sampler.
        ndim, nwalkers = len(self), 100
        val=self.initial_value()
        pos=emcee.utils.sample_ball(val, .05*val+1e-4, size=nwalkers)
        self.sampler = emcee.EnsembleSampler(nwalkers, ndim, model)
        
        timeit(reset=True)
        print("Running MCMC...")
        self.sampler.run_mcmc(pos, iterations)
        print("Done.")
        print timeit()
        
        burnin = int(self.sampler.chain.shape[1]*burn)
        samples = self.sampler.chain[:, burnin:, :]
        self.mu1=samples[:,:,0]
        self.mu2=samples[:,:,1]
        self.sigma1=samples[:,:,2]
        self.sigma2=samples[:,:,3]
        self.nu=samples[:,:,4]
        
        self.params=[self.mu1,self.mu2,self.sigma1,self.sigma2,self.nu]
        self.params_dict['mu1']=self.mu1
        self.params_dict['mu2']=self.mu2
        self.params_dict['sigma1']=self.sigma1
        self.params_dict['sigma2']=self.sigma2
        self.params_dict['nu']=self.nu
        
        
        
    def __len__(self):
        return 5  # mu1,mu2,sd1,sd2,nu
        
    def likelihood(self,theta):
        mu1,mu2,sd1,sd2,nu=theta
        y1,y2=self.data
        
        value=0.0
        value+=logtpdf(y1,nu,mu1,sd1)
        value+=logtpdf(y2,nu,mu2,sd2)

        return value
    
    def plot_chains(self,S,*args,**kwargs):
        mu1,mu2,sigma1,sigma2,nu=self.params
        N=float(prod(mu1.shape))

        if '=' in S:
            name,expr=S.split('=')
            value=eval(expr)
        else:
            name=S
            if name=='mu1':
                name=r"\mu_1"
            elif name=='mu2':
                name=r"\mu_2"
            elif name=='sigma1':
                name=r"\sigma_1"
            elif name=='sigma2':
                name=r"\sigma_2"
            elif name=='nu':
                name=r"\nu"
            else:
                name=r"%s" % name
        
            value=eval(S)
        
        plot(value, color="k",alpha=0.02,**kwargs)
        if "\\" in name:        
            ylabel("$"+name+"$")        
        else:
            ylabel(name)        
    
    def plot_distribution(self,S,p=95):
        
        pp=[(100-p)/2.0,50,100-(100-p)/2.0]
        
        mu1,mu2,sigma1,sigma2,nu=self.params
        N=float(prod(mu1.shape))
        
        if '=' in S:
            name,expr=S.split('=')
            value=eval(expr)
        else:
            name=S
            if name=='mu1':
                name=r"\hat{\mu}_1"
            elif name=='mu2':
                name=r"\hat{\mu}_2"
            elif name=='sigma1':
                name=r"\hat{\sigma}_1"
            elif name=='sigma2':
                name=r"\hat{\sigma}_2"
            elif name=='nu':
                name=r"\hat{\nu}"
            else:
                name=r"\hat{%s}" % name
        
            value=eval(S)
            
        result=histogram(value.ravel(),bins=200)
        v=np.percentile(value.ravel(), pp ,axis=0)
        if r"\hat" in name:
            title(r'$%s=%.3f^{+%.3f}_{-%.3f}$' % (name,v[1],(v[2]-v[1]),(v[1]-v[0])),va='bottom')
        else:
            title(r'%s$=%.3f^{+%.3f}_{-%.3f}$' % (name,v[1],(v[2]-v[1]),(v[1]-v[0])),va='bottom')
        
        
    def P(self,S):
        
        mu1,mu2,sigma1,sigma2,nu=self.params
        N=float(prod(mu1.shape))
        result=eval('sum(%s)/N' % S)
        return result
            
    def posterior(self,theta):
        prior = self.prior(theta)
        if not np.isfinite(prior):
            return -np.inf
        return prior + self.likelihood(theta)
        
    def __call__(self,theta):
        return self.posterior(theta)

Example from paper¶

In [120]:

drug = (101,100,102,104,102,97,105,105,98,101,100,123,105,103,100,95,102,106,
        109,102,82,102,100,102,102,101,102,102,103,103,97,97,103,101,97,104,
        96,103,124,101,101,100,101,101,104,100,101)
placebo = (99,101,100,101,102,100,97,101,104,101,102,102,100,105,88,101,100,
           104,100,100,100,101,102,103,97,101,101,100,101,99,101,100,100,
           101,100,99,101,100,102,99,100,99)

model=BESTModel(drug,placebo)

In [121]:

model.run_mcmc()

Running MCMC...
Done.
5.82 s

Plot chains¶

In [122]:

for name,sample in zip(model.names,model.params):
    figure(figsize=(12,5))
    model.plot_chains(name)

Plot distributions¶

In [124]:

for name,sample in zip(model.names,model.params):
    figure(figsize=(12,5))  
    model.plot_distribution(name)

In [127]:

figure(figsize=(12,5))  
model.plot_distribution('Difference of Means=mu1-mu2')

figure(figsize=(12,5))  
model.plot_distribution('Difference of Std Devs=sigma1-sigma2')

figure(figsize=(12,5))  
model.plot_distribution('Effect Size=(mu1-mu2)/sqrt((sigma1**2+sigma2**2)/2)')

Some Tests¶

In [125]:

model.P('(mu1>101) & (mu1<102.1)')

Out[125]:

0.85502222222222224

In [126]:

model.P('sigma1<sigma2')

Out[126]:

0.0060111111111111112

In [ ]: