%matplotlib inline
import matplotlib.pylab as plt
from pymc import *
from numpy import array, empty
from numpy.random import randint
# 矿难的发生数
disasters_array = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])
# 1851年到1962年间英国煤矿矿难发生数字
plt.figure(figsize=(12.5, 3.5))
plt.bar(np.arange(1851, 1962), disasters_array, color="#348ABD",linewidth=0)
plt.xlabel("Year")
plt.ylabel("Disasters")
plt.title("UK coal mining disasters, 1851-1962")
plt.xlim(1851, 1962)
plt.plot(np.arange(1851, 1962), expected, lw=4, color="#E24A33",
label="expected number of text-messages received")
# 定义先验概率分布
# 转折点服从一个离散均匀分布
switchpoint = DiscreteUniform(
'switchpoint',
lower=0,
upper=110)
# 前期矿难的发生服从poisson分布,此分布的参数服从一个beta为1的指数分布。
early_mean = Exponential('early_mean', beta=1.)
# 后期矿难的发生服从poisson分布,此分布的参数服从一个beta为1的指数分布。
late_mean = Exponential('late_mean', beta=1.)
# 用装饰器定义rate和其它变量的确定性关系
@deterministic(plot=False)
def rate(s=switchpoint, e=early_mean, l=late_mean):
''' Concatenate Poisson means '''
out = empty(len(disasters_array))
out[:s] = e
out[s:] = l
return out
# 定义观测数据的分布函数,ovserved为真表示此变量的数值不发生改变
disasters = Poisson('disasters', mu=rate, value=disasters_array, observed=True)
# 变量间存在依存关系,例如rate依赖于三个变量,而rate又是disasters的参数
rate.parents
rate.children
{<pymc.distributions.Poisson 'disasters' at 0x10c513f90>}
# 变量内存放着数值,随机变量会给一个随机初值
switchpoint.value
array(6)
from pymc import graph, MCMC
graph.dag(MCMC([switchpoint, rate, early_mean, late_mean, disasters]))
!dot MCMC.dot -Tpng -o dag.png
from IPython.core.display import Image
Image('dag.png')
# 建立模型,将五个变量放进来
mymodel = pymc.Model([switchpoint,
early_mean,
late_mean,
rate,
disasters])
M = MCMC(mymodel)
# sample方法来运行模拟过程, burn参数表示放弃最前面的1000个数值,thin参数表示抽10分之1的数值以避免自相关
M.sample(iter=10000, burn=1000, thin=10)
[-----------------100%-----------------] 10000 of 10000 complete in 1.7 sec
# 输出的是一个trace
M.trace('switchpoint')[100:110]
array([41, 44, 40, 48, 39, 46, 38, 42, 41, 43])
# 观察参数的后验分布
plt.hist(M.trace('early_mean')[:],bins=50);
# 输出所有参数的分布图
pymc.Matplot.plot(M);
Plotting late_mean Plotting early_mean Plotting switchpoint
early = M.trace('early_mean')[:].mean()
late = M.trace('late_mean')[:].mean()
expected = [early]*40 + [late]*(1962-1851-40)
plt.figure(figsize=(12.5, 3.5))
plt.bar(np.arange(1851, 1962), disasters_array, color="#348ABD",linewidth=0,alpha=0.65,
label="observed disaster per year")
plt.xlabel("Year")
plt.ylabel("Disasters")
plt.title("UK coal mining disasters, 1851-1962")
plt.xlim(1851, 1962)
plt.plot(np.arange(1851, 1962), expected, lw=4, color="#E24A33",
label="expected number of disaster per year")
plt.legend();