social() import pymc as pm param = pm.Exponential('poisson_param', 1) data_generator = pm.Poisson('data_generator', param) data_plus_one = data_generator + 1 print param.children, '\n' print data_generator.parents, '\n' print data_generator.children, '\n' print 'parameter.value = ', param.value print 'data_generator.value = ', data_generator.value print 'data_plus_one = ', data_plus_one.value lambda_1 = pm.Exponential('lambda1', 1) # prior on first behaviour lambda_2 = pm.Exponential('lambda2', 1) # prior on second behaviour tau = pm.DiscreteUniform('tau', 0, 10) # prior on behaviour change) print "lambda_1.value = %.3f" % lambda_1.value print "lambda_2.value = %.3f" % lambda_2.value print "tau.value = %.3f" % tau.value # After calling random() the new value is stored in the variables # 'value' attribute lambda_1.random(), lambda_2.random(), tau.random() print "lambda_1.value = %.3f" % lambda_1.value print "lambda_2.value = %.3f" % lambda_2.value print "tau.value = %.3f" % tau.value type(lambda_1 + lambda_2) import numpy as np n_data = 5 @pm.deterministic def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2): out = np.zeros(n_data) out[:tau] = lambda_1 # lambda before tau out[tau:] = lambda_2 # lambda after tau return out %matplotlib inline from matplotlib import pyplot as plt import seaborn as sns sns.set(context = 'poster', style = 'whitegrid') samples = [lambda_1.random() for i in range(20000)] plt.hist(samples, bins=70, normed=True, histtype='stepfilled', alpha=0.8) plt.title('Prior distribution for $\lambda _1$') plt.xlim(0, 8) data = np.array([10, 5]) fixed_variable = pm.Poisson('fxd', 1, value=data, observed=True) print 'value: ', fixed_variable.value print 'calling .random()' fixed_variable.random() print 'value: ', fixed_variable.value # some fake data data = np.array([10, 25, 15, 20, 35]) obs = pm.Poisson('obs', lambda_, value=data, observed=True) obs.value model = pm.Model([obs, lambda_, lambda_1, lambda_2, tau]) tau = pm.rdiscrete_uniform(0, 80) tau alpha = 1. / 20. lambda_1, lambda_2 = pm.rexponential(alpha, 2) print lambda_1, lambda_2 datau = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)] plt.bar(np.arange(80), data, color='#3498db') plt.bar(tau - 1, data[tau - 1], color='r', label='user behaviour changed') plt.xlabel('Time (days)') plt.ylabel('count of texts received') plt.title('Artificial dataset') plt.xlim(0, 80) plt.legend(); def plot_artificial_sms_dataset(): tau = pm.rdiscrete_uniform(0, 80) alpha = 1. / 20. lambda_1, lambda_2 = pm.rexponential(alpha, 2) data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)] plt.bar(np.arange(80), data, color="#348ABD") plt.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed") plt.xlim(0, 80) plt.title("More example of artificial datasets") for i in range(4): plt.subplot(4, 1, i) plot_artificial_sms_dataset() import pymc as pm # The parameters are the bounds of the Uniform. p = pm.Uniform('p', lower=0, upper=1) # set constants p_true = 0.05 # remember this is normally unknown N = 2500 # sample N Bernoulli random variables from Ber(0.05). # each random variable has a 0.05 chance of being a 1. # this is the data-generation step occurrences = pm.rbernoulli(p_true, N) print occurrences, occurrences.sum() # Occurrences.mean is equal to n/N print 'What is the observed frequency in Group A? {}'.format( occurrences.mean()) print 'Does this equal the true frequency? {}'.format( occurrences.mean() == p_true) # include the observations, which are Bernoulli obs = pm.Bernoulli('obs', p, value=occurrences, observed=True) # To be explained in later lessons mcmc = pm.MCMC([p, obs]) mcmc.sample(18000, 1000) plt.title('Posterior distribution of $p _A$, the true effectiveness of site A') plt.vlines(p_true, 0, 90, linestyle='--', label='true $p_A$ (unknown)') plt.hist(mcmc.trace('p')[:], bins=25, histtype='stepfilled', normed=True) plt.legend() # These two quantities are unknown to use # We just use them to simulate test data true_p_A = 0.05 true_p_B = 0.04 # Unequal sample sizes are no problem with Bayesian analysis N_A = 1700 N_B = 950 # Generate observations obs_A = pm.rbernoulli(true_p_A, N_A) obs_B = pm.rbernoulli(true_p_B, N_B) print 'Obs from Site A: ', obs_A[:30].astype(int), '...' print 'Obs from Site B: ', obs_B[:30].astype(int), '...' print obs_A.mean() print obs_B.mean() # Set up pymc model assuming uniform priors for p_A and p_B. p_A = pm.Uniform('p_A', 0, 1) p_B = pm.Uniform('p_B', 0, 1) # Define the deterministic delta function. # This is our unknown of interest. @pm.deterministic def delta(p_A=p_A, p_B=p_B): return p_A - p_B # Set of observations obs_A = pm.Bernoulli('obs_A', p_A, value=obs_A, observed=True) obs_b = pm.Bernoulli('obs_B', p_B, value=obs_B, observed=True) # Explained later mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B]) mcmc.sample(20000, 1000) p_A_samples = mcmc.trace('p_A')[:] p_B_samples = mcmc.trace('p_B')[:] delta_samples = mcmc.trace('delta')[:] ax = plt.subplot(311) plt.xlim(0, 0.1) plt.hist(p_A_samples, histtype='stepfilled', bins=25, alpha=0.80, label='posterior of $p_A$', color='red', normed=True) plt.vlines(true_p_A, 0, 80, linestyle='--', label='true $p_A$ (unknown)') plt.legend(loc='upper right') plt.title('Posterior distributions of $p_A$, $p_B$, and delta unknowns') ax = plt.subplot(312) plt.xlim(0, 0.1) plt.hist(p_B_samples, histtype='stepfilled', bins=25, alpha=0.80, label='posterior of $p_B$', color='#467821', normed=True) plt.vlines(true_p_B, 0, 80, linestyle='--', label='true $p_B$ (unknown)') plt.legend(loc='upper right') ax = plt.subplot(313) plt.hist(delta_samples, histtype='stepfilled', bins=30, alpha=0.80, label='posterior of delta', color = '#7A68A6', normed=True) plt.vlines(true_p_A - true_p_B, 0, 60, linestyle='--', label='true delta (unknown)') plt.vlines(0, 0, 60, color='black', alpha=0.2) plt.legend(loc='upper right'); # Count the number of samples less than 0, i.e. the area under the curve # before 0, represent the probability that site A is worse than site B. print 'Probability site A is WORSE than site B: {}'.format( (delta_samples < 0).mean()) print 'Probability site A is BETTER than site B: {}'.format( (delta_samples > 0).mean()) from IPython.core.display import HTML def css_styling(): styles = open("/users/ryankelly/desktop/custom_notebook2.css", "r").read() return HTML(styles) css_styling() def social(): code = """ Tweet Follow @Ryanmdk submit to reddit """ return HTML(code)