social()

import pymc as pm

param = pm.Exponential('poisson_param', 1)
data_generator = pm.Poisson('data_generator', param)
data_plus_one = data_generator + 1

print param.children, '\n'
print data_generator.parents, '\n'
print data_generator.children, '\n'

print 'parameter.value = ', param.value
print 'data_generator.value = ', data_generator.value
print 'data_plus_one = ', data_plus_one.value

lambda_1 = pm.Exponential('lambda1', 1) # prior on first behaviour
lambda_2 = pm.Exponential('lambda2', 1) # prior on second behaviour
tau = pm.DiscreteUniform('tau', 0, 10) # prior on behaviour change)

print "lambda_1.value = %.3f" % lambda_1.value
print "lambda_2.value = %.3f" % lambda_2.value
print "tau.value = %.3f" % tau.value

# After calling random() the new value is stored in the variables
# 'value' attribute
lambda_1.random(), lambda_2.random(), tau.random()

print "lambda_1.value = %.3f" % lambda_1.value
print "lambda_2.value = %.3f" % lambda_2.value
print "tau.value = %.3f" % tau.value

type(lambda_1 + lambda_2)

import numpy as np

n_data = 5

@pm.deterministic
def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
    out = np.zeros(n_data)
    out[:tau] = lambda_1 # lambda before tau
    out[tau:] = lambda_2 # lambda after tau
    return out

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

sns.set(context = 'poster', style = 'whitegrid')

samples = [lambda_1.random() for i in range(20000)]
plt.hist(samples, bins=70, normed=True, histtype='stepfilled', alpha=0.8)
plt.title('Prior distribution for $\lambda _1$')
plt.xlim(0, 8)

data = np.array([10, 5])
fixed_variable = pm.Poisson('fxd', 1, value=data, observed=True)
print 'value: ', fixed_variable.value
print 'calling .random()'
fixed_variable.random()
print 'value: ', fixed_variable.value

# some fake data
data = np.array([10, 25, 15, 20, 35])
obs = pm.Poisson('obs', lambda_, value=data, observed=True)
obs.value

model = pm.Model([obs, lambda_, lambda_1, lambda_2, tau])

tau = pm.rdiscrete_uniform(0, 80)
tau

alpha = 1. / 20.
lambda_1, lambda_2 = pm.rexponential(alpha, 2)
print lambda_1, lambda_2

datau = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)]

plt.bar(np.arange(80), data, color='#3498db')
plt.bar(tau - 1, data[tau - 1], color='r', label='user behaviour changed')
plt.xlabel('Time (days)')
plt.ylabel('count of texts received')
plt.title('Artificial dataset')
plt.xlim(0, 80)
plt.legend();

def plot_artificial_sms_dataset():
    tau = pm.rdiscrete_uniform(0, 80)
    alpha = 1. / 20.
    lambda_1, lambda_2 = pm.rexponential(alpha, 2)
    data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)]
    plt.bar(np.arange(80), data, color="#348ABD")
    plt.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed")
    plt.xlim(0, 80)


plt.title("More example of artificial datasets")
for i in range(4):
    plt.subplot(4, 1, i)
    plot_artificial_sms_dataset()

import pymc as pm

# The parameters are the bounds of the Uniform.
p = pm.Uniform('p', lower=0, upper=1)

# set constants
p_true = 0.05 # remember this is normally unknown
N = 2500

# sample N Bernoulli random variables from Ber(0.05).
# each random variable has a 0.05 chance of being a 1.
# this is the data-generation step
occurrences = pm.rbernoulli(p_true, N)

print occurrences, occurrences.sum()

# Occurrences.mean is equal to n/N
print 'What is the observed frequency in Group A? {}'.format(
        occurrences.mean())
print 'Does this equal the true frequency? {}'.format(
        occurrences.mean() == p_true)

# include the observations, which are Bernoulli
obs = pm.Bernoulli('obs', p, value=occurrences, observed=True)

# To be explained in later lessons
mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)

plt.title('Posterior distribution of $p _A$, the true effectiveness of site A')
plt.vlines(p_true, 0, 90, linestyle='--', label='true $p_A$ (unknown)')
plt.hist(mcmc.trace('p')[:], bins=25, histtype='stepfilled', normed=True)
plt.legend()

# These two quantities are unknown to use
# We just use them to simulate test data

true_p_A = 0.05
true_p_B = 0.04

# Unequal sample sizes are no problem with Bayesian analysis
N_A = 1700
N_B = 950

# Generate observations
obs_A = pm.rbernoulli(true_p_A, N_A)
obs_B = pm.rbernoulli(true_p_B, N_B)
print 'Obs from Site A: ', obs_A[:30].astype(int), '...'
print 'Obs from Site B: ', obs_B[:30].astype(int), '...'

print obs_A.mean()
print obs_B.mean()

# Set up pymc model assuming uniform priors for p_A and p_B.
p_A = pm.Uniform('p_A', 0, 1)
p_B = pm.Uniform('p_B', 0, 1)

# Define the deterministic delta function.
# This is our unknown of interest.
@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    return p_A - p_B

# Set of observations
obs_A = pm.Bernoulli('obs_A', p_A, value=obs_A, observed=True)
obs_b = pm.Bernoulli('obs_B', p_B, value=obs_B, observed=True)

# Explained later
mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B])
mcmc.sample(20000, 1000)

p_A_samples = mcmc.trace('p_A')[:]
p_B_samples = mcmc.trace('p_B')[:]
delta_samples = mcmc.trace('delta')[:]

ax = plt.subplot(311)

plt.xlim(0, 0.1)
plt.hist(p_A_samples, histtype='stepfilled', bins=25, alpha=0.80,
         label='posterior of $p_A$', color='red', normed=True)
plt.vlines(true_p_A, 0, 80, linestyle='--', label='true $p_A$ (unknown)')
plt.legend(loc='upper right')
plt.title('Posterior distributions of $p_A$, $p_B$, and delta unknowns')

ax = plt.subplot(312)

plt.xlim(0, 0.1)
plt.hist(p_B_samples, histtype='stepfilled', bins=25, alpha=0.80,
         label='posterior of $p_B$', color='#467821', normed=True)
plt.vlines(true_p_B, 0, 80, linestyle='--', label='true $p_B$ (unknown)')
plt.legend(loc='upper right')

ax = plt.subplot(313)

plt.hist(delta_samples, histtype='stepfilled', bins=30, alpha=0.80,
         label='posterior of delta', color = '#7A68A6', normed=True)
plt.vlines(true_p_A - true_p_B, 0, 60, linestyle='--', 
         label='true delta (unknown)')
plt.vlines(0, 0, 60, color='black', alpha=0.2)
plt.legend(loc='upper right');

# Count the number of samples less than 0, i.e. the area under the curve
# before 0, represent the probability that site A is worse than site B.

print 'Probability site A is WORSE than site B: {}'.format(
        (delta_samples < 0).mean())
print 'Probability site A is BETTER than site B: {}'.format(
        (delta_samples > 0).mean())

from IPython.core.display import HTML


def css_styling():
    styles = open("/users/ryankelly/desktop/custom_notebook2.css", "r").read()

    return HTML(styles)
css_styling()

def social():
    code = """
    <a style='float:left; margin-right:5px;' href="https://twitter.com/share" class="twitter-share-button" data-text="Check this out" data-via="Ryanmdk">Tweet</a>
<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, 'script', 'twitter-wjs');</script>
    <a style='float:left; margin-right:5px;' href="https://twitter.com/Ryanmdk" class="twitter-follow-button" data-show-count="false">Follow @Ryanmdk</a>
<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, 'script', 'twitter-wjs');</script>
    <a style='float:left; margin-right:5px;'target='_parent' href="http://www.reddit.com/submit" onclick="window.location = 'http://www.reddit.com/submit?url=' + encodeURIComponent(window.location); return false"> <img src="http://www.reddit.com/static/spreddit7.gif" alt="submit to reddit" border="0" /> </a>
<script src="//platform.linkedin.com/in.js" type="text/javascript">
  lang: en_US
</script>
<script type="IN/Share"></script>
"""
    return HTML(code)