import pandas as pd
import numpy as np
import pymc as pm
import numpy.random as rand
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
This notebook goes along with this Stackoverflow question. I'm using fake data here but the oddities in the traceplots are similar in real data as well.
I'm creating a hierarchical model for call times in a call center. In my case, the biggest factor of a call's time is the outcome of a call. Here I'm only considering Failure
and Success
outcomes. The next largest effect is the agent who the call was routed to, which I'm modeling as a random effect. So I have:
where $r$ denotes the result type and $a$ a particular agent.
def keys_n_vals(m):
    """Split a mapping into parallel lists of keys and values.

    Returns a tuple ``(keys, values)`` where ``values[i]`` is ``m[keys[i]]``.
    Keys are materialized with ``list()`` because under Python 3
    ``dict.keys()`` returns a view, which ``numpy.random.choice`` (used by
    callers of this helper) rejects as a non-1-dimensional input.
    """
    keys = list(m.keys())
    return (keys, [m[k] for k in keys])
def rand_pmf(pmf, how_many):
    """Draw ``how_many`` samples from a discrete pmf.

    Parameters
    ----------
    pmf : dict mapping outcome -> probability (probabilities must sum to 1)
    how_many : int, number of draws

    Returns a numpy array of sampled outcomes.

    The outcomes/probabilities are built as concrete lists (rather than
    passing dict views) because ``numpy.random.choice`` rejects
    ``dict_keys`` objects under Python 3.
    """
    outcomes = list(pmf)
    probs = [pmf[o] for o in outcomes]
    return rand.choice(outcomes, how_many, p=probs)
def make_fake_data(how_many, results_pmf, results_effect, agents_effect):
    """Simulate ``how_many`` fake call records.

    Parameters
    ----------
    how_many : int, number of rows to generate
    results_pmf : dict, result name -> probability of that outcome
    results_effect : dict, result name -> (mu, sigma) fixed effect
    agents_effect : dict, agent name -> {result name -> (mu, sigma)} effect

    Returns a DataFrame with columns 'result', 'agent_id',
    'seconds_connected', where each duration is drawn from
    Normal(mu_r + mu_a, sigma_r + sigma_a).
    """
    def gen_seconds_connected(result, agent):
        mu_r, sigma_r = results_effect[result]
        mu_a, sigma_a = agents_effect[agent][result]
        return rand.normal(mu_r + mu_a, sigma_r + sigma_a)

    # Build concrete lists: np.random.choice cannot take a dict_keys view
    # under Python 3.
    result_names = list(results_pmf)
    result_probs = [results_pmf[r] for r in result_names]
    results = rand.choice(result_names, how_many, p=result_probs)
    agents = rand.choice(list(agents_effect), how_many)
    # Materialize the draws: under Python 3, map() is lazy and pandas would
    # receive an exhausted-once iterator instead of a column of values.
    seconds_connected = [gen_seconds_connected(r, a)
                         for r, a in zip(results, agents)]
    return pd.DataFrame({'result': results,
                         'agent_id': agents,
                         'seconds_connected': seconds_connected})
# in this fake dataset the effect of the agent is only on the mean and is fixed across result types
# Generate 10k fake calls: 80% failures (short, ~140s) and 20% successes
# (long, ~850s), with three agents shifting the mean by +50 / -30 / +5 seconds.
fake_df = make_fake_data(10000,
                         results_pmf={'Failure': 0.8, 'Success': 0.2},
                         results_effect={'Failure': (140, 40), 'Success': (850, 200)},
                         agents_effect={'Chatty Cathy': {'Failure': (50,0), 'Success': (50,0)},
                                        'Curt Curtis': {'Failure': (-30,0), 'Success': (-30,0)},
                                        'Steady Stan': {'Failure': (5,0), 'Success': (5,0)}})
# Sanity check: per-agent, per-result summary statistics of the simulated data
fake_df.groupby(['agent_id','result']).describe()
seconds_connected | |||
---|---|---|---|
agent_id | result | ||
Chatty Cathy | Failure | count | 2728.000000 |
mean | 189.775665 | ||
std | 40.134338 | ||
min | 25.435853 | ||
25% | 163.965927 | ||
50% | 190.073333 | ||
75% | 216.683787 | ||
max | 321.470919 | ||
Success | count | 668.000000 | |
mean | 904.878363 | ||
std | 195.134525 | ||
min | 360.958115 | ||
25% | 770.502846 | ||
50% | 902.133720 | ||
75% | 1036.429093 | ||
max | 1570.212090 | ||
Curt Curtis | Failure | count | 2656.000000 |
mean | 108.092979 | ||
std | 40.015300 | ||
min | -37.575129 | ||
25% | 81.483815 | ||
50% | 108.239755 | ||
75% | 134.126299 | ||
max | 228.893524 | ||
Success | count | 645.000000 | |
mean | 815.604842 | ||
std | 202.994399 | ||
min | 192.978100 | ||
25% | 671.515013 | ||
50% | 822.725658 | ||
75% | 949.297296 | ||
max | 1489.916596 | ||
Steady Stan | Failure | count | 2644.000000 |
mean | 145.727715 | ||
std | 40.034219 | ||
min | -16.544193 | ||
25% | 119.198794 | ||
50% | 146.376462 | ||
75% | 172.265891 | ||
max | 272.249799 | ||
Success | count | 659.000000 | |
mean | 863.108611 | ||
std | 200.879465 | ||
min | 236.728991 | ||
25% | 729.875248 | ||
50% | 864.957164 | ||
75% | 1001.389938 | ||
max | 1503.010142 |
# commenting out the log transformation due to an error when invoking find_MAP...
# the distribution of seconds_connected is lognormal
#fake_df['seconds_connected'] = np.log(fake_df['seconds_connected'].values)
#fake_df.groupby(['agent_id','result']).describe()
# We need to map the cols to codes that can be used as indexes in the PyMC model
def lookup_codes(vals):
    """Map each distinct value in ``vals`` (a pandas Series) to a small
    integer code, numbered in first-seen order."""
    return {value: code for code, value in enumerate(vals.unique())}
def add_lookup_codes(df, col):
    """Add an integer-coded companion column ``'<col>_code'`` to ``df``
    in place, using first-seen-order codes from ``lookup_codes``."""
    mapping = lookup_codes(df[col])
    coded = df[col].replace(mapping)
    df[col + '_code'] = coded.values
def add_codes(df):
    """Return a copy of ``df`` with integer code columns added for the
    'result' and 'agent_id' columns (originals are left untouched)."""
    coded_df = df.copy()
    for column in ('result', 'agent_id'):
        add_lookup_codes(coded_df, column)
    return coded_df
def make_model_with_multilevel_agent_effect(original_df):
    """Build a hierarchical PyMC model of call duration.

    seconds_connected ~ Normal(mu_rho[result] + alpha[agent], sigma_rho[result])

    The result type is a per-group intercept; the agent contribution alpha is
    a partially-pooled random effect with shared scale sigma_alpha.

    NOTE(review): the `sd=` keyword is PyMC3-era naming; modern PyMC (v4+)
    uses `sigma=` — confirm against the installed version.
    """
    df = add_codes(original_df)
    # Number of distinct result types / agents — used as shape parameters below
    n_results, n_agents = [len(df[col].unique()) for col in ('result','agent_id')]
    # Integer index arrays (one entry per observation) and the observed durations
    results_idx, agents_idx, seconds_connected = [df[col].values for col in
                                                  ('result_code','agent_id_code', 'seconds_connected')]
    with pm.Model() as model:
        # Hyperpriors: shared scale of the agent random effect
        sigma_a = pm.Uniform('sigma_alpha', lower=0, upper=100)
        # Intercept for each result, distributed around group mean mu_r along
        # with other random effects
        rho = pm.Normal('mu_rho', mu=0, sd=2000, shape=n_results)
        # Agent Effect, aka agent chatty bias (partially pooled via sigma_a)
        alpha = pm.Normal('alpha', mu=0, sd=sigma_a, shape=n_agents)
        # Model error: a separate noise scale per result type
        eps = pm.Uniform('sigma_rho', lower=0, upper=500, shape=n_results)
        # Model prediction of seconds_connected (fixed + random effect)
        seconds_connected_est = rho[results_idx] + alpha[agents_idx]
        # Data likelihood
        like = pm.Normal('seconds_connected_like',
                         mu=seconds_connected_est,
                         sd=eps[results_idx],
                         observed=seconds_connected)
    return model
hierarchical_model = make_model_with_multilevel_agent_effect(fake_df)
with hierarchical_model:
    # PyMC3-era workflow: MAP estimate as the chain's starting point and as
    # the scaling (mass-matrix guess) for the NUTS sampler.
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    # Draw 5000 samples from the posterior
    hierarchical_trace = pm.sample(5000, step, start=start, progressbar=True)
[-----------------100%-----------------] 5000 of 5000 complete in 1047.7 sec
trace = hierarchical_trace
plt.figure(figsize=(7, 7))
# Discard the first 1000 draws as burn-in and plot the remaining posterior trace
pm.traceplot(trace[1000:]);
plt.tight_layout();
<matplotlib.figure.Figure at 0x115fba490>
The posterior plots look like they are fitting the fake data but I am still puzzled by how exactly the traces for the different alpha
and mu_rho
distributions follow each other. They look like offsets of each other. Granted, the agent effect in the fake data is fixed, but I still wouldn't expect the traces to look like that.
plt.figure(figsize=(7, 7))
# Plot only the burn-in portion (first 1000 draws) of the trace for comparison
pm.traceplot(trace[:1000]);
plt.tight_layout();
<matplotlib.figure.Figure at 0x112f3a650>