import pandas as pd
import numpy as np
import pymc as pm
import numpy.random as rand
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
This notebook goes along with this Stackoverflow question. I'm using fake data here but the oddities in the traceplots are similar in real data as well.
I'm creating a hierarchical model for call times in a call center. In my case, the biggest factor of a call's time is the outcome of a call. Here I'm only considering Failure
and Success
outcomes. The next largest effect is the agent who the call was routed to, which I'm modeling as a random effect. So I have:
where $r$ denotes the result type and $a$ a particular agent.
def keys_n_vals(m):
    """Split a mapping into parallel lists of keys and values.

    Returns a tuple ``(keys, values)`` where ``values[i]`` is ``m[keys[i]]``.
    Keys are materialized with ``list()`` because under Python 3
    ``dict.keys()`` returns a view, which ``numpy.random.choice`` (used by
    callers of this helper) rejects as a non-1-dimensional input.
    """
    keys = list(m.keys())
    return (keys, [m[k] for k in keys])
def rand_pmf(pmf, how_many):
    """Draw ``how_many`` samples from a discrete pmf.

    Parameters
    ----------
    pmf : dict mapping outcome -> probability (probabilities must sum to 1)
    how_many : int, number of draws

    Returns a numpy array of sampled outcomes.

    The outcomes/probabilities are built as concrete lists (rather than
    passing dict views) because ``numpy.random.choice`` rejects
    ``dict_keys`` objects under Python 3.
    """
    outcomes = list(pmf)
    probs = [pmf[o] for o in outcomes]
    return rand.choice(outcomes, how_many, p=probs)
def make_fake_data(how_many, results_pmf, results_effect, agents_effect):
    """Simulate ``how_many`` fake call records.

    Parameters
    ----------
    how_many : int, number of rows to generate
    results_pmf : dict, result name -> probability of that outcome
    results_effect : dict, result name -> (mu, sigma) fixed effect
    agents_effect : dict, agent name -> {result name -> (mu, sigma)} effect

    Returns a DataFrame with columns 'result', 'agent_id',
    'seconds_connected', where each duration is drawn from
    Normal(mu_r + mu_a, sigma_r + sigma_a).
    """
    def gen_seconds_connected(result, agent):
        mu_r, sigma_r = results_effect[result]
        mu_a, sigma_a = agents_effect[agent][result]
        return rand.normal(mu_r + mu_a, sigma_r + sigma_a)

    # Build concrete lists: np.random.choice cannot take a dict_keys view
    # under Python 3.
    result_names = list(results_pmf)
    result_probs = [results_pmf[r] for r in result_names]
    results = rand.choice(result_names, how_many, p=result_probs)
    agents = rand.choice(list(agents_effect), how_many)
    # Materialize the draws: under Python 3, map() is lazy and pandas would
    # receive an exhausted-once iterator instead of a column of values.
    seconds_connected = [gen_seconds_connected(r, a)
                         for r, a in zip(results, agents)]
    return pd.DataFrame({'result': results,
                         'agent_id': agents,
                         'seconds_connected': seconds_connected})
# in this fake dataset the effect of the agent is only on the mean and is fixed across result types
# Generate 10k fake calls: 80% failures (short, ~140s) and 20% successes
# (long, ~850s), with three agents shifting the mean by +50 / -30 / +5 seconds.
fake_df = make_fake_data(10000,
                         results_pmf={'Failure': 0.8, 'Success': 0.2},
                         results_effect={'Failure': (140, 40), 'Success': (850, 200)},
                         agents_effect={'Chatty Cathy': {'Failure': (50,0), 'Success': (50,0)},
                                        'Curt Curtis': {'Failure': (-30,0), 'Success': (-30,0)},
                                        'Steady Stan': {'Failure': (5,0), 'Success': (5,0)}})
# Sanity check: per-agent, per-result summary statistics of the simulated data
fake_df.groupby(['agent_id','result']).describe()
seconds_connected | |||
---|---|---|---|
agent_id | result | ||
Chatty Cathy | Failure | count | 2728.000000 |
mean | 189.775665 | ||
std | 40.134338 | ||
min | 25.435853 | ||
25% | 163.965927 | ||
50% | 190.073333 | ||
75% | 216.683787 | ||
max | 321.470919 | ||
Success | count | 668.000000 | |
mean | 904.878363 | ||
std | 195.134525 | ||
min | 360.958115 | ||
25% | 770.502846 | ||
50% | 902.133720 | ||
75% | 1036.429093 | ||
max | 1570.212090 | ||
Curt Curtis | Failure | count | 2656.000000 |
mean | 108.092979 | ||
std | 40.015300 | ||
min | -37.575129 | ||
25% | 81.483815 | ||
50% | 108.239755 | ||
75% | 134.126299 | ||
max | 228.893524 | ||
Success | count | 645.000000 | |
mean | 815.604842 | ||
std | 202.994399 | ||
min | 192.978100 | ||
25% | 671.515013 | ||
50% | 822.725658 | ||
75% | 949.297296 | ||
max | 1489.916596 | ||
Steady Stan | Failure | count | 2644.000000 |
mean | 145.727715 | ||
std | 40.034219 | ||
min | -16.544193 | ||
25% | 119.198794 | ||
50% | 146.376462 | ||
75% | 172.265891 | ||
max | 272.249799 | ||
Success | count | 659.000000 | |
mean | 863.108611 | ||
std | 200.879465 | ||
min | 236.728991 | ||
25% | 729.875248 | ||
50% | 864.957164 | ||
75% | 1001.389938 | ||
max | 1503.010142 |
# commenting out the log transformation due to an error when invoking find_MAP...
# the distribution of seconds_connected is lognormal
#fake_df['seconds_connected'] = np.log(fake_df['seconds_connected'].values)
#fake_df.groupby(['agent_id','result']).describe()
# We need to map the cols to codes that can be used as indexes in the PyMC model
def lookup_codes(vals):
    """Map each distinct value in ``vals`` (a pandas Series) to a small
    integer code, numbered in first-seen order."""
    return {value: code for code, value in enumerate(vals.unique())}
def add_lookup_codes(df, col):
    """Add an integer-coded companion column ``'<col>_code'`` to ``df``
    in place, using first-seen-order codes from ``lookup_codes``."""
    mapping = lookup_codes(df[col])
    coded = df[col].replace(mapping)
    df[col + '_code'] = coded.values
def add_codes(df):
    """Return a copy of ``df`` with integer code columns added for the
    'result' and 'agent_id' columns (originals are left untouched)."""
    coded_df = df.copy()
    for column in ('result', 'agent_id'):
        add_lookup_codes(coded_df, column)
    return coded_df
def make_model_with_multilevel_agent_effect(original_df):
    """Build a hierarchical PyMC model of call duration.

    seconds_connected ~ Normal(mu_rho[result] + alpha[agent], sigma_rho[result])

    The result type is a per-group intercept; the agent contribution alpha is
    a partially-pooled random effect with shared scale sigma_alpha.

    NOTE(review): the `sd=` keyword is PyMC3-era naming; modern PyMC (v4+)
    uses `sigma=` — confirm against the installed version.
    """
    df = add_codes(original_df)
    # Number of distinct result types / agents — used as shape parameters below
    n_results, n_agents = [len(df[col].unique()) for col in ('result','agent_id')]
    # Integer index arrays (one entry per observation) and the observed durations
    results_idx, agents_idx, seconds_connected = [df[col].values for col in
                                                  ('result_code','agent_id_code', 'seconds_connected')]
    with pm.Model() as model:
        # Hyperpriors: shared scale of the agent random effect
        sigma_a = pm.Uniform('sigma_alpha', lower=0, upper=100)
        # Intercept for each result, distributed around group mean mu_r along
        # with other random effects
        rho = pm.Normal('mu_rho', mu=0, sd=2000, shape=n_results)
        # Agent Effect, aka agent chatty bias (partially pooled via sigma_a)
        alpha = pm.Normal('alpha', mu=0, sd=sigma_a, shape=n_agents)
        # Model error: a separate noise scale per result type
        eps = pm.Uniform('sigma_rho', lower=0, upper=500, shape=n_results)
        # Model prediction of seconds_connected (fixed + random effect)
        seconds_connected_est = rho[results_idx] + alpha[agents_idx]
        # Data likelihood
        like = pm.Normal('seconds_connected_like',
                         mu=seconds_connected_est,
                         sd=eps[results_idx],
                         observed=seconds_connected)
    return model
hierarchical_model = make_model_with_multilevel_agent_effect(fake_df)
with hierarchical_model:
    # PyMC3-era workflow: MAP estimate as the chain's starting point and as
    # the scaling (mass-matrix guess) for the NUTS sampler.
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    # Draw 5000 samples from the posterior
    hierarchical_trace = pm.sample(5000, step, start=start, progressbar=True)
[-----------------100%-----------------] 5000 of 5000 complete in 1047.7 sec
trace = hierarchical_trace
plt.figure(figsize=(7, 7))
# Discard the first 1000 draws as burn-in and plot the remaining posterior trace
pm.traceplot(trace[1000:]);
plt.tight_layout();
<matplotlib.figure.Figure at 0x115fba490>
The posterior plots look like they are fitting the fake data but I am still puzzled by how exactly the traces for the different alpha
and mu_rho
distributions follow each other. They look like offsets of each other. Granted, the agent effect in the fake data is fixed, but I still wouldn't expect the traces to look like that.
plt.figure(figsize=(7, 7))
# Plot only the burn-in portion (first 1000 draws) of the trace for comparison
pm.traceplot(trace[:1000]);
plt.tight_layout();
<matplotlib.figure.Figure at 0x112f3a650>