import sys
sys.path.append('..')
from collections import Counter, OrderedDict
import numpy as np
import scipy as sp
import seaborn as sbn
from matplotlib import pyplot as plt
from trials import Trials
# True conversion rates for the control arm and each variation under test.
params = OrderedDict([('Control', 0.70), ('A', 0.72), ('B', 0.75)])
# One Bernoulli source per arm, used to simulate individual conversions.
sources = OrderedDict(
    (label, sp.stats.bernoulli(rate)) for label, rate in params.items())
samples = 10    # observations drawn per update step
total = 1000    # total observations per experiment path
def do_experiments():
    """Run one simulated multi-armed test, updating `trial` every `samples` draws.

    Returns a dict mapping each metric name to the list of per-update
    evaluations (one entry per batch of `samples` observations).
    """
    trial = Trials(params.keys())
    metric_names = ['expected lift', 'dominance', 'z-test dominance',
                    'empirical lift', 'lift CI']
    metrics = {name: [] for name in metric_names}
    for _ in range(0, total, samples):
        # Draw a fresh batch from every source and record (successes, failures).
        batch = OrderedDict((label, source.rvs(samples))
                            for label, source in sources.items())
        observations = OrderedDict(
            (label, (Counter(draws)[1], Counter(draws)[0]))
            for label, draws in batch.items())
        trial.update(observations)
        # Evaluate every tracked metric after this update.
        for name in metrics:
            metrics[name].append(trial.evaluate(name))
    return metrics
Plot Bayesian (blue) and frequentist z-test (green) dominance p-values in the top subplots, and the 95% credible interval for lift in the bottom ones.
# Run `paths` independent experiment paths per variation and plot the
# dominance p-values (top) and the lift credible band (bottom).
paths = 15
for variation in sources:
    if variation == 'Control':
        continue
    # Do experiments and evaluate the metrics
    arrays = {name: [] for name in ['lifts', 'elifts', 'ps', 'fps', 'ci']}
    for _ in range(paths):
        metrics = do_experiments()
        arrays['lifts'].append([lift[variation] for lift in metrics['expected lift']])
        arrays['elifts'].append([lift[variation] for lift in metrics['empirical lift']])
        arrays['ps'].append([p[variation] for p in metrics['dominance']])
        arrays['fps'].append([p[variation] for p in metrics['z-test dominance']])
        arrays['ci'].append([ci[variation] for ci in metrics['lift CI']])
    for name in arrays:
        arrays[name] = np.array(arrays[name])
    fig, (p_plot, lift_plot) = plt.subplots(2, sharex=True)
    dpi = 118
    # BUG FIX: Figure.set_size_inches() accepts no `dpi` keyword (TypeError);
    # the dpi must be set separately via Figure.set_dpi().
    fig.set_size_inches(1600 / dpi, 800 / dpi)
    fig.set_dpi(dpi)
    plt.xlim(0, total - samples)
    # Relative true lift of this variation over the control.
    lift = (params[variation] - params['Control']) / params['Control']
    # BUG FIX: the original read `len(lifts)` before `lifts` existed
    # (NameError). The x-axis has one point per update: 0, samples, 2*samples…
    xs = list(range(0, total, samples))
    # Reference lines drawn once (the original redrew them on every path).
    p_plot.hlines(0.95, 0, total, color='red', linestyle='--')
    p_plot.hlines(0.05, 0, total, color='red', linestyle='--')
    lift_plot.hlines(lift, 0, total, color='cyan', linestyle='--')
    # Plot p-values
    for ps, fps in zip(arrays['ps'], arrays['fps']):
        p_plot.plot(xs, ps, color='blue')
        p_plot.plot(xs, fps, color='green')
    # Plot lift range (expected-lift paths were collected but never plotted
    # here in the original; only the empirical paths and the CI band are drawn)
    for elifts, cis in zip(arrays['elifts'], arrays['ci']):
        lift_plot.plot(xs, elifts, color='green', alpha=0.05)
        lift_plot.fill_between(xs,
                               [lower for lower, _, _ in cis],
                               [upper for _, _, upper in cis],
                               color='blue', alpha=0.1)
    fig.suptitle('Variation {}. Lift {:.2%}'.format(variation, lift))
    fig.show()
Compare the Bayesian (blue) vs. empirical (green) predicted-lift squared error. A lower bar means lower error (better).
# Accumulate sum-of-squared-errors for the p-values and the lift estimates
# across `paths` runs of every variation, plotting per-update lift error bars.
paths = 25
ps_sses = []
fps_sses = []
lift_sses = []
elift_sses = []
for variation in sources:
    if variation == 'Control':
        continue
    # Do experiments and evaluate the metrics
    arrays = {name: [] for name in ['lifts', 'elifts', 'ps', 'fps']}
    for _ in range(paths):
        metrics = do_experiments()
        arrays['lifts'].append([lift[variation] for lift in metrics['expected lift']])
        arrays['elifts'].append([lift[variation] for lift in metrics['empirical lift']])
        arrays['ps'].append([p[variation] for p in metrics['dominance']])
        arrays['fps'].append([p[variation] for p in metrics['z-test dominance']])
    for name in arrays:
        arrays[name] = np.array(arrays[name])
    # Plot square error
    fig = plt.figure()
    lift_plot = plt.subplot()
    # NOTE(review): bars extend to x ~ 3*len(lifts) (~300 here); this zooms on
    # the first ~17 update steps — confirm the zoom is intentional.
    plt.xlim(0, 50)
    dpi = 118
    # BUG FIX: Figure.set_size_inches() accepts no `dpi` keyword (TypeError);
    # the dpi must be set separately via Figure.set_dpi().
    fig.set_size_inches(1600 / dpi, 800 / dpi)
    fig.set_dpi(dpi)
    lift = (params[variation] - params['Control']) / params['Control']
    # p-value error: squared distance from the ideal p
    # (1 when the variation truly beats control, 0 otherwise).
    for ps, fps in zip(arrays['ps'], arrays['fps']):
        target = 1 if lift > 0 else 0
        ps_sses.append(np.sum((ps - target) ** 2))
        fps_sses.append(np.sum((fps - target) ** 2))
    # Lift error: squared distance of each estimate path from the true lift.
    for lifts, elifts in zip(arrays['lifts'], arrays['elifts']):
        lift_error = (lifts - lift) ** 2
        elift_error = (elifts - lift) ** 2
        lift_sses.append(np.sum(lift_error))
        elift_sses.append(np.sum(elift_error))
        lift_plot.bar(range(0, len(lifts) * 3, 3), lift_error,
                      color='blue', alpha=0.05)
        lift_plot.bar(range(1, len(lifts) * 3 + 1, 3), elift_error,
                      color='green', alpha=0.05)
    fig.suptitle('Variation {}. Lift {:.2%}'.format(variation, lift))
    fig.show()
# Summarize the mean sum-of-squared-errors across all paths and variations.
print(f'Bayesian p-value mean SSE: {np.mean(ps_sses):.4}')
print(f'Frequentist (z-test) p-value mean SSE: {np.mean(fps_sses):.4}')
print(f'Bayesian lift mean SSE: {np.mean(lift_sses):.4}')
print(f'Empirical lift mean SSE: {np.mean(elift_sses):.4}')
Bayesian p-value mean SSE: 11.31
Frequentist (z-test) p-value mean SSE: 11.31
Bayesian lift mean SSE: 0.3694
Empirical lift mean SSE: 0.3981