We have access to many millions of domains, each of which can be broken into smaller segments or even individual URLs. For each ad campaign, how do we pick which parts of which domains to serve ads on? And how much do we value each of those parts?
A multi-armed bandit (MAB) is an online algorithm which chooses from a set of strategies in a sequence of trials so as to maximize the total payoff of the chosen strategies.
Rephrased, it is the problem a gambler faces at a row of slot machines, sometimes known as "one-armed bandits", when deciding which machines to play, how many times to play each machine and in which order to play them. When played, each machine provides a random reward from a distribution specific to that machine. The objective of the gambler is to maximize the sum of rewards earned through a sequence of lever pulls.$^1$
$^1$ Wikipedia
More generally, the framework fits any system that emphasizes continuous improvement, where products remain in a perpetual state of testing and exploration.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy.random as rand
import numpy as np
%matplotlib inline
from scipy.stats import beta
import random as random
import scipy.integrate as integrate
import math as math
import time
from bokeh.plotting import *
from bokeh.objects import Glyph
from bokeh.objects import Range1d
output_notebook(url="default")
Pulling a bandit's arm once is a Bernoulli trial with probability of success $p$:
$$ f(x;p) = p^x(1-p)^{1-x} \hskip2em \text{ for } x \in \{0,1\} $$
The sum of $n$ independent, identically distributed Bernoulli random variables is binomial:
$$ f(x;n,p) = \binom{n}{x} p^x(1-p)^{n-x} \hskip2em \text{ for } x \in \{0,1,2,\dots,n\} $$
The beta distribution is the conjugate prior of the binomial distribution:
$$ f(x;\alpha,\beta) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}x^{\alpha-1}(1-x)^{\beta-1} $$
The mode of the beta distribution is
$$ \frac{\alpha-1}{\alpha+\beta-2} $$
where $\alpha - 1$ is the count of successful trials and $\beta - 1$ is the count of failed trials.

You can also update the beta distribution with information from new trials, $\alpha_{new}$ successes and $\beta_{new}$ failures, to get
$$ f(x;\alpha',\beta') = \frac{\Gamma(\alpha'+\beta')}{\Gamma(\alpha')\Gamma(\beta')}x^{\alpha'-1}(1-x)^{\beta'-1} $$
where $\alpha' = \alpha+\alpha_{new}$ and $\beta' = \beta+\beta_{new}$.
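As a quick, made-up worked example of this update (a minimal sketch using the scipy.stats `beta` imported above; the counts are illustrative only): start from a uniform Beta(1, 1) prior, fold in 7 successes and 3 failures, and read the posterior mode straight off the formula.

from scipy.stats import beta   # already imported above
# Hypothetical conjugate update (illustrative counts only)
a, b = 1.0, 1.0                               # uniform prior: Beta(1, 1)
a_new, b_new = 7.0, 3.0                       # 7 successes, 3 failures observed
a_post = a + a_new                            # alpha' = alpha + alpha_new
b_post = b + b_new                            # beta'  = beta  + beta_new
mode = (a_post - 1) / (a_post + b_post - 2)   # (8-1)/(8+4-2) = 0.7
print 'posterior mode: %s' % mode
print 'posterior mean: %s' % beta.mean(a_post, b_post)   # 8/12 ~= 0.667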
def betaPlot(prob = 0.5, N = 100):
figure(tools="",
title = 'Beta Distribution',
x_axis_label = 'Success Probability',
y_axis_label = 'PDF',
x_range=Range1d(start=0, end=1))
hold()
a = 1
b = 1
x = np.linspace(0, 1, 100)
line(x, beta.pdf(x,a,b), color="#2E2EFE", plot_width=800, plot_height=500, legend='Beta Posterior')
line([prob,prob],[0,10], color="#FE2E2E", line_dash = 'dashed', legend='True Mean')
grid().grid_line_alpha=0.3
renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][0]
ds1 = renderer1.data_source
show()
for i in range(N):
test = rand.rand() < prob
if i < 6:
print 'a = %s and b = %s' %(a,b)
raw_input("")
print 'Payoff: %s' %test.real
a += test
b += not(test)
ds1.data["x"] = x
ds1.data["y"] = beta.pdf(x,a,b)
ds1._dirty = True
cursession().store_objects(ds1)
time.sleep(0.1)
betaPlot()
class MAB(object):
def __init__(self,numArms):
self.armPayoffs = rand.uniform(0,1,numArms)
self.successes = np.ones(numArms)
self.failures = np.ones(numArms)
def pullArm(self,arm):
test = rand.uniform()
result = test < self.armPayoffs[arm]
bestResult = test < max(self.armPayoffs)
regret = bestResult - result
self.successes[arm] += result
self.failures[arm] += not(result)
return result, regret
The standard way to measure a bandit strategy's performance is $regret$:
$$R_T = \sum^T_{i=1}\left(m^*-m_{B(i)}\right)$$
where $m^*$ is the payout of the optimal arm and $m_{B(i)}$ is the payout of the arm $B(i)$ chosen on trial $i$.
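For intuition, here is a hedged, made-up example of the expected regret of a fixed (non-adaptive) policy; the simulations below instead record a realized per-pull regret by comparing the chosen arm against the best arm on the same random draw.

import numpy as np   # already imported above
# Hypothetical expected-regret calculation (illustrative payoffs only)
m = np.array([0.5, 0.3, 0.7])            # made-up true arm payoffs
m_star = m.max()                          # optimal arm's payoff, m* = 0.7
pulls_per_arm = np.array([100, 0, 0])     # a policy that always pulls arm 1
expected_regret = np.sum(pulls_per_arm * (m_star - m))   # 100 * 0.2 = 20.0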
def strategyStratifiedSample(mab):
samples = mab.successes + mab.failures
bestEstArm = samples == min(samples)
return random.sample(np.where(bestEstArm)[0], 1)[0]
def mab3Plot(strategy, pulls = 100):
figure(tools="",
title = 'Multi-Armed Bandit',
x_axis_label = 'Success Probability',
y_axis_label = 'PDF',
x_range=Range1d(start=0, end=1))
hold()
mab = MAB(3)
x = np.linspace(0, 1, 100)
line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior')
line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth')
line(x, beta.pdf(x,mab.successes[1],mab.failures[1]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior')
line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth')
line(x, beta.pdf(x,mab.successes[2],mab.failures[2]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior')
line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth')
grid().grid_line_alpha=0.3
renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0]
ds0 = renderer0.data_source
renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2]
ds2 = renderer2.data_source
renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4]
ds4 = renderer4.data_source
show()
for i in range(pulls):
arm = strategy(mab)
result, regret = mab.pullArm(arm)
ds0.data["x"] = x
ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0])
ds0._dirty = True
ds2.data["x"] = x
ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1])
ds2._dirty = True
ds4.data["x"] = x
ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2])
ds4._dirty = True
cursession().store_objects(ds0)
cursession().store_objects(ds2)
cursession().store_objects(ds4)
time.sleep(0.01)
mab3Plot(strategy=strategyStratifiedSample)
def stratifiedRegret(cdata):
figure(tools="",
title = 'Multi-Armed Bandit Regret',
x_axis_label = 'Iteration',
y_axis_label = 'Regret',
x_range=Range1d(start=0, end=len(cdata)),
y_range=Range1d(start=0, end=max(cdata)))
hold()
line(range(len(cdata)), cdata, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample')
grid().grid_line_alpha=0.3
legend()[0].orientation = "top_left"
show()
mab = MAB(3)
pulls = 1000
data = np.zeros(pulls)
for i in range(pulls):
arm = strategyStratifiedSample(mab)
result, regret = mab.pullArm(arm)
data[i] = regret
cdata = np.cumsum(data)
print mab.armPayoffs
stratifiedRegret(cdata)
[ 0.26952967 0.25392393 0.53431951]
def strategyGreedy(mab):
rates = mab.successes/(mab.successes + mab.failures)
bestEstArm = rates == max(rates)
return random.sample(np.where(bestEstArm)[0], 1)[0]
mab3Plot(strategy=strategyGreedy)
def strategyEpsilonGreedy(mab, epsilon = 0.1):
if random.random() < epsilon:
return random.randint(0,len(mab.successes)-1)
else:
return strategyGreedy(mab)
def strategyEpsilonFirst(mab, pulls = 100, epsilon = 0.1):
trials = pulls*epsilon
samples = mab.successes + mab.failures
if sum(samples) < trials:
return strategyStratifiedSample(mab)
else:
return strategyGreedy(mab)
mab3Plot(strategy=strategyEpsilonFirst)
def basicRegretPlot(data):
randBandit = np.cumsum(data.StratifiedSample.astype(float))
greedyBandit = np.cumsum(data.Greedy.astype(float))
epsGreedyBandit = np.cumsum(data.EpsilonGreedy.astype(float))
epsFirstBandit = np.cumsum(data.EpsilonFirst.astype(float))
figure(tools="",
title = 'Multi-Armed Bandit Regret',
x_axis_label = 'Iteration',
y_axis_label = 'Regret',
x_range=Range1d(start=0, end=len(data)),
y_range=Range1d(start=0, end=max(randBandit)))
hold()
line(range(len(randBandit)), randBandit, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample')
line(range(len(randBandit)), greedyBandit, color="#AC58FA", plot_width=800, plot_height=500, legend='Greedy')
line(range(len(randBandit)), epsGreedyBandit, color="#0080FF", plot_width=800, plot_height=500, legend='Epsilon Greedy')
line(range(len(randBandit)), epsFirstBandit, color="#01DF01", plot_width=800, plot_height=500, legend='Epsilon First')
grid().grid_line_alpha=0.3
legend()[0].orientation = "top_left"
show()
class MABs(object):
def __init__(self,armPayoffs):
self.armPayoffs = armPayoffs
self.successes = np.ones(len(armPayoffs))
self.failures = np.ones(len(armPayoffs))
def pullArm(self,arm,test):
result = test < self.armPayoffs[arm]
bestResult = test < max(self.armPayoffs)
regret = bestResult - result
self.successes[arm] += result
self.failures[arm] += not(result)
return result, regret
def genBasicData(numArms = 3, pulls = 100):
armPayoffs = rand.uniform(0,1,numArms)
mab0 = MABs(armPayoffs)
mab1 = MABs(armPayoffs)
mab2 = MABs(armPayoffs)
mab3 = MABs(armPayoffs)
cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst']
data = pd.DataFrame(columns=cols)
for i in range(pulls):
test = rand.uniform()
arm0 = strategyStratifiedSample(mab0)
result0, regret0 = mab0.pullArm(arm0, test)
arm1 = strategyGreedy(mab1)
result1, regret1 = mab1.pullArm(arm1, test)
arm2 = strategyEpsilonGreedy(mab2, 0.1)
result2, regret2 = mab2.pullArm(arm2, test)
arm3 = strategyEpsilonFirst(mab3, pulls, 0.1)
result3, regret3 = mab3.pullArm(arm3, test)
row = pd.DataFrame([dict(StratifiedSample = regret0,
Greedy = regret1,
EpsilonGreedy = regret2,
EpsilonFirst = regret3)])
data = data.append(row, ignore_index=True)
#print mab0.armPayoffs
return data
data = genBasicData()
basicRegretPlot(data)
def basicSimulation(sims = 100):
data = genBasicData()
for i in range(1,sims):
data += genBasicData()
return data/float(sims)
data = basicSimulation()
basicRegretPlot(data)
The binomial bandit assumes that the rewards in each configuration are independent Bernoulli random variables with success probabilities ($\theta_1,...,\theta_k$)$^1$.
Assume a simple uninformative prior of $\theta_a \sim U(0,1)$, with all $\theta_i$ independent.
Let $Y_{at}$ and $N_{at}$ denote the cumulative number of successes and trials observed for arm $a$ up to time $t$. Then the posterior distribution of $\theta = (\theta_1,...,\theta_k) $ is
$$ p(\theta|\boldsymbol y_t) = \prod^k_{a=1}B(\theta_a|Y_{at}+1,N_{at}-Y_{at}+1) $$
The probability that arm $a$ is the optimal arm, given all the information available at time $t$, is then
$$ w_{at} = \int^1_0 B(\theta_a|Y_{at}+1,N_{at}-Y_{at}+1) \prod_{j \ne a} P(\theta_j<\theta_a|Y_{jt}+1,N_{jt}-Y_{jt}+1) \, d\theta_a $$
$^1$ 'A modern Bayesian look at the multi-armed bandit' - Steven Scott
def bpmProbs(mab):
y = mab.successes-1
n = mab.successes + mab.failures - 2
k = len(y)
ans = np.zeros(k)
for i in range(k):
indx = range(k)
del indx[i]
def f(x):
r = beta.pdf(x, y[i]+1, n[i]-y[i]+1)
for j in indx:
r = r * beta.cdf(x, y[j]+1, n[j]-y[j]+1)
return r
ans[i] = integrate.quad(f,0,1)[0]
return ans
def weightedSample(weights):
'''Return the index to sample based on weight vector'''
r = random.random()
cweight = np.cumsum(weights)
choice = min(cweight[np.where(cweight >= r)])
return np.where(cweight == choice)[0][0]
pd.Series([weightedSample([0.333,0.333,0.334]) for _ in range(10000)]).hist(figsize=(12,7))
#pd.Series([weightedSample([0.8,0.1,0.1]) for _ in range(10000)]).hist(figsize=(12,7))
def StrategyBPM(mab):
weights = bpmProbs(mab)
return weightedSample(weights)
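The quadrature in bpmProbs does one one-dimensional integral per arm, with a (k-1)-term product inside each integrand, so it gets expensive as the number of arms grows. A cheaper route (a sketch only, not used in the plots below, and assuming the same successes/failures bookkeeping as the MAB classes above) is to estimate the same optimality probabilities by Monte Carlo, or to skip the explicit weights entirely with Thompson sampling: draw one value from each arm's posterior and play the argmax, which selects arm $a$ with probability $w_{at}$.

import numpy as np              # already imported above
from scipy.stats import beta    # already imported above

def strategyThompson(mab):
    # Draw one sample from each arm's Beta posterior and play the best draw;
    # this picks arm a with probability w_at without any numerical integration.
    draws = [beta.rvs(s, f) for s, f in zip(mab.successes, mab.failures)]
    return int(np.argmax(draws))

def bpmProbsMC(mab, draws = 10000):
    # Monte Carlo estimate of the optimality probabilities w_at
    # (an alternative to the quadrature in bpmProbs; 10000 draws is a made-up default)
    samples = np.vstack([beta.rvs(s, f, size=draws)
                         for s, f in zip(mab.successes, mab.failures)])
    wins = np.argmax(samples, axis=0)
    return np.bincount(wins, minlength=len(mab.successes))/float(draws)

Plugging strategyThompson into mab3Plot should behave much like the BPM plot below, just without the integration cost.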
mab3Plot(strategy=StrategyBPM)
def genBasicData2(numArms = 3, pulls = 100):
armPayoffs = rand.uniform(0,1,numArms)
mab0 = MABs(armPayoffs)
mab1 = MABs(armPayoffs)
mab2 = MABs(armPayoffs)
mab3 = MABs(armPayoffs)
mab4 = MABs(armPayoffs)
cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM']
data = pd.DataFrame(columns=cols)
for i in range(pulls):
test = rand.uniform()
arm0 = strategyStratifiedSample(mab0)
result0, regret0 = mab0.pullArm(arm0, test)
arm1 = strategyGreedy(mab1)
result1, regret1 = mab1.pullArm(arm1, test)
arm2 = strategyEpsilonGreedy(mab2, 0.1)
result2, regret2 = mab2.pullArm(arm2, test)
arm3 = strategyEpsilonFirst(mab3, pulls, 0.1)
result3, regret3 = mab3.pullArm(arm3, test)
arm4 = StrategyBPM(mab4)
result4, regret4 = mab4.pullArm(arm4, test)
row = pd.DataFrame([dict(StratifiedSample = regret0,
Greedy = regret1,
EpsilonGreedy = regret2,
EpsilonFirst = regret3,
BPM = regret4)])
data = data.append(row, ignore_index=True)
#print mab0.armPayoffs
return data
def basicSimulation2(sims = 100):
data = genBasicData2()
dataseries = []
dataseries.append(data.sum())
for i in range(1,sims):
d = genBasicData2()
data += d
dataseries.append(d.sum())
return data/float(sims), dataseries
def basicRegretPlot2(data):
randBandit = np.cumsum(data.StratifiedSample.astype(float))
greedyBandit = np.cumsum(data.Greedy.astype(float))
epsGreedyBandit = np.cumsum(data.EpsilonGreedy.astype(float))
epsFirstBandit = np.cumsum(data.EpsilonFirst.astype(float))
BPMBandit = np.cumsum(data.BPM.astype(float))
figure(tools="",
title = 'Multi-Armed Bandit Regret',
x_axis_label = 'Iteration',
y_axis_label = 'Regret',
x_range=Range1d(start=0, end=len(data)),
y_range=Range1d(start=0, end=max(randBandit)))
hold()
line(range(len(randBandit)), randBandit, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample')
line(range(len(randBandit)), greedyBandit, color="#AC58FA", plot_width=800, plot_height=500, legend='Greedy')
line(range(len(randBandit)), epsGreedyBandit, color="#0080FF", plot_width=800, plot_height=500, legend='Epsilon Greedy')
line(range(len(randBandit)), epsFirstBandit, color="#01DF01", plot_width=800, plot_height=500, legend='Epsilon First')
line(range(len(randBandit)), BPMBandit, color="#FF0000", plot_width=800, plot_height=500, legend='BPM')
grid().grid_line_alpha=0.3
legend()[0].orientation = "top_left"
show()
data, dataseries = basicSimulation2()
basicRegretPlot2(data)
pd.concat(dataseries,axis=1).mean(axis=1)
BPM                  4.83
EpsilonFirst         4.63
EpsilonGreedy        5.25
Greedy               5.15
StratifiedSample    25.72
dtype: float64
pd.concat(dataseries,axis=1).var(axis=1)
BPM                   8.869798
EpsilonFirst         62.437475
EpsilonGreedy        26.593434
Greedy               91.684343
StratifiedSample    194.789495
dtype: float64
def genBasicData3(numArms = 3, pulls = 100):
armPayoffs = rand.uniform(0,1,numArms)
mab0 = MABs(armPayoffs)
mab1 = MABs(armPayoffs)
mab2 = MABs(armPayoffs)
mab3 = MABs(armPayoffs)
mab4 = MABs(armPayoffs)
cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM']
data = pd.DataFrame(columns=cols)
for i in range(pulls):
if i == pulls/2:
mab0.armPayoffs = mab1.armPayoffs = mab2.armPayoffs = \
mab3.armPayoffs = mab4.armPayoffs = rand.uniform(0,1,numArms)
test = rand.uniform()
arm0 = strategyStratifiedSample(mab0)
result0, regret0 = mab0.pullArm(arm0, test)
arm1 = strategyGreedy(mab1)
result1, regret1 = mab1.pullArm(arm1, test)
arm2 = strategyEpsilonGreedy(mab2, 0.1)
result2, regret2 = mab2.pullArm(arm2, test)
arm3 = strategyEpsilonFirst(mab3, pulls, 0.1)
result3, regret3 = mab3.pullArm(arm3, test)
arm4 = StrategyBPM(mab4)
result4, regret4 = mab4.pullArm(arm4, test)
row = pd.DataFrame([dict(StratifiedSample = regret0,
Greedy = regret1,
EpsilonGreedy = regret2,
EpsilonFirst = regret3,
BPM = regret4)])
data = data.append(row, ignore_index=True)
#print mab0.armPayoffs
return data
def basicSimulation3(sims = 100):
data = genBasicData3()
for i in range(1,sims):
data += genBasicData3()
return data/float(sims)
data = basicSimulation3()
basicRegretPlot2(data)
def mab3PlotVarying(strategy, pulls = 100):
figure(tools="",
title = 'Multi-Armed Bandit',
x_axis_label = 'Success Probability',
y_axis_label = 'PDF',
x_range=Range1d(start=0, end=1))
hold()
armPayoffs = np.array([0, 0.5, 0])
x1 = np.linspace(0, 1.5, 100)
payoff0 = map(math.sin, x1)
payoff2 = map(math.cos, x1)
armPayoffs[0] = payoff0[0]
armPayoffs[2] = payoff2[0]
mab = MABs(armPayoffs)
x = np.linspace(0, 1, 100)
line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior')
line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth')
line(x, beta.pdf(x,mab.successes[1],mab.failures[1]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior')
line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth')
line(x, beta.pdf(x,mab.successes[2],mab.failures[2]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior')
line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth')
grid().grid_line_alpha=0.3
renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0]
ds0 = renderer0.data_source
renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][1]
ds1 = renderer1.data_source
renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2]
ds2 = renderer2.data_source
renderer3 = [r for r in curplot().renderers if isinstance(r, Glyph)][3]
ds3 = renderer3.data_source
renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4]
ds4 = renderer4.data_source
renderer5 = [r for r in curplot().renderers if isinstance(r, Glyph)][5]
ds5 = renderer5.data_source
show()
for i in range(pulls):
test = rand.uniform()
armPayoffs[0] = payoff0[i]
armPayoffs[2] = payoff2[i]
mab.armPayoffs = armPayoffs
arm = strategy(mab)
result, regret = mab.pullArm(arm, test)
ds0.data["x"] = x
ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0])
ds0._dirty = True
ds1.data["x"] = [payoff0[i],payoff0[i]]
ds1.data["y"] = [0,10]
ds1._dirty = True
ds2.data["x"] = x
ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1])
ds2._dirty = True
ds3.data["x"] = [0.5,0.5]
ds3.data["y"] = [0,10]
ds3._dirty = True
ds4.data["x"] = x
ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2])
ds4._dirty = True
ds5.data["x"] = [payoff2[i],payoff2[i]]
ds5.data["y"] = [0,10]
ds5._dirty = True
cursession().store_objects(ds0)
cursession().store_objects(ds1)
cursession().store_objects(ds2)
cursession().store_objects(ds3)
cursession().store_objects(ds4)
cursession().store_objects(ds5)
time.sleep(0.01)
mab3PlotVarying(strategy=StrategyBPM)
class MABdecay(object):
def __init__(self,armPayoffs,decay):
self.armPayoffs = armPayoffs
self.successes = np.ones(len(armPayoffs))
self.failures = np.ones(len(armPayoffs))
self.decay = decay
def pullArm(self,arm,test):
result = test < self.armPayoffs[arm]
bestResult = test < max(self.armPayoffs)
regret = bestResult - result
self.successes[arm] += result
self.failures[arm] += not(result)
# geometrically decay this arm's counts (with a floor of 1) so older evidence is gradually forgotten
self.successes[arm] = max(1, (1-self.decay)*self.successes[arm])
self.failures[arm] = max(1, (1-self.decay)*self.failures[arm])
return result, regret
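The geometric decay in MABdecay caps how much evidence an arm can accumulate: ignoring the floor at 1, an arm's total count $c$ (successes plus failures) satisfies roughly $c \leftarrow (1-d)(c+1)$ each time it is pulled, where $d$ is the decay rate, with fixed point $(1-d)/d$. With the 0.1 decay used below that is about 9 effective observations, so the posterior never collapses to a point and the bandit keeps re-exploring as the true payoffs drift. A rough sketch of that cap:

# Effective sample-size cap under geometric decay (rough sketch, ignoring the
# floor at 1 being applied to successes and failures separately)
decay = 0.1
c = 2.0                                  # counts start at 1 success + 1 failure
for _ in range(200):
    c = max(1.0, (1 - decay)*(c + 1))    # one pull: increment, then decay
print c, (1 - decay)/decay               # both approach 9.0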
def mab3PlotVarying2(strategy, pulls = 100, decay = 0.1):
figure(tools="",
title = 'Multi-Armed Bandit',
x_axis_label = 'Success Probability',
y_axis_label = 'PDF',
x_range=Range1d(start=0, end=1))
hold()
armPayoffs = np.array([0, 0.5, 0])
x1 = np.linspace(0, 1.5, 100)
payoff0 = map(math.sin, x1)
payoff2 = map(math.cos, x1)
armPayoffs[0] = payoff0[0]
armPayoffs[2] = payoff2[0]
mab = MABdecay(armPayoffs, decay)
x = np.linspace(0, 1, 100)
line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior')
line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth')
line(x, beta.pdf(x,mab.successes[1],mab.failures[1]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior')
line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth')
line(x, beta.pdf(x,mab.successes[2],mab.failures[2]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior')
line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth')
grid().grid_line_alpha=0.3
renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0]
ds0 = renderer0.data_source
renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][1]
ds1 = renderer1.data_source
renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2]
ds2 = renderer2.data_source
renderer3 = [r for r in curplot().renderers if isinstance(r, Glyph)][3]
ds3 = renderer3.data_source
renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4]
ds4 = renderer4.data_source
renderer5 = [r for r in curplot().renderers if isinstance(r, Glyph)][5]
ds5 = renderer5.data_source
show()
for i in range(pulls):
test = rand.uniform()
armPayoffs[0] = payoff0[i]
armPayoffs[2] = payoff2[i]
mab.armPayoffs = armPayoffs
arm = strategy(mab)
result, regret = mab.pullArm(arm, test)
ds0.data["x"] = x
ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0])
ds0._dirty = True
ds1.data["x"] = [payoff0[i],payoff0[i]]
ds1.data["y"] = [0,10]
ds1._dirty = True
ds2.data["x"] = x
ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1])
ds2._dirty = True
ds3.data["x"] = [0.5,0.5]
ds3.data["y"] = [0,10]
ds3._dirty = True
ds4.data["x"] = x
ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2])
ds4._dirty = True
ds5.data["x"] = [payoff2[i],payoff2[i]]
ds5.data["y"] = [0,10]
ds5._dirty = True
cursession().store_objects(ds0)
cursession().store_objects(ds1)
cursession().store_objects(ds2)
cursession().store_objects(ds3)
cursession().store_objects(ds4)
cursession().store_objects(ds5)
time.sleep(0.01)
mab3PlotVarying2(strategy=StrategyBPM)
def genBasicData4(numArms = 3, pulls = 100, decay = 0.1):
armPayoffs = [0, 0.5, 0]
x = np.linspace(0, 2, 100)
payoff0 = map(math.sin, x)
payoff2 = map(math.cos, x)
armPayoffs[0] = payoff0[0]
armPayoffs[2] = payoff2[0]
mab0 = MABdecay(armPayoffs, decay)
mab1 = MABdecay(armPayoffs, decay)
mab2 = MABdecay(armPayoffs, decay)
mab3 = MABdecay(armPayoffs, decay)
mab4 = MABdecay(armPayoffs, decay)
cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM']
data = pd.DataFrame(columns=cols)
for i in range(pulls):
armPayoffs[0] = payoff0[i]
armPayoffs[2] = payoff2[i]
mab0.armPayoffs = mab1.armPayoffs = mab2.armPayoffs = \
mab3.armPayoffs = mab4.armPayoffs = armPayoffs
test = rand.uniform()
arm0 = strategyStratifiedSample(mab0)
result0, regret0 = mab0.pullArm(arm0, test)
arm1 = strategyGreedy(mab1)
result1, regret1 = mab1.pullArm(arm1, test)
arm2 = strategyEpsilonGreedy(mab2, 0.1)
result2, regret2 = mab2.pullArm(arm2, test)
arm3 = strategyEpsilonFirst(mab3, pulls, 0.1)
result3, regret3 = mab3.pullArm(arm3, test)
arm4 = StrategyBPM(mab4)
result4, regret4 = mab4.pullArm(arm4, test)
row = pd.DataFrame([dict(StratifiedSample = regret0,
Greedy = regret1,
EpsilonGreedy = regret2,
EpsilonFirst = regret3,
BPM = regret4)])
data = data.append(row, ignore_index=True)
return data
def basicSimulation4(sims = 100):
data = genBasicData4()
dataseries = []
dataseries.append(data.sum())
for i in range(1,sims):
d = genBasicData4()
data += d
dataseries.append(d.sum())
return data/float(sims), dataseries
data, dataseries = basicSimulation4()
basicRegretPlot2(data)
pd.concat(dataseries,axis=1).mean(axis=1)
BPM                 13.71
EpsilonFirst        24.70
EpsilonGreedy       14.07
Greedy              14.39
StratifiedSample    37.00
dtype: float64
pd.concat(dataseries,axis=1).var(axis=1)
BPM                 25.420101
EpsilonFirst        38.191919
EpsilonGreedy       65.479899
Greedy              61.715051
StratifiedSample    14.989899
dtype: float64