Based upon section 3 of arXiv:1008.4686
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import emcee
import triangle
import random
Suppose we have a simple regression model
$$ y_i = \beta x_i + \epsilon_i $$with $\epsilon_i$ iid $N(0,\sigma^2)$. However, an unknown number of the $y_i$ are actually drawn from $N(0,\sigma_0^2)$.
def make_example(n, beta, sigma, sigma0):
    # n points with x uniform on [0, 50] and y = beta * x plus N(0, sigma^2) noise
    x = np.random.random(size = n) * 50
    y = beta * x + np.random.normal(scale = sigma, size = n)
    # replace a random fifth of the points with outliers drawn from N(0, sigma0^2)
    indices = random.sample(range(n), n//5)
    y[indices] = np.random.normal(scale = sigma0, size = n//5)
    return x, y
x, y = make_example(50, 0.5, 1, 10)
_ = plt.scatter(x,y)
Our unknown parameters are $\beta, \sigma, \sigma_0$; let us also introduce binary indicator variables $o_1, \dots, o_n \in \{0,1\}$, where $o_i = 1$ indicates that reading $i$ is an "outlier". So the likelihood is
$$ p(y | \beta, \sigma, \sigma_0, o) = \prod_{i=1}^n f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i} $$where $f_i(y) = (2\pi\sigma^2)^{-1/2} \exp\big( -\frac{1}{2\sigma^2} (y_i - \beta x_i)^2 \big)$ and $g_i(y) = (2\pi\sigma_0^2)^{-1/2} \exp\big( -\frac{1}{2\sigma_0^2} y_i^2 \big)$.
For priors, let's choose the Jeffreys prior on $\sigma, \sigma_0$, a flat prior on $\beta$, and a flat prior on $o$, as we have no knowledge of which values are outliers.
def log_likelihood(theta, x, y):
    beta, sigma, sigma0 = theta[:3]
    o = theta[3:]
    # log-density of each point under the linear model and under the outlier model;
    # the common -log(2*pi)/2 constant is dropped, since it cancels wherever it matters
    linear_part = - np.log(sigma * sigma) / 2 - (y - beta * x)**2 / (2 * sigma * sigma)
    outlier_part = - np.log(sigma0 * sigma0) / 2 - y**2 / (2 * sigma0 * sigma0)
    return np.sum( o * outlier_part + (1-o) * linear_part )
def log_prior(theta):
    beta, sigma, sigma0 = theta[:3]
    # Jeffreys priors on sigma and sigma0; flat (improper) priors on beta and the o_i
    return - np.log(abs(sigma)) - np.log(abs(sigma0))
def log_posterior(theta, x, y):
    return log_likelihood(theta, x, y) + log_prior(theta)
From a simulation point of view this is a high-dimensional and partly discrete problem, so I don't think we can just use emcee naively. Instead, let's roll our own MCMC method, using a combination of a Gibbs sampler and a Metropolis sampler.
def jump_index(index, theta, x, y, old_log_prob):
    # Metropolis update of a single coordinate of theta
    theta_new = np.array(theta, dtype=np.float64)
    if index < 3:
        # continuous parameter: Gaussian random-walk proposal
        theta_new[index] += np.random.normal(0, 0.2)
    else:
        # outlier indicator: propose a flip with probability 1/2
        if np.random.random() >= 0.5:
            theta_new[index] = 1 - theta_new[index]
    log_prob = log_posterior(theta_new, x, y)
    alpha = log_prob - old_log_prob
    if alpha >= 0 or alpha >= np.log(np.random.random()):
        return theta_new, log_prob
    return theta, old_log_prob
def jump(theta, x, y, old_log_prob):
    # pick one coordinate uniformly at random and attempt a Metropolis move on it
    index = random.randrange(len(theta))
    return jump_index(index, theta, x, y, old_log_prob)
def walk(theta, x, y, length = 1000):
    old_log_prob = log_posterior(theta, x, y)
    trace = []
    for _ in range(length):
        theta, old_log_prob = jump(theta, x, y, old_log_prob)
        trace.append(theta)
    return np.array(trace)
theta = np.array([0,1,1] + [1]*len(x))
trace = walk(theta, x, y, 1000000)
trace
array([[ 0. , 1. , 1. , ..., 1. , 1. , 1. ], [ 0. , 1. , 1. , ..., 1. , 1. , 1. ], [ 0. , 1. , 1. , ..., 1. , 1. , 1. ], ..., [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ], [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ], [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ]])
# Marginalise out the outlier indicators, discard burn-in, and keep every 100th sample to reduce autocorrelation
param_trace = trace[10000::100, :3]
figure = triangle.corner(param_trace, labels = ["$\\beta$", "$\\sigma$", "$\\sigma_0$"], truths=[0.5, 1, 10])
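As a quick sanity check on the thinning choice (a minimal sketch, not part of the original; the autocorr helper and the lag grid are my own), we can look at the normalised autocorrelation of the thinned $\beta$ samples, which should decay towards zero within a few lags if keeping every 100th draw is enough.
def autocorr(samples, max_lag=20):
    # normalised autocorrelation of a 1-d chain at lags 0, ..., max_lag - 1
    centred = samples - samples.mean()
    var = np.dot(centred, centred) / len(centred)
    return np.array([np.dot(centred[:len(centred) - lag], centred[lag:]) / (len(centred) * var)
                     for lag in range(max_lag)])
fig, ax = plt.subplots()
ax.plot(autocorr(param_trace[:, 0]), "o-")
ax.set_xlabel("lag (thinned samples)")
ax.set_ylabel("autocorrelation of $\\beta$")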
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok")
xfit = np.linspace(0, 50, num=50)
beta = param_trace[:, 0]
yfit = (xfit[:,None]*beta).mean(1)
axes.plot(xfit, yfit)
s = (xfit[:,None]*beta).std(1)
axes.fill_between(xfit, yfit-s, yfit+s, color="lightgray")
outliers = [ (index, trace[:,index+3].mean()) for index in range(len(x)) ]
outliers.sort(key = lambda pair : pair[1])
outliers
[(27, 0.0094120000000000002), (49, 0.0098130000000000005), (44, 0.010803999999999999), (29, 0.011235999999999999), (22, 0.016619999999999999), (9, 0.018874999999999999), (40, 0.019254), (30, 0.022537000000000001), (1, 0.028646999999999999), (36, 0.033452000000000003), (45, 0.036044), (25, 0.036894999999999997), (23, 0.042146999999999997), (5, 0.042412999999999999), (14, 0.045911), (6, 0.047826), (7, 0.048520000000000001), (24, 0.051985000000000003), (47, 0.064698000000000006), (17, 0.072871000000000005), (12, 0.076995999999999995), (3, 0.077533000000000005), (19, 0.087968000000000005), (26, 0.089524000000000006), (35, 0.094821000000000003), (46, 0.095072000000000004), (8, 0.095722000000000002), (41, 0.096133999999999997), (20, 0.098364999999999994), (37, 0.099663000000000002), (0, 0.108279), (38, 0.141537), (34, 0.14313799999999999), (10, 0.16606799999999999), (48, 0.18185899999999999), (28, 0.192745), (16, 0.20547499999999999), (18, 0.23061400000000001), (31, 0.25907400000000003), (32, 0.39712700000000001), (15, 0.49575799999999998), (33, 0.65765799999999996), (13, 0.99680100000000005), (4, 0.99761), (39, 0.99973900000000004), (21, 0.99976100000000001), (11, 0.99986799999999998), (2, 1.0), (42, 1.0), (43, 1.0)]
outlier_indices = [i for (i,t) in outliers if t > 0.9]
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok", color="grey")
axes.plot(x[outlier_indices],y[outlier_indices], "ok", color="red")
The sampling is rather slow. Let's consider the task of updating one of the $o_i$. The posterior distribution is
$$ p \propto \sigma^{-1} \sigma_0^{-1} \prod_{i=1}^n f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i}. $$Fix $i$ and write $\theta$ for all the variables other than $o_i$ (including $y$). Then
$$ p(o_i | \theta ) = \frac{p(o_i, \theta)}{p(\theta)} = \frac{p(o_i, \theta)}{p(0,\theta) + p(1,\theta)}. $$This simplifies a lot to give:
$$ p(o_i|\theta) = \frac{f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i}}{f_i(y|\beta,\sigma) + g_i(y|\sigma_0)}. $$Notice that this conditional distribution only depends on $\beta, \sigma, \sigma_0$, so by updating all the $o_i$ at the same time we take an exact sample from $p(o \mid \beta, \sigma, \sigma_0)$. Thus this new algorithm is, essentially, a Gibbs sampler over the four blocks $\beta$, $\sigma$, $\sigma_0$ and $o$ (with Metropolis steps for the three continuous parameters). We can also exploit numpy magic to make the $(o_i)$ step pretty fast.
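It is convenient to rewrite this conditional in logistic form, which is the expression used in gibbs below (only the difference of the log-densities is exponentiated):
$$ p(o_i = 1 \mid \beta, \sigma, \sigma_0) = \frac{g_i(y|\sigma_0)}{f_i(y|\beta,\sigma) + g_i(y|\sigma_0)} = \frac{1}{1 + \exp\!\big(\log f_i(y|\beta,\sigma) - \log g_i(y|\sigma_0)\big)}. $$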
def gibbs(theta, x, y):
    # Metropolis updates for beta, sigma, sigma0
    old_log_prob = log_posterior(theta, x, y)
    theta, old_log_prob = jump_index(0, theta, x, y, old_log_prob)
    theta, old_log_prob = jump_index(1, theta, x, y, old_log_prob)
    theta, old_log_prob = jump_index(2, theta, x, y, old_log_prob)
    # Exact Gibbs update of the outlier indicators
    beta, sigma, sigma0 = theta[:3]
    o = theta[3:]
    linear_part = - np.log(sigma * sigma) / 2 - (y - beta * x)**2 / (2 * sigma * sigma)
    outlier_part = - np.log(sigma0 * sigma0) / 2 - y**2 / (2 * sigma0 * sigma0)
    # prob = P(o_i = 1); only the difference of the log-densities is exponentiated
    #prob = np.exp(outlier_part) / (np.exp(linear_part) + np.exp(outlier_part))
    prob = 1.0 / (1.0 + np.exp(linear_part - outlier_part))
    o = np.array( np.random.random(size = len(prob)) <= prob, dtype = np.float64 )
    return np.hstack([np.array([beta, sigma, sigma0]), o])
def walk(theta, length = 10000):
    trace = []
    for _ in range(length):
        theta = gibbs(theta, x, y)
        trace.append(theta)
    return np.array(trace)
theta = np.array([0,1,1] + [1]*len(x))
trace = walk(theta, 20000)
# discard burn-in
param_trace = trace[2000:, :3]
figure = triangle.corner(param_trace, labels = ["$\\beta$", "$\\sigma$", "$\\sigma_0$"], truths=[0.5, 1, 10])
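To put numbers next to the corner plot, here is a minimal sketch (the choice of reporting marginal posterior means and standard deviations is mine, not part of the original) comparing each parameter against the true values used in make_example:
for name, true_value, samples in zip(["beta", "sigma", "sigma0"], [0.5, 1, 10], param_trace.T):
    print("{}: posterior mean {:.3f}, sd {:.3f} (true {})".format(
        name, samples.mean(), samples.std(), true_value))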
outliers = [ (index, trace[:,index+3].mean()) for index in range(len(x)) ]
outliers.sort(key = lambda pair : pair[1])
outlier_indices = [i for (i,t) in outliers if t > 0.9]
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok", color="grey")
axes.plot(x[outlier_indices],y[outlier_indices], "ok", color="red")
xfit = np.linspace(0, 50, 50)
beta = trace[:,0]
axes.plot(xfit, xfit*beta.mean())