Based upon section 3 of arXiv:1008.4686
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import emcee
import triangle
import random
Suppose we have a simple regression model
$$ y_i = \beta x_i + \epsilon_i $$with $\epsilon_i$ iid $N(0,\sigma^2)$. However, an unknown number of the $y_i$ are actually drawn from $N(0,\sigma_0^2)$.
def make_example(n, beta, sigma, sigma0):
    # n points with x uniform on [0, 50] and y = beta * x plus N(0, sigma^2) noise
    x = np.random.random(size = n) * 50
    y = beta * x + np.random.normal(scale = sigma, size = n)
    # replace a random fifth of the points with outliers drawn from N(0, sigma0^2)
    indices = random.sample(range(n), n//5)
    y[indices] = np.random.normal(scale = sigma0, size = n//5)
    return x, y
x, y = make_example(50, 0.5, 1, 10)
_ = plt.scatter(x,y)
Our unknown parameters are $\beta, \sigma, \sigma_0$; let us also introduce binary indicator variables $o_1, \dots, o_n \in \{0,1\}$, where $o_i = 1$ indicates that reading $i$ is an "outlier". So the likelihood is
$$ p(y | \beta, \sigma, \sigma_0, o) = \prod_{i=1}^n f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i} $$where $f_i(y) = (2\pi\sigma^2)^{-1/2} \exp\big( -\frac{1}{2\sigma^2} (y_i - \beta x_i)^2 \big)$ and $g_i(y) = (2\pi\sigma_0^2)^{-1/2} \exp\big( -\frac{1}{2\sigma_0^2} y_i^2 \big)$.
For priors, let's choose the Jeffreys prior on $\sigma, \sigma_0$, a flat prior on $\beta$, and a flat prior on $o$, as we have no knowledge of which values are outliers.
def log_likelihood(theta, x, y):
    beta, sigma, sigma0 = theta[:3]
    o = theta[3:]
    # log-density of each point under the linear model and under the outlier model;
    # the common -log(2*pi)/2 constant is dropped, since it cancels wherever it matters
    linear_part = - np.log(sigma * sigma) / 2 - (y - beta * x)**2 / (2 * sigma * sigma)
    outlier_part = - np.log(sigma0 * sigma0) / 2 - y**2 / (2 * sigma0 * sigma0)
    return np.sum( o * outlier_part + (1-o) * linear_part )
def log_prior(theta):
    beta, sigma, sigma0 = theta[:3]
    # Jeffreys priors on sigma and sigma0; flat (improper) priors on beta and the o_i
    return - np.log(abs(sigma)) - np.log(abs(sigma0))
def log_posterior(theta, x, y):
    return log_likelihood(theta, x, y) + log_prior(theta)
From a simulation point of view this is a high-dimensional and partly discrete problem, so I don't think we can just use emcee naively. Instead, let's roll our own MCMC method, using a combination of a Gibbs sampler and a Metropolis sampler.
def jump_index(index, theta, x, y, old_log_prob):
    # Metropolis update of a single coordinate of theta
    theta_new = np.array(theta, dtype=np.float64)
    if index < 3:
        # continuous parameter: Gaussian random-walk proposal
        theta_new[index] += np.random.normal(0, 0.2)
    else:
        # outlier indicator: propose a flip with probability 1/2
        if np.random.random() >= 0.5:
            theta_new[index] = 1 - theta_new[index]
    log_prob = log_posterior(theta_new, x, y)
    alpha = log_prob - old_log_prob
    if alpha >= 0 or alpha >= np.log(np.random.random()):
        return theta_new, log_prob
    return theta, old_log_prob
def jump(theta, x, y, old_log_prob):
    # pick one coordinate uniformly at random and attempt a Metropolis move on it
    index = random.randrange(len(theta))
    return jump_index(index, theta, x, y, old_log_prob)
def walk(theta, x, y, length = 1000):
    old_log_prob = log_posterior(theta, x, y)
    trace = []
    for _ in range(length):
        theta, old_log_prob = jump(theta, x, y, old_log_prob)
        trace.append(theta)
    return np.array(trace)
theta = np.array([0,1,1] + [1]*len(x))
trace = walk(theta, x, y, 1000000)
trace
array([[ 0. , 1. , 1. , ..., 1. , 1. , 1. ], [ 0. , 1. , 1. , ..., 1. , 1. , 1. ], [ 0. , 1. , 1. , ..., 1. , 1. , 1. ], ..., [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ], [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ], [ 0.50289543, 1.21465707, 8.5969908 , ..., 0. , 0. , 0. ]])
# Marginalise out the outlier indicators, discard burn-in, and keep every 100th sample to reduce autocorrelation
param_trace = trace[10000::100, :3]
figure = triangle.corner(param_trace, labels = ["$\\beta$", "$\\sigma$", "$\\sigma_0$"], truths=[0.5, 1, 10])
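As a quick sanity check on the thinning choice (a minimal sketch, not part of the original; the autocorr helper and the lag grid are my own), we can look at the normalised autocorrelation of the thinned $\beta$ samples, which should decay towards zero within a few lags if keeping every 100th draw is enough.
def autocorr(samples, max_lag=20):
    # normalised autocorrelation of a 1-d chain at lags 0, ..., max_lag - 1
    centred = samples - samples.mean()
    var = np.dot(centred, centred) / len(centred)
    return np.array([np.dot(centred[:len(centred) - lag], centred[lag:]) / (len(centred) * var)
                     for lag in range(max_lag)])
fig, ax = plt.subplots()
ax.plot(autocorr(param_trace[:, 0]), "o-")
ax.set_xlabel("lag (thinned samples)")
ax.set_ylabel("autocorrelation of $\\beta$")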
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok")
xfit = np.linspace(0, 50, num=50)
beta = param_trace[:, 0]
yfit = (xfit[:,None]*beta).mean(1)
axes.plot(xfit, yfit)
s = (xfit[:,None]*beta).std(1)
axes.fill_between(xfit, yfit-s, yfit+s, color="lightgray")
outliers = [ (index, trace[:,index+3].mean()) for index in range(len(x)) ]
outliers.sort(key = lambda pair : pair[1])
outliers
[(27, 0.0094120000000000002), (49, 0.0098130000000000005), (44, 0.010803999999999999), (29, 0.011235999999999999), (22, 0.016619999999999999), (9, 0.018874999999999999), (40, 0.019254), (30, 0.022537000000000001), (1, 0.028646999999999999), (36, 0.033452000000000003), (45, 0.036044), (25, 0.036894999999999997), (23, 0.042146999999999997), (5, 0.042412999999999999), (14, 0.045911), (6, 0.047826), (7, 0.048520000000000001), (24, 0.051985000000000003), (47, 0.064698000000000006), (17, 0.072871000000000005), (12, 0.076995999999999995), (3, 0.077533000000000005), (19, 0.087968000000000005), (26, 0.089524000000000006), (35, 0.094821000000000003), (46, 0.095072000000000004), (8, 0.095722000000000002), (41, 0.096133999999999997), (20, 0.098364999999999994), (37, 0.099663000000000002), (0, 0.108279), (38, 0.141537), (34, 0.14313799999999999), (10, 0.16606799999999999), (48, 0.18185899999999999), (28, 0.192745), (16, 0.20547499999999999), (18, 0.23061400000000001), (31, 0.25907400000000003), (32, 0.39712700000000001), (15, 0.49575799999999998), (33, 0.65765799999999996), (13, 0.99680100000000005), (4, 0.99761), (39, 0.99973900000000004), (21, 0.99976100000000001), (11, 0.99986799999999998), (2, 1.0), (42, 1.0), (43, 1.0)]
outlier_indices = [i for (i,t) in outliers if t > 0.9]
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok", color="grey")
axes.plot(x[outlier_indices],y[outlier_indices], "ok", color="red")
The sampling is rather slow. Let's consider the task of updating one of the $o_i$. The posterior distribution is
$$ p \propto \sigma^{-1} \sigma_0^{-1} \prod_{i=1}^n f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i}. $$Fix $i$ and write $\theta$ for all the variables other than $o_i$ (including $y$). Then
$$ p(o_i | \theta ) = \frac{p(o_i, \theta)}{p(\theta)} = \frac{p(o_i, \theta)}{p(0,\theta) + p(1,\theta)}. $$This simplifies a lot to give:
$$ p(o_i|\theta) = \frac{f_i(y|\beta,\sigma)^{1-o_i} g_i(y|\sigma_0)^{o_i}}{f_i(y|\beta,\sigma) + g_i(y|\sigma_0)}. $$Notice that this conditional distribution only depends on $\beta, \sigma, \sigma_0$, so by updating all the $o_i$ at the same time we take an exact sample from $p(o \mid \beta, \sigma, \sigma_0)$. Thus this new algorithm is, essentially, a Gibbs sampler over the four blocks $\beta$, $\sigma$, $\sigma_0$ and $o$ (with Metropolis steps for the three continuous parameters). We can also exploit numpy magic to make the $(o_i)$ step pretty fast.
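It is convenient to rewrite this conditional in logistic form, which is the expression used in gibbs below (only the difference of the log-densities is exponentiated):
$$ p(o_i = 1 \mid \beta, \sigma, \sigma_0) = \frac{g_i(y|\sigma_0)}{f_i(y|\beta,\sigma) + g_i(y|\sigma_0)} = \frac{1}{1 + \exp\!\big(\log f_i(y|\beta,\sigma) - \log g_i(y|\sigma_0)\big)}. $$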
def gibbs(theta, x, y):
    # Metropolis updates for beta, sigma, sigma0
    old_log_prob = log_posterior(theta, x, y)
    theta, old_log_prob = jump_index(0, theta, x, y, old_log_prob)
    theta, old_log_prob = jump_index(1, theta, x, y, old_log_prob)
    theta, old_log_prob = jump_index(2, theta, x, y, old_log_prob)
    # Exact Gibbs update of the outlier indicators
    beta, sigma, sigma0 = theta[:3]
    o = theta[3:]
    linear_part = - np.log(sigma * sigma) / 2 - (y - beta * x)**2 / (2 * sigma * sigma)
    outlier_part = - np.log(sigma0 * sigma0) / 2 - y**2 / (2 * sigma0 * sigma0)
    # prob = P(o_i = 1); only the difference of the log-densities is exponentiated
    #prob = np.exp(outlier_part) / (np.exp(linear_part) + np.exp(outlier_part))
    prob = 1.0 / (1.0 + np.exp(linear_part - outlier_part))
    o = np.array( np.random.random(size = len(prob)) <= prob, dtype = np.float64 )
    return np.hstack([np.array([beta, sigma, sigma0]), o])
def walk(theta, length = 10000):
    trace = []
    for _ in range(length):
        theta = gibbs(theta, x, y)
        trace.append(theta)
    return np.array(trace)
theta = np.array([0,1,1] + [1]*len(x))
trace = walk(theta, 20000)
# discard burn-in
param_trace = trace[2000:, :3]
figure = triangle.corner(param_trace, labels = ["$\\beta$", "$\\sigma$", "$\\sigma_0$"], truths=[0.5, 1, 10])
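To put numbers next to the corner plot, here is a minimal sketch (the choice of reporting marginal posterior means and standard deviations is mine, not part of the original) comparing each parameter against the true values used in make_example:
for name, true_value, samples in zip(["beta", "sigma", "sigma0"], [0.5, 1, 10], param_trace.T):
    print("{}: posterior mean {:.3f}, sd {:.3f} (true {})".format(
        name, samples.mean(), samples.std(), true_value))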
outliers = [ (index, trace[:,index+3].mean()) for index in range(len(x)) ]
outliers.sort(key = lambda pair : pair[1])
outlier_indices = [i for (i,t) in outliers if t > 0.9]
fig, axes = plt.subplots(figsize=(8,6))
axes.plot(x,y, "ok", color="grey")
axes.plot(x[outlier_indices],y[outlier_indices], "ok", color="red")
xfit = np.linspace(0, 50, 50)
beta = trace[:,0]
axes.plot(xfit, xfit*beta.mean())