I got a bit carried away with this assignment...
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas
import scipy
import scipy.optimize
import scipy.stats
np.random.seed(13)
def load_exon_samples(size=600):
    """Load the two-exon dataset and return a random subsample of each column.

    Reads 'data/Twoexondata.txt' (two columns, one per exon) and draws
    `size` rows without replacement using the module-level numpy RNG.

    Parameters
    ----------
    size : int, optional
        Number of rows to sample from the file.

    Returns
    -------
    tuple of ndarray
        `(exon_1_sample, exon_2_sample)`, each of length `size`.
    """
    first_col, second_col = np.loadtxt('data/Twoexondata.txt').T
    chosen = np.random.choice(range(len(first_col)), size=size, replace=False)
    return first_col[chosen], second_col[chosen]
# Draw the subsamples and show a 50-bin histogram of each exon side by side.
exon_1, exon_2 = load_exon_samples()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for axis, samples, label in zip(axes, (exon_1, exon_2), ('Exon 1', 'Exon 2')):
    axis.hist(samples, bins=50)
    axis.set_title(label)
class BaseProposalGenerator(object):
    """Abstract base class for MCMC proposal generators.

    Subclasses implement `__call__` (propose a step) and `plot_fit`
    (visualize a fit), and should set `_fit_metric_func` to a callable
    mapping a parameter vector to a goodness-of-fit value (e.g. the
    negative log likelihood).
    """
    # Set by subclasses: callable(params) -> fit metric value.
    _fit_metric_func = None

    def __init__(self, data):
        """Store the data that proposals will be evaluated against.

        Parameters
        ----------
        data : array-like
            The data to fit.
        """
        self.data = data

    def __call__(self, params_current):
        """Propose an MCMC step.

        Given the current parameters, returns a tuple
        `(params_proposed, pi_ratio, q_ratio)`.
        """
        raise NotImplementedError

    def plot_fit(self, ax, params):
        """Plot the fit given by the params."""
        raise NotImplementedError

    def plot_fit_metric(self, ax, chain, **kwargs):
        """Scatter-plot the fit metric at every step of `chain` on `ax`."""
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('s', 5)
        kwargs.setdefault('edgecolors', 'none')
        kwargs.setdefault('c', np.linspace(0, 1, len(chain)))
        steps = np.arange(0, len(chain))
        values = np.array([self._fit_metric_func(p) for p in chain])
        cax = ax.scatter(steps, values, **kwargs)
        ax.set_xlim(0, len(chain))
        return cax
class MCMC(object):
    """Implementation of Markov Chain Monte Carlo (Metropolis-Hastings).

    Attributes
    ----------
    burned_in_index : int
        Index at which the chain is considered to be burned in. Affects
        the chain returned by the `get_chain` method. Defaults to 0 and
        can be manually set to something else.
    """
    def __init__(self, proposal_generator, params_start, param_names=None, initial_chain_length=10000):
        """Initialize an MCMC object.

        Parameters
        ----------
        proposal_generator : callable
            A function that, when called with the current parameters, proposes the
            next step by returning a tuple `(params_proposed, pi_ratio, q_ratio)`.
        params_start : array-like
            Initial parameters.
        param_names : array-like, optional
            Names for each of the parameters. If specified, this allows methods
            that take an index for a parameter to accept its name instead of its
            index.
        initial_chain_length: int, optional
            The initial length of the chain. The chain is automatically extended if
            `run` is called with a number of steps that would extend past the end
            of the chain.
        """
        self._proposal_generator = proposal_generator
        self._chain = np.zeros((initial_chain_length, len(params_start)), dtype=float)
        self._chain[0, :] = params_start
        # The starting point counts as the first completed step.
        self._steps_completed = 1
        self.burned_in_index = 0
        self._param_name_to_idx = None
        self.num_params = len(params_start)
        self.param_names = param_names
        if param_names is not None:
            assert len(param_names) == len(params_start)
            self._param_name_to_idx = dict(zip(param_names, range(len(param_names))))

    def run(self, num_steps=None):
        """Run the chain for multiple steps.

        Parameters
        ----------
        num_steps : int, optional
            The number of steps to run the chain. If not specified, the chain is
            run up to the size of the currently allocated chain matrix (initially
            set in the constructor).

        Returns
        -------
        MCMC
            Returns `self`.
        """
        remaining_chain_length = self._chain.shape[0] - self._steps_completed
        if num_steps is None:
            num_steps = remaining_chain_length
        elif num_steps > remaining_chain_length:
            # Grow the chain only when needed. (Previously the chain was
            # resized unconditionally, so num_steps < remaining produced a
            # negative growth and a broadcast error when copying old data.)
            shape = self._chain.shape
            add = num_steps - remaining_chain_length
            new_chain = np.zeros((shape[0] + add, shape[1]), dtype=self._chain.dtype)
            new_chain[:shape[0], :] = self._chain
            self._chain = new_chain
        while num_steps > 0:
            self._chain[self._steps_completed, :] = self._step(self._chain[self._steps_completed - 1, :])
            self._steps_completed += 1
            num_steps -= 1
        return self

    def get_chain(self, from_beginning=False):
        """Get a view of the chain.

        Returns a view of the chain starting at `self.burned_in_index` by default
        and ending at the last step that has been completed.

        Parameters
        ----------
        from_beginning : bool, optional
            If True, returns the chain from the beginning (ignoring `self.burned_in_index`).
        """
        # Bug fix: `from_beginning` used to be accepted but ignored.
        start = 0 if from_beginning else self.burned_in_index
        return self._chain[start:self._steps_completed, :]

    def hist(self, ax, param, **kwargs):
        """Histogram of a single parameter (post burn-in)."""
        kwargs.setdefault('bins', 100)
        param = self._params_to_idx(param)
        chain = self.get_chain()
        return ax.hist(chain[:, param], **kwargs)

    def scatter(self, ax, param_1, param_2, **kwargs):
        """Add a scatter plot to the axis provided.

        Gets the chain from `self.get_chain` and plots a scatter plot of two
        params, colored by position along the chain, with start/finish markers.

        Parameters
        ----------
        ax : Matplotlib axis
            The axis to plot to.
        param_1 : string, int
            The name or index of the first parameter.
        param_2 : string, int
            The name or index of the second parameter.
        """
        param_1, param_2 = self._params_to_idx(param_1, param_2)
        chain = self.get_chain()
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('s', 5)
        kwargs.setdefault('marker', 'o')
        kwargs.setdefault('edgecolors', 'none')
        # Color each point by its (normalized) position along the chain.
        kwargs.setdefault('c', np.linspace(0, 1, len(chain)))
        cax = ax.scatter(chain[:, param_1], chain[:, param_2], **kwargs)
        cmap = cax.get_cmap()
        ax.scatter(chain[0, param_1], chain[0, param_2], label='Start',
                   s=400, color=cmap(0.0), marker='*', edgecolors='black')
        ax.scatter(chain[-1, param_1], chain[-1, param_2],
                   label='Finish', s=400, color=cmap(1.0), marker='*',
                   edgecolors='black')
        return cax

    def scatter_matrix(self, params=None, **kwargs):
        """Scatter plot matrix of the parameter values."""
        if params is None:
            params = self.param_names
        if params is None:
            params = range(self.num_params)
        fig, axs = plt.subplots(len(params), len(params), figsize=(16, 16))
        fig.subplots_adjust(hspace=0.01, wspace=0.01)
        for i, p1 in enumerate(params):
            for j, p2 in enumerate(params):
                ax = axs[i][j]
                ax.xaxis.set_visible(False)
                ax.yaxis.set_visible(False)
                if i == j:
                    # Diagonal cells just carry the parameter label.
                    ax.annotate(params[i], (0.5, 0.5), xycoords='axes fraction',
                                ha='center', va='center')
                elif i != j:
                    cax = self.scatter(ax, p1, p2, **kwargs)
        cbar = fig.colorbar(cax, ax=axs[0][-1], ticks=(0, 1))
        cbar.set_ticklabels((0, len(self.get_chain())))
        return fig

    def smoothed_lines(self, ax, params, num_samples=500, scale=True, **kwargs):
        """Line plot of rolling-mean-smoothed parameter traces.

        Parameters are smoothed with a rolling mean whose window is chosen
        so that roughly `num_samples` evenly spaced points are plotted. If
        `scale` is True, each trace is rescaled to [0, 1] so parameters of
        different magnitudes are comparable.
        """
        params_orig = params
        params = self._params_to_idx(*params)
        chain = self.get_chain()
        # Guard against a zero-sized window for chains shorter than num_samples.
        window = max(1, len(chain) // num_samples)
        # pandas.rolling_mean was removed in pandas 0.20; the rolling() API
        # has the same semantics (NaNs for the first window-1 rows, which
        # the [window:] slice drops).
        chain = pandas.DataFrame(chain).rolling(window).mean().values[window:]
        if scale:
            scaled_chain = chain - chain.min(axis=0)
            scaled_chain = scaled_chain / scaled_chain.max(axis=0)
            chain = scaled_chain
        # np.int was removed in numpy 1.24; plain int is equivalent here.
        sample_idx = np.floor(np.linspace(0, len(chain), num_samples)).astype(int)[:-1]
        chain = chain[sample_idx]
        x = np.arange(len(chain))
        for i, param in enumerate(params):
            ax.plot(x, chain[:, param], label=str(params_orig[i]))

    def _step(self, params_current):
        # Takes a single Metropolis-Hastings step: accept the proposal with
        # probability min(1, pi_ratio * q_ratio), otherwise stay put.
        params_proposed, pi_ratio, q_ratio = self._proposal_generator(params_current)
        alpha = np.min([1.0, pi_ratio * q_ratio])
        if np.random.random() < alpha:
            return params_proposed
        return params_current

    def _params_to_idx(self, *params):
        # Converts params (specified as strings or indices) to indices.
        idxs = []
        for param in params:
            # `str` replaces the Python-2-only `basestring` check.
            if isinstance(param, str):
                # Assume it's a parameter's name; map it to its index.
                idxs.append(self._param_name_to_idx[param])
            else:
                idxs.append(param)
        return tuple(idxs)
# Various plotting routines.
def param_hists(mcmc):
    """Grid of histograms, one for each parameter.

    Parameters
    ----------
    mcmc : MCMC
        The chain to summarize (post burn-in). Each subplot title shows the
        sample mean +/- standard deviation of one parameter.

    Returns
    -------
    Matplotlib figure
    """
    if mcmc.param_names is not None:
        param_names = mcmc.param_names
    else:
        param_names = ['Param {0}'.format(i) for i in range(mcmc.num_params)]
    num_params = len(param_names)
    chain = mcmc.get_chain()
    nrows = int(np.ceil(num_params / 3.0))
    ncols = 3
    # squeeze=False keeps `ax` 2-D even when there is a single row, so the
    # ax[row][col] indexing below works for any number of parameters.
    fig, ax = plt.subplots(nrows, ncols, figsize=(16, 4 * nrows), squeeze=False)
    for i, param in enumerate(param_names):
        row = i // ncols
        col = i % ncols
        # `normed` was removed from matplotlib; `density` is its replacement.
        mcmc.hist(ax[row][col], param, density=True)
        mean, sd = np.mean(chain[:, i]), np.std(chain[:, i])
        title = '{0}: {1:.4f} +/- {2:.4f}'.format(param, mean, sd)
        ax[row][col].set_title(title)
    return fig
def scatter_with_fit_metric(mcmc, proposer, param_1, param_2, **kwargs):
    """Plot a scatter plot of two parameters next to the fit metric
    (e.g., negative log likelihood) over the chain."""
    chain = mcmc.get_chain()
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 8))
    # Use a rainbow colormap by default so chain position is easy to read.
    kwargs.setdefault('cmap', matplotlib.cm.rainbow)
    cax = mcmc.scatter(left_ax, param_1, param_2, **kwargs)
    left_ax.set_xlabel(str(param_1))
    left_ax.set_ylabel(str(param_2))
    cbar = fig.colorbar(cax, ticks=(0, 1))
    cbar.set_ticklabels((0, len(chain)))
    left_ax.legend()
    proposer.plot_fit_metric(right_ax, chain, **kwargs)
    right_ax.set_xlabel('Iteration')
    right_ax.set_ylabel('Fit Metric')
    return fig
def plot_some_fits(mcmc, proposer, **kwargs):
    """Plot a 3x3 grid of fits at evenly spaced steps along the chain.

    Parameters
    ----------
    mcmc : MCMC
        Provides the (burned-in) chain.
    proposer : BaseProposalGenerator
        Used to render each fit via its `plot_fit` method.

    Returns
    -------
    Matplotlib figure
    """
    chain = mcmc.get_chain()
    # max(1, ...) avoids a zero step size (np.arange ValueError) for chains
    # shorter than 9 steps.
    step = max(1, len(chain) // 9)
    idxs = np.arange(0, len(chain), step)[:9]
    fig, ax = plt.subplots(3, 3, figsize=(14, 12))
    for i, idx in enumerate(idxs):
        row = i // 3
        col = i % 3
        proposer.plot_fit(ax[row][col], chain[idx], **kwargs)
        ax[row][col].set_title('Step {0}'.format(idx))
    return fig
def two_t_mixture_likelihoods(params, data):
    """Compute mixture and per-component likelihoods for each data point.

    Parameters
    ----------
    params : sequence
        (p1_center, p1_width, height_ratio, p2_center, p2_width, nu).
    data : ndarray
        Points at which to evaluate the mixture.

    Returns
    -------
    tuple of ndarray
        `(p, p1_norm, p2_norm)` where `p = p1_norm + p2_norm`.
    """
    center_1, width_1, height_ratio, center_2, width_2, nu = params
    raw_1 = scipy.stats.t.pdf((data - center_1) / width_1, nu)
    raw_2 = height_ratio * scipy.stats.t.pdf((data - center_2) / width_2, nu)
    # Normalize so the two components together integrate to ~1.
    normalization = width_1 + (height_ratio * width_2)
    component_1 = raw_1 / normalization
    component_2 = raw_2 / normalization
    return component_1 + component_2, component_1, component_2


def two_t_mixture_neg_ll(params, data):
    """Compute the negative log likelihood of the data given the params."""
    likelihoods = two_t_mixture_likelihoods(params, data)[0]
    neg_ll = np.sum(-np.log(likelihoods))
    # Ugly hack. Otherwise I was getting nan sometimes and not recovering.
    return 1e100 if np.isnan(neg_ll) else neg_ll
class MixtureTwoStudentTProposalGenerator(BaseProposalGenerator):
    """Proposal generator for a mixture of two Student t distributions.

    Proposed steps are drawn from a multivariate Normal distribution centered
    at the current position and with the provided covariance matrix.
    """
    def __init__(self, data, proposal_covar):
        """Construct a proposal generator.

        Parameters
        ----------
        data : array-like
            The data to fit.
        proposal_covar:
            Covariance matrix used to generate proposals.
        """
        super(MixtureTwoStudentTProposalGenerator, self).__init__(data)
        self._fit_metric_func = self.mixture_neg_ll
        self.proposal_covar = proposal_covar
        # Parameters that must stay non-negative: the two widths, the
        # height ratio, and nu.
        self._non_neg_indices = np.array([1, 2, 4, 5])
        # Memoization cache for mixture_neg_ll, keyed by parameter values.
        self._ll_cache = {}

    def __call__(self, params_current):
        """Called to propose an MCMC step.

        See the `MCMC` `proposal_generator` parameter for details.
        """
        while True:
            params_proposed = np.random.multivariate_normal(params_current, self.proposal_covar)
            # Ugly hack to prevent parameters from going negative.
            if np.all(params_proposed[self._non_neg_indices] >= 0):
                break
        neg_log_pi_current = self.mixture_neg_ll(params_current)
        neg_log_pi_proposed = self.mixture_neg_ll(params_proposed)
        pi_ratio = np.exp(neg_log_pi_current - neg_log_pi_proposed)
        # The Gaussian proposal is symmetric, so q(x|y)/q(y|x) == 1.
        q_ratio = 1.0
        return params_proposed, pi_ratio, q_ratio

    def plot_fit(self, ax, params):
        """Visualize how the model fits the data with the given parameters."""
        # Some heuristics to compute the x range.
        min_val, max_val = self.data.min(), self.data.max()
        half_range = (max_val - min_val) / 2.0
        xs = np.linspace(min_val - half_range, max_val + half_range, num=2000)
        y_p, y_p1, y_p2 = two_t_mixture_likelihoods(params, xs)
        # Trim the plotted range to where the density is non-negligible or
        # where there is data.
        non_zeroish = np.where((y_p >= (np.max(y_p) * 0.005)) | ((xs >= min_val) & (xs <= max_val)))
        start, end = non_zeroish[0][0], non_zeroish[0][-1]
        ax.plot(xs[start:end], y_p1[start:end], label='p1')
        ax.plot(xs[start:end], y_p2[start:end], label='p2')
        ax.plot(xs[start:end], y_p[start:end], label='p', linewidth=3)
        # `normed` was removed from matplotlib; `density` is its replacement.
        ax.hist(self.data, density=True, color='lightgray', alpha=0.5, bins=50, label='data')
        ax.legend()
        ax.set_title('Two-student-t model')
        return ax

    def mixture_neg_ll(self, params):
        """Compute the negative log likelihood of the data given the params.

        Results are memoized per parameter vector.
        """
        # Bug fix: the cache used to be keyed on hash(...) of a string of
        # the params, so any hash collision silently returned the wrong
        # cached value. Keying on the value tuple lets the dict resolve
        # collisions correctly.
        key = tuple(params)
        cached = self._ll_cache.get(key, None)
        if cached is not None:
            return cached
        neg_ll = two_t_mixture_neg_ll(params, self.data)
        self._ll_cache[key] = neg_ll
        return neg_ll
Find maximum likelihood estimates of the parameters to use as the start of the chain.
# Fit the mixture to exon 1 by minimizing the negative log likelihood;
# the optimum is used as the chain's starting point.
data = exon_1
params_guess = [3.3, .5, .1, 4.5, .5, 10]
result = scipy.optimize.minimize(two_t_mixture_neg_ll, params_guess, args=(data,))
# Keep the inverse Hessian too: it provides a sensible proposal covariance.
chain_start, neg_ll, hess_inv = result['x'], result['fun'], result['hess_inv']
print chain_start, neg_ll
[ 3.13396345 0.63218323 0.17743773 4.60357452 0.28501594 11.60383024] 690.903311837
Build the proposal generator and visualize the start of the chain.
# Scale down the inverse Hessian to get a proposal covariance, then show
# the maximum-likelihood fit that the chain starts from.
proposer = MixtureTwoStudentTProposalGenerator(data, hess_inv * 0.1)
fig, ax = plt.subplots(figsize=(8, 6))
proposer.plot_fit(ax, chain_start)
<matplotlib.axes.AxesSubplot at 0xb52a28c>
Run MCMC for 100,000 iterations.
param_names = ('p1_center', 'p1_width', 'height_ratio', 'p2_center', 'p2_width', 'nu')
# Fixed seed so the run is reproducible.
np.random.seed(13)
mcmc = MCMC(proposer, chain_start, param_names=param_names, initial_chain_length=100000)
# IPython magic: time the full 100,000-step run.
%time mcmc.run()
CPU times: user 2min 48s, sys: 156 ms, total: 2min 48s Wall time: 2min 49s
<__main__.MCMC at 0xb51d6ac>
Visualize the fit at evenly-spaced intervals along the chain.
_ = plot_some_fits(mcmc, proposer)
Look at histograms of each parameter along with confidence regions.
_ = param_hists(mcmc)
Look at a single scatter plot between two parameters along with the negative log likelihood at each step on the chain.
_ = scatter_with_fit_metric(mcmc, proposer, 'nu', 'p2_center')
View a scatter plot matrix between parameter values at each point along the chain.
_ = mcmc.scatter_matrix()
Finally, plot the posterior distribution of the ratio of areas of the two Student components.
chain = mcmc.get_chain()
# Component areas are proportional to p1_width and height_ratio * p2_width
# (see the normalization in two_t_mixture_likelihoods), so their ratio is
# p1_width / (height_ratio * p2_width).
areas = chain[:, 1] / (chain[:, 2] * chain[:, 4])
plt.hist(areas, bins=100, range=(0, 40))
plt.title('Ratio of areas')
<matplotlib.text.Text at 0xd1e7b6c>
Based on the histogram of the second exon lengths, I don't think a single component will fit the data well, so I will fit two components. As with the first exon data, I start by finding maximum likelihood estimates of the parameters.
# Repeat the maximum-likelihood fit for exon 2 and visualize the result.
data2 = exon_2
params_guess2 = [3.3, .5, .1, 4.5, .5, 10]
result2 = scipy.optimize.minimize(two_t_mixture_neg_ll, params_guess2, args=(data2,))
chain_start2, neg_ll2, hess_inv2 = result2['x'], result2['fun'], result2['hess_inv']
print chain_start2, neg_ll2
# Proposal covariance from the scaled inverse Hessian, as before.
proposer2 = MixtureTwoStudentTProposalGenerator(data2, hess_inv2 * 0.1)
fig, ax = plt.subplots(figsize=(8, 6))
proposer2.plot_fit(ax, chain_start2)
[ 3.12576857 0.64931975 0.15201056 4.40033815 0.20729874 9.54519873] 684.048325909
<matplotlib.axes.AxesSubplot at 0xd07c2cc>
Let's run for 500,000 iterations this time.
param_names2 = ('p1_center', 'p1_width', 'height_ratio', 'p2_center', 'p2_width', 'nu')
# Fixed seed so the run is reproducible.
np.random.seed(13)
mcmc2 = MCMC(proposer2, chain_start2, param_names=param_names2, initial_chain_length=500000)
# IPython magic: time the full 500,000-step run.
%time mcmc2.run()
CPU times: user 14min 14s, sys: 848 ms, total: 14min 15s Wall time: 14min 14s
<__main__.MCMC at 0xd0837ac>
Now, visualizing the fit at evenly-spaced intervals along the chain, we see something somewhat surprising. The small hump moves from the right to the left somewhere around the 100,000th iteration and stays there for the remainder of the chain.
_ = plot_some_fits(mcmc2, proposer2)
Looking at a scatter plot of the hump centers along with a plot of the negative log likelihood, we see a large drop in negative log likelihood around iteration 75,000 as the small hump moves to the left. Note that, as this happens, we get a better fit than we initially found as the maximum likelihood estimate. This suggests that the minimization was too sensitive to my (bad) parameter guess that had the small hump on the right.
_ = scatter_with_fit_metric(mcmc2, proposer2, 'p1_center', 'p2_center')
To see the difference between the maximum likelihood fit and the best fit found with MCMC, I find and visualize the step in the chain that minimizes the negative log likelihood.
chain2 = mcmc2.get_chain()
# Negative log likelihood at every step (memoized inside the proposer).
nlls2 = np.array([proposer2.mixture_neg_ll(p) for p in chain2])
min_idx2 = np.argmin(nlls2)
print 'Minimum negative log likelihood at index', min_idx2, '--', nlls2[min_idx2]
min_params2 = chain2[min_idx2]
print 'Params', min_params2
# Visualize the best fit found anywhere along the chain.
fig, ax = plt.subplots()
proposer2.plot_fit(ax, min_params2)
Minimum negative log likelihood at index 150175 -- 676.79887757 Params [ 2.05905712 0.12025956 2.6920738 3.25694949 0.63954603 9.05945015]
<matplotlib.axes.AxesSubplot at 0x113070ec>
It seems reasonable to suggest that the chain has not mixed well until the small hump moves to the left. To see more evidence of this, I can plot the position of the small hump as a function of the number of iterations.
chain2 = mcmc2.get_chain()
# height_ratio (index 2) < 1 means component 1 is the taller hump, so the
# smaller hump's center is p2_center (index 3); otherwise it's p1_center
# (index 0).
hump1_bigger = (chain2[:, 2] < 1)
hump2_bigger = np.invert(hump1_bigger)
# np.float was removed in numpy 1.24; the builtin float is equivalent.
small_hump_center = np.zeros(len(chain2), dtype=float)
small_hump_center[hump1_bigger] = chain2[hump1_bigger, 3]
small_hump_center[hump2_bigger] = chain2[hump2_bigger, 0]
plt.plot(small_hump_center)
plt.xlabel('Iteration')
plt.ylabel('Position of smaller hump')
<matplotlib.text.Text at 0xc3fc4ec>
Based on the analysis above, I now exclude the first 100,000 iterations and continue visualizing the results.
# Discard the first 100,000 steps as burn-in (the chain only mixes once the
# small hump has relocated), then redo the summary plots.
mcmc2.burned_in_index = 100000
_ = param_hists(mcmc2)
_ = mcmc2.scatter_matrix()
The nu vs. height_ratio scatter plot looks interesting. Here it is in more detail:
_ = scatter_with_fit_metric(mcmc2, proposer2, 'nu', 'height_ratio')