I got a bit carried away with this assignment...
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas
import scipy
import scipy.optimize
import scipy.stats
np.random.seed(13)
def load_exon_samples(size=600):
    """Load the two-exon dataset and return a random subsample of each column.

    Reads 'data/Twoexondata.txt' (two columns, one per exon) and draws
    `size` rows without replacement using the module-level numpy RNG.

    Parameters
    ----------
    size : int, optional
        Number of rows to sample from the file.

    Returns
    -------
    tuple of ndarray
        `(exon_1_sample, exon_2_sample)`, each of length `size`.
    """
    first_col, second_col = np.loadtxt('data/Twoexondata.txt').T
    chosen = np.random.choice(range(len(first_col)), size=size, replace=False)
    return first_col[chosen], second_col[chosen]
# Draw the subsamples and show a 50-bin histogram of each exon side by side.
exon_1, exon_2 = load_exon_samples()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for axis, samples, label in zip(axes, (exon_1, exon_2), ('Exon 1', 'Exon 2')):
    axis.hist(samples, bins=50)
    axis.set_title(label)
class BaseProposalGenerator(object):
    """Abstract base class for MCMC proposal generators.

    Subclasses implement `__call__` (propose a step) and `plot_fit`
    (visualize a fit), and should set `_fit_metric_func` to a callable
    mapping a parameter vector to a goodness-of-fit value (e.g. the
    negative log likelihood).
    """
    # Set by subclasses: callable(params) -> fit metric value.
    _fit_metric_func = None

    def __init__(self, data):
        """Store the data that proposals will be evaluated against.

        Parameters
        ----------
        data : array-like
            The data to fit.
        """
        self.data = data

    def __call__(self, params_current):
        """Propose an MCMC step.

        Given the current parameters, returns a tuple
        `(params_proposed, pi_ratio, q_ratio)`.
        """
        raise NotImplementedError

    def plot_fit(self, ax, params):
        """Plot the fit given by the params."""
        raise NotImplementedError

    def plot_fit_metric(self, ax, chain, **kwargs):
        """Scatter-plot the fit metric at every step of `chain` on `ax`."""
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('s', 5)
        kwargs.setdefault('edgecolors', 'none')
        kwargs.setdefault('c', np.linspace(0, 1, len(chain)))
        steps = np.arange(0, len(chain))
        values = np.array([self._fit_metric_func(p) for p in chain])
        cax = ax.scatter(steps, values, **kwargs)
        ax.set_xlim(0, len(chain))
        return cax
class MCMC(object):
    """Implementation of Markov Chain Monte Carlo (Metropolis-Hastings).

    Attributes
    ----------
    burned_in_index : int
        Index at which the chain is considered to be burned in. Affects
        the chain returned by the `get_chain` method. Defaults to 0 and
        can be manually set to something else.
    """
    def __init__(self, proposal_generator, params_start, param_names=None, initial_chain_length=10000):
        """Initialize an MCMC object.

        Parameters
        ----------
        proposal_generator : callable
            A function that, when called with the current parameters, proposes the
            next step by returning a tuple `(params_proposed, pi_ratio, q_ratio)`.
        params_start : array-like
            Initial parameters.
        param_names : array-like, optional
            Names for each of the parameters. If specified, this allows methods
            that take an index for a parameter to accept its name instead of its
            index.
        initial_chain_length: int, optional
            The initial length of the chain. The chain is automatically extended if
            `run` is called with a number of steps that would extend past the end
            of the chain.
        """
        self._proposal_generator = proposal_generator
        self._chain = np.zeros((initial_chain_length, len(params_start)), dtype=float)
        self._chain[0, :] = params_start
        # The starting point counts as the first completed step.
        self._steps_completed = 1
        self.burned_in_index = 0
        self._param_name_to_idx = None
        self.num_params = len(params_start)
        self.param_names = param_names
        if param_names is not None:
            assert len(param_names) == len(params_start)
            self._param_name_to_idx = dict(zip(param_names, range(len(param_names))))

    def run(self, num_steps=None):
        """Run the chain for multiple steps.

        Parameters
        ----------
        num_steps : int, optional
            The number of steps to run the chain. If not specified, the chain is
            run up to the size of the currently allocated chain matrix (initially
            set in the constructor).

        Returns
        -------
        MCMC
            Returns `self`.
        """
        remaining_chain_length = self._chain.shape[0] - self._steps_completed
        if num_steps is None:
            num_steps = remaining_chain_length
        elif num_steps > remaining_chain_length:
            # Grow the chain only when needed. (Previously the chain was
            # resized unconditionally, so num_steps < remaining produced a
            # negative growth and a broadcast error when copying old data.)
            shape = self._chain.shape
            add = num_steps - remaining_chain_length
            new_chain = np.zeros((shape[0] + add, shape[1]), dtype=self._chain.dtype)
            new_chain[:shape[0], :] = self._chain
            self._chain = new_chain
        while num_steps > 0:
            self._chain[self._steps_completed, :] = self._step(self._chain[self._steps_completed - 1, :])
            self._steps_completed += 1
            num_steps -= 1
        return self

    def get_chain(self, from_beginning=False):
        """Get a view of the chain.

        Returns a view of the chain starting at `self.burned_in_index` by default
        and ending at the last step that has been completed.

        Parameters
        ----------
        from_beginning : bool, optional
            If True, returns the chain from the beginning (ignoring `self.burned_in_index`).
        """
        # Bug fix: `from_beginning` used to be accepted but ignored.
        start = 0 if from_beginning else self.burned_in_index
        return self._chain[start:self._steps_completed, :]

    def hist(self, ax, param, **kwargs):
        """Histogram of a single parameter (post burn-in)."""
        kwargs.setdefault('bins', 100)
        param = self._params_to_idx(param)
        chain = self.get_chain()
        return ax.hist(chain[:, param], **kwargs)

    def scatter(self, ax, param_1, param_2, **kwargs):
        """Add a scatter plot to the axis provided.

        Gets the chain from `self.get_chain` and plots a scatter plot of two
        params, colored by position along the chain, with start/finish markers.

        Parameters
        ----------
        ax : Matplotlib axis
            The axis to plot to.
        param_1 : string, int
            The name or index of the first parameter.
        param_2 : string, int
            The name or index of the second parameter.
        """
        param_1, param_2 = self._params_to_idx(param_1, param_2)
        chain = self.get_chain()
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('s', 5)
        kwargs.setdefault('marker', 'o')
        kwargs.setdefault('edgecolors', 'none')
        # Color each point by its (normalized) position along the chain.
        kwargs.setdefault('c', np.linspace(0, 1, len(chain)))
        cax = ax.scatter(chain[:, param_1], chain[:, param_2], **kwargs)
        cmap = cax.get_cmap()
        ax.scatter(chain[0, param_1], chain[0, param_2], label='Start',
                   s=400, color=cmap(0.0), marker='*', edgecolors='black')
        ax.scatter(chain[-1, param_1], chain[-1, param_2],
                   label='Finish', s=400, color=cmap(1.0), marker='*',
                   edgecolors='black')
        return cax

    def scatter_matrix(self, params=None, **kwargs):
        """Scatter plot matrix of the parameter values."""
        if params is None:
            params = self.param_names
        if params is None:
            params = range(self.num_params)
        fig, axs = plt.subplots(len(params), len(params), figsize=(16, 16))
        fig.subplots_adjust(hspace=0.01, wspace=0.01)
        for i, p1 in enumerate(params):
            for j, p2 in enumerate(params):
                ax = axs[i][j]
                ax.xaxis.set_visible(False)
                ax.yaxis.set_visible(False)
                if i == j:
                    # Diagonal cells just carry the parameter label.
                    ax.annotate(params[i], (0.5, 0.5), xycoords='axes fraction',
                                ha='center', va='center')
                elif i != j:
                    cax = self.scatter(ax, p1, p2, **kwargs)
        cbar = fig.colorbar(cax, ax=axs[0][-1], ticks=(0, 1))
        cbar.set_ticklabels((0, len(self.get_chain())))
        return fig

    def smoothed_lines(self, ax, params, num_samples=500, scale=True, **kwargs):
        """Line plot of rolling-mean-smoothed parameter traces.

        Parameters are smoothed with a rolling mean whose window is chosen
        so that roughly `num_samples` evenly spaced points are plotted. If
        `scale` is True, each trace is rescaled to [0, 1] so parameters of
        different magnitudes are comparable.
        """
        params_orig = params
        params = self._params_to_idx(*params)
        chain = self.get_chain()
        # Guard against a zero-sized window for chains shorter than num_samples.
        window = max(1, len(chain) // num_samples)
        # pandas.rolling_mean was removed in pandas 0.20; the rolling() API
        # has the same semantics (NaNs for the first window-1 rows, which
        # the [window:] slice drops).
        chain = pandas.DataFrame(chain).rolling(window).mean().values[window:]
        if scale:
            scaled_chain = chain - chain.min(axis=0)
            scaled_chain = scaled_chain / scaled_chain.max(axis=0)
            chain = scaled_chain
        # np.int was removed in numpy 1.24; plain int is equivalent here.
        sample_idx = np.floor(np.linspace(0, len(chain), num_samples)).astype(int)[:-1]
        chain = chain[sample_idx]
        x = np.arange(len(chain))
        for i, param in enumerate(params):
            ax.plot(x, chain[:, param], label=str(params_orig[i]))

    def _step(self, params_current):
        # Takes a single Metropolis-Hastings step: accept the proposal with
        # probability min(1, pi_ratio * q_ratio), otherwise stay put.
        params_proposed, pi_ratio, q_ratio = self._proposal_generator(params_current)
        alpha = np.min([1.0, pi_ratio * q_ratio])
        if np.random.random() < alpha:
            return params_proposed
        return params_current

    def _params_to_idx(self, *params):
        # Converts params (specified as strings or indices) to indices.
        idxs = []
        for param in params:
            # `str` replaces the Python-2-only `basestring` check.
            if isinstance(param, str):
                # Assume it's a parameter's name; map it to its index.
                idxs.append(self._param_name_to_idx[param])
            else:
                idxs.append(param)
        return tuple(idxs)
# Various plotting routines.
def param_hists(mcmc):
    """Grid of histograms, one for each parameter.

    Parameters
    ----------
    mcmc : MCMC
        The chain to summarize (post burn-in). Each subplot title shows the
        sample mean +/- standard deviation of one parameter.

    Returns
    -------
    Matplotlib figure
    """
    if mcmc.param_names is not None:
        param_names = mcmc.param_names
    else:
        param_names = ['Param {0}'.format(i) for i in range(mcmc.num_params)]
    num_params = len(param_names)
    chain = mcmc.get_chain()
    nrows = int(np.ceil(num_params / 3.0))
    ncols = 3
    # squeeze=False keeps `ax` 2-D even when there is a single row, so the
    # ax[row][col] indexing below works for any number of parameters.
    fig, ax = plt.subplots(nrows, ncols, figsize=(16, 4 * nrows), squeeze=False)
    for i, param in enumerate(param_names):
        row = i // ncols
        col = i % ncols
        # `normed` was removed from matplotlib; `density` is its replacement.
        mcmc.hist(ax[row][col], param, density=True)
        mean, sd = np.mean(chain[:, i]), np.std(chain[:, i])
        title = '{0}: {1:.4f} +/- {2:.4f}'.format(param, mean, sd)
        ax[row][col].set_title(title)
    return fig
def scatter_with_fit_metric(mcmc, proposer, param_1, param_2, **kwargs):
    """Plot a scatter plot of two parameters next to the fit metric
    (e.g., negative log likelihood) over the chain."""
    chain = mcmc.get_chain()
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 8))
    # Use a rainbow colormap by default so chain position is easy to read.
    kwargs.setdefault('cmap', matplotlib.cm.rainbow)
    cax = mcmc.scatter(left_ax, param_1, param_2, **kwargs)
    left_ax.set_xlabel(str(param_1))
    left_ax.set_ylabel(str(param_2))
    cbar = fig.colorbar(cax, ticks=(0, 1))
    cbar.set_ticklabels((0, len(chain)))
    left_ax.legend()
    proposer.plot_fit_metric(right_ax, chain, **kwargs)
    right_ax.set_xlabel('Iteration')
    right_ax.set_ylabel('Fit Metric')
    return fig
def plot_some_fits(mcmc, proposer, **kwargs):
    """Plot a 3x3 grid of fits at evenly spaced steps along the chain.

    Parameters
    ----------
    mcmc : MCMC
        Provides the (burned-in) chain.
    proposer : BaseProposalGenerator
        Used to render each fit via its `plot_fit` method.

    Returns
    -------
    Matplotlib figure
    """
    chain = mcmc.get_chain()
    # max(1, ...) avoids a zero step size (np.arange ValueError) for chains
    # shorter than 9 steps.
    step = max(1, len(chain) // 9)
    idxs = np.arange(0, len(chain), step)[:9]
    fig, ax = plt.subplots(3, 3, figsize=(14, 12))
    for i, idx in enumerate(idxs):
        row = i // 3
        col = i % 3
        proposer.plot_fit(ax[row][col], chain[idx], **kwargs)
        ax[row][col].set_title('Step {0}'.format(idx))
    return fig
def two_t_mixture_likelihoods(params, data):
    """Compute mixture and per-component likelihoods for each data point.

    Parameters
    ----------
    params : sequence
        (p1_center, p1_width, height_ratio, p2_center, p2_width, nu).
    data : ndarray
        Points at which to evaluate the mixture.

    Returns
    -------
    tuple of ndarray
        `(p, p1_norm, p2_norm)` where `p = p1_norm + p2_norm`.
    """
    center_1, width_1, height_ratio, center_2, width_2, nu = params
    raw_1 = scipy.stats.t.pdf((data - center_1) / width_1, nu)
    raw_2 = height_ratio * scipy.stats.t.pdf((data - center_2) / width_2, nu)
    # Normalize so the two components together integrate to ~1.
    normalization = width_1 + (height_ratio * width_2)
    component_1 = raw_1 / normalization
    component_2 = raw_2 / normalization
    return component_1 + component_2, component_1, component_2


def two_t_mixture_neg_ll(params, data):
    """Compute the negative log likelihood of the data given the params."""
    likelihoods = two_t_mixture_likelihoods(params, data)[0]
    neg_ll = np.sum(-np.log(likelihoods))
    # Ugly hack. Otherwise I was getting nan sometimes and not recovering.
    return 1e100 if np.isnan(neg_ll) else neg_ll
class MixtureTwoStudentTProposalGenerator(BaseProposalGenerator):
    """Proposal generator for a mixture of two Student t distributions.

    Proposed steps are drawn from a multivariate Normal distribution centered
    at the current position and with the provided covariance matrix.
    """
    def __init__(self, data, proposal_covar):
        """Construct a proposal generator.

        Parameters
        ----------
        data : array-like
            The data to fit.
        proposal_covar:
            Covariance matrix used to generate proposals.
        """
        super(MixtureTwoStudentTProposalGenerator, self).__init__(data)
        self._fit_metric_func = self.mixture_neg_ll
        self.proposal_covar = proposal_covar
        # Parameters that must stay non-negative: the two widths, the
        # height ratio, and nu.
        self._non_neg_indices = np.array([1, 2, 4, 5])
        # Memoization cache for mixture_neg_ll, keyed by parameter values.
        self._ll_cache = {}

    def __call__(self, params_current):
        """Called to propose an MCMC step.

        See the `MCMC` `proposal_generator` parameter for details.
        """
        while True:
            params_proposed = np.random.multivariate_normal(params_current, self.proposal_covar)
            # Ugly hack to prevent parameters from going negative.
            if np.all(params_proposed[self._non_neg_indices] >= 0):
                break
        neg_log_pi_current = self.mixture_neg_ll(params_current)
        neg_log_pi_proposed = self.mixture_neg_ll(params_proposed)
        pi_ratio = np.exp(neg_log_pi_current - neg_log_pi_proposed)
        # The Gaussian proposal is symmetric, so q(x|y)/q(y|x) == 1.
        q_ratio = 1.0
        return params_proposed, pi_ratio, q_ratio

    def plot_fit(self, ax, params):
        """Visualize how the model fits the data with the given parameters."""
        # Some heuristics to compute the x range.
        min_val, max_val = self.data.min(), self.data.max()
        half_range = (max_val - min_val) / 2.0
        xs = np.linspace(min_val - half_range, max_val + half_range, num=2000)
        y_p, y_p1, y_p2 = two_t_mixture_likelihoods(params, xs)
        # Trim the plotted range to where the density is non-negligible or
        # where there is data.
        non_zeroish = np.where((y_p >= (np.max(y_p) * 0.005)) | ((xs >= min_val) & (xs <= max_val)))
        start, end = non_zeroish[0][0], non_zeroish[0][-1]
        ax.plot(xs[start:end], y_p1[start:end], label='p1')
        ax.plot(xs[start:end], y_p2[start:end], label='p2')
        ax.plot(xs[start:end], y_p[start:end], label='p', linewidth=3)
        # `normed` was removed from matplotlib; `density` is its replacement.
        ax.hist(self.data, density=True, color='lightgray', alpha=0.5, bins=50, label='data')
        ax.legend()
        ax.set_title('Two-student-t model')
        return ax

    def mixture_neg_ll(self, params):
        """Compute the negative log likelihood of the data given the params.

        Results are memoized per parameter vector.
        """
        # Bug fix: the cache used to be keyed on hash(...) of a string of
        # the params, so any hash collision silently returned the wrong
        # cached value. Keying on the value tuple lets the dict resolve
        # collisions correctly.
        key = tuple(params)
        cached = self._ll_cache.get(key, None)
        if cached is not None:
            return cached
        neg_ll = two_t_mixture_neg_ll(params, self.data)
        self._ll_cache[key] = neg_ll
        return neg_ll
Find maximum likelihood estimates of the parameters to use as the start of the chain.
# Fit the mixture to exon 1 by minimizing the negative log likelihood;
# the optimum is used as the chain's starting point.
data = exon_1
params_guess = [3.3, .5, .1, 4.5, .5, 10]
result = scipy.optimize.minimize(two_t_mixture_neg_ll, params_guess, args=(data,))
# Keep the inverse Hessian too: it provides a sensible proposal covariance.
chain_start, neg_ll, hess_inv = result['x'], result['fun'], result['hess_inv']
print chain_start, neg_ll
[ 3.13396345 0.63218323 0.17743773 4.60357452 0.28501594 11.60383024] 690.903311837
Build the proposal generator and visualize the start of the chain.
# Scale down the inverse Hessian to get a proposal covariance, then show
# the maximum-likelihood fit that the chain starts from.
proposer = MixtureTwoStudentTProposalGenerator(data, hess_inv * 0.1)
fig, ax = plt.subplots(figsize=(8, 6))
proposer.plot_fit(ax, chain_start)
<matplotlib.axes.AxesSubplot at 0xb52a28c>
Run MCMC for 100,000 iterations.
param_names = ('p1_center', 'p1_width', 'height_ratio', 'p2_center', 'p2_width', 'nu')
# Fixed seed so the run is reproducible.
np.random.seed(13)
mcmc = MCMC(proposer, chain_start, param_names=param_names, initial_chain_length=100000)
# IPython magic: time the full 100,000-step run.
%time mcmc.run()
CPU times: user 2min 48s, sys: 156 ms, total: 2min 48s Wall time: 2min 49s
<__main__.MCMC at 0xb51d6ac>
Visualize the fit at evenly-spaced intervals along the chain.
_ = plot_some_fits(mcmc, proposer)
Look at histograms of each parameter along with confidence regions.
_ = param_hists(mcmc)
Look at a single scatter plot between two parameters along with the negative log likelihood at each step on the chain.
_ = scatter_with_fit_metric(mcmc, proposer, 'nu', 'p2_center')
View a scatter plot matrix between parameter values at each point along the chain.
_ = mcmc.scatter_matrix()
Finally, plot the posterior distribution of the ratio of areas of the two Student components.
chain = mcmc.get_chain()
# Component areas are proportional to p1_width and height_ratio * p2_width
# (see the normalization in two_t_mixture_likelihoods), so their ratio is
# p1_width / (height_ratio * p2_width).
areas = chain[:, 1] / (chain[:, 2] * chain[:, 4])
plt.hist(areas, bins=100, range=(0, 40))
plt.title('Ratio of areas')
<matplotlib.text.Text at 0xd1e7b6c>
Based on the histogram of the second exon lengths, I don't think a single component will fit the data well, so I will fit two components. As with the first exon data, I start by finding maximum likelihood estimates of the parameters.
# Repeat the maximum-likelihood fit for exon 2 and visualize the result.
data2 = exon_2
params_guess2 = [3.3, .5, .1, 4.5, .5, 10]
result2 = scipy.optimize.minimize(two_t_mixture_neg_ll, params_guess2, args=(data2,))
chain_start2, neg_ll2, hess_inv2 = result2['x'], result2['fun'], result2['hess_inv']
print chain_start2, neg_ll2
# Proposal covariance from the scaled inverse Hessian, as before.
proposer2 = MixtureTwoStudentTProposalGenerator(data2, hess_inv2 * 0.1)
fig, ax = plt.subplots(figsize=(8, 6))
proposer2.plot_fit(ax, chain_start2)
[ 3.12576857 0.64931975 0.15201056 4.40033815 0.20729874 9.54519873] 684.048325909
<matplotlib.axes.AxesSubplot at 0xd07c2cc>
Let's run for 500,000 iterations this time.
param_names2 = ('p1_center', 'p1_width', 'height_ratio', 'p2_center', 'p2_width', 'nu')
# Fixed seed so the run is reproducible.
np.random.seed(13)
mcmc2 = MCMC(proposer2, chain_start2, param_names=param_names2, initial_chain_length=500000)
# IPython magic: time the full 500,000-step run.
%time mcmc2.run()
CPU times: user 14min 14s, sys: 848 ms, total: 14min 15s Wall time: 14min 14s
<__main__.MCMC at 0xd0837ac>
Now, visualizing the fit at evenly-spaced intervals along the chain, we see something somewhat surprising. The small hump moves from the right to the left somewhere around the 100,000th iteration and stays there for the remainder of the chain.
_ = plot_some_fits(mcmc2, proposer2)
Looking at a scatter plot of the hump centers along with a plot of the negative log likelihood, we see a large drop in negative log likelihood around iteration 75,000 as the small hump moves to the left. Note that, as this happens, we get a better fit than we initially found as the maximum likelihood estimate. This suggests that the minimization was too sensitive to my (bad) parameter guess that had the small hump on the right.
_ = scatter_with_fit_metric(mcmc2, proposer2, 'p1_center', 'p2_center')
To see the difference between the maximum likelihood fit and the best fit found with MCMC, I find and visualize the step in the chain that minimizes the negative log likelihood.
chain2 = mcmc2.get_chain()
# Negative log likelihood at every step (memoized inside the proposer).
nlls2 = np.array([proposer2.mixture_neg_ll(p) for p in chain2])
min_idx2 = np.argmin(nlls2)
print 'Minimum negative log likelihood at index', min_idx2, '--', nlls2[min_idx2]
min_params2 = chain2[min_idx2]
print 'Params', min_params2
# Visualize the best fit found anywhere along the chain.
fig, ax = plt.subplots()
proposer2.plot_fit(ax, min_params2)
Minimum negative log likelihood at index 150175 -- 676.79887757 Params [ 2.05905712 0.12025956 2.6920738 3.25694949 0.63954603 9.05945015]
<matplotlib.axes.AxesSubplot at 0x113070ec>
It seems reasonable to suggest that the chain has not mixed well until the small hump moves to the left. To see more evidence of this, I can plot the position of the small hump as a function of the number of iterations.
chain2 = mcmc2.get_chain()
# height_ratio (index 2) < 1 means component 1 is the taller hump, so the
# smaller hump's center is p2_center (index 3); otherwise it's p1_center
# (index 0).
hump1_bigger = (chain2[:, 2] < 1)
hump2_bigger = np.invert(hump1_bigger)
# np.float was removed in numpy 1.24; the builtin float is equivalent.
small_hump_center = np.zeros(len(chain2), dtype=float)
small_hump_center[hump1_bigger] = chain2[hump1_bigger, 3]
small_hump_center[hump2_bigger] = chain2[hump2_bigger, 0]
plt.plot(small_hump_center)
plt.xlabel('Iteration')
plt.ylabel('Position of smaller hump')
<matplotlib.text.Text at 0xc3fc4ec>
Based on the analysis above, I now exclude the first 100,000 iterations and continue visualizing the results.
# Discard the first 100,000 steps as burn-in (the chain only mixes once the
# small hump has relocated), then redo the summary plots.
mcmc2.burned_in_index = 100000
_ = param_hists(mcmc2)
_ = mcmc2.scatter_matrix()
The nu vs. height_ratio scatter plot looks interesting. Here it is in more detail:
_ = scatter_with_fit_metric(mcmc2, proposer2, 'nu', 'height_ratio')