import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.misc
import scipy.optimize
import scipy.stats
data = np.loadtxt('data/Mixturevals.txt')
print len(data), 'data points'
plt.hist(data, histtype='step', log=True)
plt.title("Histogram of the data")
1000 data points
<matplotlib.text.Text at 0x9e571cc>
Plotting a histogram of the data with $p(x)$ and various exponential PDFs shows that many of the points likely came from $p(x)$ since their values are so large. Zooming in shows that there are more small values than the $p(x)$ distribution can explain, however, so many of the points also likely came from an exponential distribution.
def plot_data(data):
    """Overlay a normalized histogram of the samples with p(x) and several
    candidate exponential PDFs, on a log-scale y axis for comparison."""
    grid = np.arange(0, 250, 0.1)
    # p(x) = (2/pi) / (1 + x^2), a half-Cauchy-style density.
    density_p = (2.0 / np.pi) / (1.0 + grid**2)
    plt.hist(data, normed=True, bins=20, label='Data', histtype='step')
    plt.plot(grid, density_p, label='$p(x)$')
    for beta in (0.1, 0.5, 5):
        exp_dist = scipy.stats.expon(scale=(1.0 / beta))
        plt.plot(grid, exp_dist.pdf(grid), label="$Exponential({0})$".format(beta))
    plt.yscale('log')
    plt.ylim(10e-10, 0.1)
    plt.legend()
    plt.title("Histogram with possible PDFs")
plot_data(data)
def plot_data_zoomed(data):
    """Zoom in on the small-value region with log-spaced bins to expose the
    excess of small samples that p(x) alone cannot explain."""
    xs = np.logspace(-10, 3, num=1000, base=2)
    density_p = (2.0 / np.pi) / (1.0 + xs**2)
    log_bins = np.logspace(-10, 3, num=20, base=2)
    plt.hist(data, normed=True, bins=log_bins, label='Data', histtype='step')
    plt.plot(xs, density_p, label='$p(x)$')
    plt.xscale('log')
    plt.xlim(10e-3, 4)
    plt.legend()
plot_data_zoomed(data)
Minimize the negative log probability of the data to get the maximum likelihood estimate of the parameters.
def _log_prob(beta, c, data):
prob_exp = beta * np.exp(-beta * data)
prob_p = (2.0 / np.pi) / (1.0 + data**2)
return np.sum(np.log((prob_exp * c) + (prob_p * (1.0 - c))))
log_prob = np.vectorize(_log_prob, excluded=[2])
def negative_log_prob_data(params, data):
    """Objective for scipy.optimize.fmin: the negated mixture log-likelihood.

    `params` is the (beta, c) pair being optimized; `data` the sample array.
    """
    return -log_prob(params[0], params[1], data)
guess = [2, 0.5]
beta_mle, c_mle = scipy.optimize.fmin(negative_log_prob_data, guess, args=(data,))
print "Maximum likelihood estimates of beta and c:", beta_mle, c_mle
Optimization terminated successfully. Current function value: 1721.730613 Iterations: 81 Function evaluations: 151 Maximum likelihood estimates of beta and c: 0.665297780416 0.359560194534
-c:4: RuntimeWarning: invalid value encountered in log
Plot the joint posterior distribution of the parameters, then marginalize over $c$ to get the posterior distribution of $\beta$ alone.
# Evaluate the posterior (flat prior) on a (beta, c) grid, normalize it in a
# numerically stable way, and contour-plot it with the MLE point marked.
betas = np.arange(0, 1, 0.005)
cs = np.arange(0, 1, 0.005)
betas_grid, cs_grid = np.meshgrid(betas, cs)
log_probs = log_prob(betas_grid, cs_grid, data)
# Subtract the max log-probability before exponentiating to avoid underflow.
scaled_probs = np.exp(log_probs - log_probs.max())
scaled_probs /= scaled_probs.sum()
plt.contour(betas_grid, cs_grid, scaled_probs)
plt.plot([beta_mle], [c_mle], color='red', marker='*', label='MLE Estimate')
plt.colorbar()
plt.legend()
plt.xlabel('$\\beta$')
plt.ylabel('$c$')
plt.title('Joint Posterior Distribution of Parameters')
<matplotlib.text.Text at 0xa43fd2c>
# Marginalize the joint posterior over c (sum down each column) and compare
# the resulting P(beta) with the MLE estimate of beta.
marginal_beta = scaled_probs.sum(axis=0)
plt.plot(betas, marginal_beta, label='$\\beta$')
plt.axvline(x=beta_mle, label='MLE $\\beta$ Estimate', color='red', linestyle='--')
plt.title('$P(\\beta)$')
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0xa1f8cac>