import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.misc
import scipy.optimize
import scipy.stats
data = np.loadtxt('data/Mixturevals.txt')
print len(data), 'data points'
plt.hist(data, histtype='step', log=True)
plt.title("Histogram of the data")
1000 data points
<matplotlib.text.Text at 0x9e571cc>
Plotting a histogram of the data with $p(x)$ and various exponential PDFs shows that many of the points likely came from $p(x)$ since their values are so large. Zooming in shows that there are more small values than the $p(x)$ distribution can explain, however, so many of the points also likely came from an exponential distribution.
def plot_data(data):
    """Overlay a normalized histogram of the samples with p(x) and several
    candidate exponential PDFs, on a log-scale y axis for comparison."""
    grid = np.arange(0, 250, 0.1)
    # p(x) = (2/pi) / (1 + x^2), a half-Cauchy-style density.
    density_p = (2.0 / np.pi) / (1.0 + grid**2)
    plt.hist(data, normed=True, bins=20, label='Data', histtype='step')
    plt.plot(grid, density_p, label='$p(x)$')
    for beta in (0.1, 0.5, 5):
        exp_dist = scipy.stats.expon(scale=(1.0 / beta))
        plt.plot(grid, exp_dist.pdf(grid), label="$Exponential({0})$".format(beta))
    plt.yscale('log')
    plt.ylim(10e-10, 0.1)
    plt.legend()
    plt.title("Histogram with possible PDFs")
plot_data(data)
def plot_data_zoomed(data):
    """Zoom in on the small-value region with log-spaced bins to expose the
    excess of small samples that p(x) alone cannot explain."""
    xs = np.logspace(-10, 3, num=1000, base=2)
    density_p = (2.0 / np.pi) / (1.0 + xs**2)
    log_bins = np.logspace(-10, 3, num=20, base=2)
    plt.hist(data, normed=True, bins=log_bins, label='Data', histtype='step')
    plt.plot(xs, density_p, label='$p(x)$')
    plt.xscale('log')
    plt.xlim(10e-3, 4)
    plt.legend()
plot_data_zoomed(data)
Minimize the negative log probability of the data to get the maximum likelihood estimate of the parameters.
def _log_prob(beta, c, data):
prob_exp = beta * np.exp(-beta * data)
prob_p = (2.0 / np.pi) / (1.0 + data**2)
return np.sum(np.log((prob_exp * c) + (prob_p * (1.0 - c))))
log_prob = np.vectorize(_log_prob, excluded=[2])
def negative_log_prob_data(params, data):
    """Objective for scipy.optimize.fmin: the negated mixture log-likelihood.

    `params` is the (beta, c) pair being optimized; `data` the sample array.
    """
    return -log_prob(params[0], params[1], data)
guess = [2, 0.5]
beta_mle, c_mle = scipy.optimize.fmin(negative_log_prob_data, guess, args=(data,))
print "Maximum likelihood estimates of beta and c:", beta_mle, c_mle
Optimization terminated successfully. Current function value: 1721.730613 Iterations: 81 Function evaluations: 151 Maximum likelihood estimates of beta and c: 0.665297780416 0.359560194534
-c:4: RuntimeWarning: invalid value encountered in log
Plot the joint posterior distribution of the parameters, then marginalize over $c$ to get the posterior distribution of $\beta$ alone.
# Evaluate the posterior (flat prior) on a (beta, c) grid, normalize it in a
# numerically stable way, and contour-plot it with the MLE point marked.
betas = np.arange(0, 1, 0.005)
cs = np.arange(0, 1, 0.005)
betas_grid, cs_grid = np.meshgrid(betas, cs)
log_probs = log_prob(betas_grid, cs_grid, data)
# Subtract the max log-probability before exponentiating to avoid underflow.
scaled_probs = np.exp(log_probs - log_probs.max())
scaled_probs /= scaled_probs.sum()
plt.contour(betas_grid, cs_grid, scaled_probs)
plt.plot([beta_mle], [c_mle], color='red', marker='*', label='MLE Estimate')
plt.colorbar()
plt.legend()
plt.xlabel('$\\beta$')
plt.ylabel('$c$')
plt.title('Joint Posterior Distribution of Parameters')
<matplotlib.text.Text at 0xa43fd2c>
# Marginalize the joint posterior over c (sum down each column) and compare
# the resulting P(beta) with the MLE estimate of beta.
marginal_beta = scaled_probs.sum(axis=0)
plt.plot(betas, marginal_beta, label='$\\beta$')
plt.axvline(x=beta_mle, label='MLE $\\beta$ Estimate', color='red', linestyle='--')
plt.title('$P(\\beta)$')
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0xa1f8cac>