I share here a number of the free code examples that I think just scratch the surface of the really good content of the book.
Free, open source code to reproduce every example in the book.
from IPython.display import Image
import matplotlib.pyplot as plt
# The following line is an ipython notebook trick
%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 8 # plotsize
Image(filename='text_cover1.png')
Bayesian Blocks is a dynamic histogramming method which optimizes one of several possible fitness functions to determine an optimal binning for data, where the bins are not necessarily uniform width.
The code below uses a fitness function suitable for event data with possible repeats.
source: http://www.astroml.org/examples/algorithms/plot_bayesian_blocks.html
# Author: Jake VanderPlas <vanderplas@astro.washington.edu>
# License: BSD
# The figure is an example from astroML: see http://astroML.github.com
# Example has been modified slightly by Jonathan Whitmore
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
from astroML.plotting import hist
# draw a set of variables
np.random.seed(0)
t = np.concatenate([stats.cauchy(-5, 1.8).rvs(500),
stats.cauchy(-4, 0.8).rvs(2000),
stats.cauchy(-1, 0.3).rvs(500),
stats.cauchy(2, 0.8).rvs(1000),
stats.cauchy(4, 1.5).rvs(500)])
# truncate values to a reasonable range
t = t[(t > -15) & (t < 15)]
4323
plt.rcParams['figure.figsize'] = 12, 8 # plotsize
xval = np.linspace(-15, 15, 2000)
jw = stats.cauchy(-5, 1.8).pdf(xval) * 500
jw += stats.cauchy(-4, 0.8).pdf(xval) * 2000
jw += stats.cauchy(-1, 0.3).pdf(xval) * 500
jw += stats.cauchy(2, 0.8).pdf(xval) * 1000
jw += stats.cauchy(4, 1.5).pdf(xval)* 500
jw = jw / 2000.0
plt.xlabel("t", fontsize=25)
plt.ylabel("Some PDF", fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.ylim(0, 0.5)
plt.plot(xval, jw)
[<matplotlib.lines.Line2D at 0x12423df10>]
stats.cauchy?
plt.rcParams['figure.figsize'] = 12, 8 # plotsize
alpha = .9
# First figure: show normal histogram binning
fig = plt.figure(figsize=(10, 4))
fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15)
ax1 = fig.add_subplot(121)
ax1.hist(t, bins=15, histtype='stepfilled', alpha=alpha, normed=True)
ax1.set_xlabel('t')
ax1.set_ylabel('P(t)')
ax2 = fig.add_subplot(122)
ax2.hist(t, bins=200, histtype='stepfilled', alpha=alpha, normed=True)
ax2.set_xlabel('t')
ax2.set_ylabel('P(t)')
<matplotlib.text.Text at 0x1222ee4d0>
np.random.seed(0)
N = 10000
mu_gamma_f = [(5, 1.0, 0.1),
(7, 0.5, 0.5),
(9, 0.1, 0.1),
(12, 0.5, 0.2),
(14, 1.0, 0.1)]
true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x)
for (mu, gamma, f) in mu_gamma_f])
jw_modify_true_pdf = np.array([true_pdf(x) for x in t])
x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N))
for (mu, gamma, f) in mu_gamma_f])
np.random.shuffle(x)
x = x[x > -10]
x = x[x < 30]
fig = plt.figure(figsize=(10, 10))
fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1)
N_values = (500, 5000)
subplots = (211, 212)
for N, subplot in zip(N_values, subplots):
ax = fig.add_subplot(subplot)
xN = x[:N]
t = np.linspace(-10, 30, 1000)
# plot the results
ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')
# hist(xN, bins='knuth', ax=ax, normed=True,
# histtype='stepfilled', alpha=0.3,
# label='Knuth Histogram')
hist(xN, bins='blocks', ax=ax, normed=True,
histtype='step', color='red', linewidth=2.0,
label="Bayesian Blocks")
ax.plot(t, true_pdf(t), '-', color='black',
label="Generating Distribution", linewidth=2.0, alpha=0.5)
# label the plot
ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
transform=ax.transAxes, fontsize=25)
ax.set_ylabel('$p(x)$', fontsize=25.0)
ax.legend(loc='upper right', prop=dict(size=15))
if subplot == 212:
ax.set_xlabel('$x$', fontsize=25.0)
#ax.set_xticklabels(fontsize=20)
ax.set_xlim(0, 20)
ax.set_ylim(-0.01, 0.4001)
plt.show()
An important feature of this method is that the bins are optimal in a quantitative sense, meaning that statistical significance can be attached to the bin configuration. -- p228
astroml
.# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from astroML.density_estimation import XDGMM
from astroML.crossmatch import crossmatch
from astroML.datasets import fetch_sdss_S82standards, fetch_imaging_sample
from astroML.plotting.tools import draw_ellipse
from astroML.decorators import pickle_results
from astroML.stats import sigmaG
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
#from astroML.plotting import setup_text_plots
#setup_text_plots(fontsize=8, usetex=False)
#------------------------------------------------------------
# define u-g-r-i-z extinction from Berry et al, arXiv 1111.4985
# multiply extinction by A_r
extinction_vector = np.array([1.810, 1.400, 1.0, 0.759, 0.561])
#----------------------------------------------------------------------
# Fetch and process the noisy imaging data
data_noisy = fetch_imaging_sample()
# select only stars
data_noisy = data_noisy[data_noisy['type'] == 6]
# Get the extinction-corrected magnitudes for each band
X = np.vstack([data_noisy[f + 'RawPSF'] for f in 'ugriz']).T
Xerr = np.vstack([data_noisy[f + 'psfErr'] for f in 'ugriz']).T
# extinction terms from Berry et al, arXiv 1111.4985
X -= (extinction_vector * data_noisy['rExtSFD'][:, None])
#----------------------------------------------------------------------
# Fetch and process the stacked imaging data
data_stacked = fetch_sdss_S82standards()
# cut to RA, DEC range of imaging sample
RA = data_stacked['RA']
DEC = data_stacked['DEC']
data_stacked = data_stacked[(RA > 0) & (RA < 10) &
(DEC > -1) & (DEC < 1)]
# get stacked magnitudes for each band
Y = np.vstack([data_stacked['mmu_' + f] for f in 'ugriz']).T
Yerr = np.vstack([data_stacked['msig_' + f] for f in 'ugriz']).T
# extinction terms from Berry et al, arXiv 1111.4985
Y -= (extinction_vector * data_stacked['A_r'][:, None])
# quality cuts
g = Y[:, 1]
mask = ((Yerr.max(1) < 0.05) &
(g < 20))
data_stacked = data_stacked[mask]
Y = Y[mask]
Yerr = Yerr[mask]
/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/astropy/logger.py:531: RuntimeWarning: log file u'/Users/jwhitmore/.astropy/config/astropy.log' could not be opened for writing: [Errno 13] Permission denied: u'/Users/jwhitmore/.astropy/config/astropy.log' '{1}'.format(log_file_path, unicode(e)), RuntimeWarning)
#----------------------------------------------------------------------
# cross-match
# the imaging sample contains both standard and variable stars. We'll
# perform a cross-match with the standard star catalog and choose objects
# which are common to both.
Xlocs = np.hstack((data_noisy['ra'][:, np.newaxis],
data_noisy['dec'][:, np.newaxis]))
Ylocs = np.hstack((data_stacked['RA'][:, np.newaxis],
data_stacked['DEC'][:, np.newaxis]))
print "number of noisy points: ", Xlocs.shape
print "number of stacked points:", Ylocs.shape
# find all points within 0.9 arcsec. This cutoff was selected
# by plotting a histogram of the log(distances).
dist, ind = crossmatch(Xlocs, Ylocs, max_distance=0.9 / 3600.)
noisy_mask = (~np.isinf(dist))
stacked_mask = ind[noisy_mask]
# select the data
data_noisy = data_noisy[noisy_mask]
X = X[noisy_mask]
Xerr = Xerr[noisy_mask]
data_stacked = data_stacked[stacked_mask]
Y = Y[stacked_mask]
Yerr = Yerr[stacked_mask]
# double-check that our cross-match succeeded
assert X.shape == Y.shape
print "size after crossmatch:", X.shape
number of noisy points: (82003, 2) number of stacked points: (13377, 2) size after crossmatch: (12313, 5)
#----------------------------------------------------------------------
# perform extreme deconvolution on the noisy sample
# first define mixing matrix W
W = np.array([[0, 1, 0, 0, 0], # g magnitude
[1, -1, 0, 0, 0], # u-g color
[0, 1, -1, 0, 0], # g-r color
[0, 0, 1, -1, 0], # r-i color
[0, 0, 0, 1, -1]]) # i-z color
X = np.dot(X, W.T)
Y = np.dot(Y, W.T)
# compute error covariance from mixing matrix
Xcov = np.zeros(Xerr.shape + Xerr.shape[-1:])
Xcov[:, range(Xerr.shape[1]), range(Xerr.shape[1])] = Xerr ** 2
# each covariance C = WCW^T
# best way to do this is with a tensor dot-product
Xcov = np.tensordot(np.dot(Xcov, W.T), W, (-2, -1))
#----------------------------------------------------------------------
# This is a long calculation: save results to file
@pickle_results("XD_stellar.pkl")
def compute_XD(n_clusters=12, rseed=0, n_iter=100, verbose=True):
np.random.seed(rseed)
clf = XDGMM(n_clusters, n_iter=n_iter, tol=1E-5, verbose=verbose)
clf.fit(X, Xcov)
return clf
clf = compute_XD(12)
@pickle_results: using precomputed results from 'XD_stellar.pkl'
#------------------------------------------------------------
# Fit and sample from the underlying distribution
np.random.seed(42)
X_sample = clf.sample(X.shape[0])
#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(10, 10))
fig.subplots_adjust(left=0.12, right=0.95,
bottom=0.1, top=0.95,
wspace=0.02, hspace=0.02)
# only plot 1/10 of the stars for clarity
ax1 = fig.add_subplot(221)
ax1.scatter(Y[::10, 2], Y[::10, 3], s=9, lw=0, c='k')
ax2 = fig.add_subplot(222)
ax2.scatter(X[::10, 2], X[::10, 3], s=9, lw=0, c='k')
ax3 = fig.add_subplot(223)
ax3.scatter(X_sample[::10, 2], X_sample[::10, 3], s=9, lw=0, c='k')
ax4 = fig.add_subplot(224)
for i in range(clf.n_components):
draw_ellipse(clf.mu[i, 2:4], clf.V[i, 2:4, 2:4], scales=[2],
ec='k', fc='gray', alpha=0.2, ax=ax4)
titles = ["Standard Stars", "Single Epoch",
"Extreme Deconvolution\n resampling",
"Extreme Deconvolution\n cluster locations"]
ax = [ax1, ax2, ax3, ax4]
for i in range(4):
ax[i].set_xlim(-0.6, 1.8)
ax[i].set_ylim(-0.6, 1.8)
ax[i].xaxis.set_major_locator(plt.MultipleLocator(0.5))
ax[i].yaxis.set_major_locator(plt.MultipleLocator(0.5))
ax[i].text(0.05, 0.95, titles[i],
ha='left', va='top', transform=ax[i].transAxes)
if i in (0, 1):
ax[i].xaxis.set_major_formatter(plt.NullFormatter())
else:
ax[i].set_xlabel('$g-r$')
if i in (1, 3):
ax[i].yaxis.set_major_formatter(plt.NullFormatter())
else:
ax[i].set_ylabel('$r-i$')
#------------------------------------------------------------
# Second figure: the width of the locus
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
labels = ['single epoch', 'standard stars', 'XD resampled']
linestyles = ['solid', 'dashed', 'dotted']
for data, label, ls in zip((X, Y, X_sample), labels, linestyles):
g = data[:, 0]
gr = data[:, 2]
ri = data[:, 3]
r = g - gr
i = r - ri
mask = (gr > 0.3) & (gr < 1.0)
g = g[mask]
r = r[mask]
i = i[mask]
w = -0.227 * g + 0.792 * r - 0.567 * i + 0.05
sigma = sigmaG(w)
ax.hist(w, bins=np.linspace(-0.08, 0.08, 100), linestyle=ls,
histtype='step', label=label + '\n\t' + r'$\sigma_G=%.3f$' % sigma,
normed=True)
ax.legend(loc=2)
ax.text(0.95, 0.95, '$w = -0.227g + 0.792r$\n$ - 0.567i + 0.05$',
transform=ax.transAxes, ha='right', va='top')
ax.set_xlim(-0.07, 0.07)
ax.set_ylim(0, 55)
ax.set_xlabel('$w$')
ax.set_ylabel('$N(w)$')
<matplotlib.text.Text at 0x10ca18450>
# Author: Jake VanderPlas <vanderplas@astro.washington.edu>
# License: BSD
# The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from astroML.datasets import fetch_sdss_galaxy_colors
from astroML.plotting import scatter_contour
n_neighbors = 1
data = fetch_sdss_galaxy_colors()
N = len(data)
# shuffle data
np.random.seed(0)
np.random.shuffle(data)
# put colors in a matrix
X = np.zeros((N, 4))
X[:, 0] = data['u'] - data['g']
X[:, 1] = data['g'] - data['r']
X[:, 2] = data['r'] - data['i']
X[:, 3] = data['i'] - data['z']
z = data['redshift']
# divide into training and testing data
Ntrain = N / 2
Xtrain = X[:Ntrain]
ztrain = z[:Ntrain]
Xtest = X[Ntrain:]
ztest = z[Ntrain:]
knn = KNeighborsRegressor(n_neighbors, weights='uniform')
zpred = knn.fit(Xtrain, ztrain).predict(Xtest)
axis_lim = np.array([-0.1, 2.5])
rms = np.sqrt(np.mean((ztest - zpred) ** 2))
print "RMS error = %.2g" % rms
RMS error = 0.24
ax = plt.axes()
plt.scatter(ztest, zpred, c='k', lw=0, s=4)
plt.plot(axis_lim, axis_lim, '--k')
plt.plot(axis_lim, axis_lim + rms, ':k')
plt.plot(axis_lim, axis_lim - rms, ':k')
plt.xlim(axis_lim)
plt.ylim(axis_lim)
plt.text(0.99, 0.02, "RMS error = %.2g" % rms,
ha='right', va='bottom', transform=ax.transAxes,
bbox=dict(ec='w', fc='w'), fontsize=16)
plt.title('Photo-z: Nearest Neigbor Regression', fontsize=20)
plt.xlabel(r'$\mathrm{z_{spec}}$', fontsize=24)
plt.ylabel(r'$\mathrm{z_{phot}}$', fontsize=24)
plt.show()
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
from astroML.density_estimation import GaussianMixture1D
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=18, usetex=True)
#------------------------------------------------------------
# Generate the data
mu1_in = 0
sigma1_in = 0.3
mu2_in = 1
sigma2_in = 1
ratio_in = 1.5
N = 200
np.random.seed(10)
gm = GaussianMixture1D([mu1_in, mu2_in],
[sigma1_in, sigma2_in],
[ratio_in, 1])
x_sample = gm.sample(N)
#------------------------------------------------------------
# Get the MLE fit for a single gaussian
sample_mu = np.mean(x_sample)
sample_std = np.std(x_sample, ddof=1)
#------------------------------------------------------------
# Plot the sampled data
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(x_sample, 20, histtype='stepfilled', normed=True, fc='#CCCCCC')
x = np.linspace(-2.1, 4.1, 1000)
factor1 = ratio_in / (1. + ratio_in)
factor2 = 1. / (1. + ratio_in)
ax.plot(x, gm.pdf(x), '-k', label='true distribution')
ax.plot(x, gm.pdf_individual(x), ':k')
ax.plot(x, norm.pdf(x, sample_mu, sample_std), '--k', label='best fit normal')
ax.legend(loc=1)
ax.set_xlim(-2.1, 4.1)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')
ax.set_title('Input pdf and sampled data')
ax.text(0.95, 0.80, ('$\mu_1 = 0;\ \sigma_1=0.3$\n'
'$\mu_2=1;\ \sigma_2=1.0$\n'
'$\mathrm{ratio}=1.5$'),
transform=ax.transAxes, ha='right', va='top')
plt.show()
/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16. warnings.warn(msg, category=DeprecationWarning) /Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16. warnings.warn(msg, category=DeprecationWarning)