from IPython.display import Image import matplotlib.pyplot as plt # The following line is an ipython notebook trick %matplotlib inline plt.rcParams['figure.figsize'] = 12, 8 # plotsize Image(filename='text_cover1.png') # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com # Example has been modified slightly by Jonathan Whitmore import numpy as np from scipy import stats from matplotlib import pyplot as plt from astroML.plotting import hist # draw a set of variables np.random.seed(0) t = np.concatenate([stats.cauchy(-5, 1.8).rvs(500), stats.cauchy(-4, 0.8).rvs(2000), stats.cauchy(-1, 0.3).rvs(500), stats.cauchy(2, 0.8).rvs(1000), stats.cauchy(4, 1.5).rvs(500)]) # truncate values to a reasonable range t = t[(t > -15) & (t < 15)] plt.rcParams['figure.figsize'] = 12, 8 # plotsize xval = np.linspace(-15, 15, 2000) jw = stats.cauchy(-5, 1.8).pdf(xval) * 500 jw += stats.cauchy(-4, 0.8).pdf(xval) * 2000 jw += stats.cauchy(-1, 0.3).pdf(xval) * 500 jw += stats.cauchy(2, 0.8).pdf(xval) * 1000 jw += stats.cauchy(4, 1.5).pdf(xval)* 500 jw = jw / 2000.0 plt.xlabel("t", fontsize=25) plt.ylabel("Some PDF", fontsize=25) plt.xticks(fontsize=25) plt.yticks(fontsize=25) plt.ylim(0, 0.5) plt.plot(xval, jw) stats.cauchy? plt.rcParams['figure.figsize'] = 12, 8 # plotsize alpha = .9 # First figure: show normal histogram binning fig = plt.figure(figsize=(10, 4)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15) ax1 = fig.add_subplot(121) ax1.hist(t, bins=15, histtype='stepfilled', alpha=alpha, normed=True) ax1.set_xlabel('t') ax1.set_ylabel('P(t)') ax2 = fig.add_subplot(122) ax2.hist(t, bins=200, histtype='stepfilled', alpha=alpha, normed=True) ax2.set_xlabel('t') ax2.set_ylabel('P(t)') np.random.seed(0) N = 10000 mu_gamma_f = [(5, 1.0, 0.1), (7, 0.5, 0.5), (9, 0.1, 0.1), (12, 0.5, 0.2), (14, 1.0, 0.1)] true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x) for (mu, gamma, f) in mu_gamma_f]) jw_modify_true_pdf = np.array([true_pdf(x) for x in t]) x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N)) for (mu, gamma, f) in mu_gamma_f]) np.random.shuffle(x) x = x[x > -10] x = x[x < 30] fig = plt.figure(figsize=(10, 10)) fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1) N_values = (500, 5000) subplots = (211, 212) for N, subplot in zip(N_values, subplots): ax = fig.add_subplot(subplot) xN = x[:N] t = np.linspace(-10, 30, 1000) # plot the results ax.plot(xN, -0.005 * np.ones(len(xN)), '|k') # hist(xN, bins='knuth', ax=ax, normed=True, # histtype='stepfilled', alpha=0.3, # label='Knuth Histogram') hist(xN, bins='blocks', ax=ax, normed=True, histtype='step', color='red', linewidth=2.0, label="Bayesian Blocks") ax.plot(t, true_pdf(t), '-', color='black', label="Generating Distribution", linewidth=2.0, alpha=0.5) # label the plot ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top', transform=ax.transAxes, fontsize=25) ax.set_ylabel('$p(x)$', fontsize=25.0) ax.legend(loc='upper right', prop=dict(size=15)) if subplot == 212: ax.set_xlabel('$x$', fontsize=25.0) #ax.set_xticklabels(fontsize=20) ax.set_xlim(0, 20) ax.set_ylim(-0.01, 0.4001) plt.show() # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.density_estimation import XDGMM from astroML.crossmatch import crossmatch from astroML.datasets import fetch_sdss_S82standards, fetch_imaging_sample from astroML.plotting.tools import draw_ellipse from astroML.decorators import pickle_results from astroML.stats import sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. #from astroML.plotting import setup_text_plots #setup_text_plots(fontsize=8, usetex=False) #------------------------------------------------------------ # define u-g-r-i-z extinction from Berry et al, arXiv 1111.4985 # multiply extinction by A_r extinction_vector = np.array([1.810, 1.400, 1.0, 0.759, 0.561]) #---------------------------------------------------------------------- # Fetch and process the noisy imaging data data_noisy = fetch_imaging_sample() # select only stars data_noisy = data_noisy[data_noisy['type'] == 6] # Get the extinction-corrected magnitudes for each band X = np.vstack([data_noisy[f + 'RawPSF'] for f in 'ugriz']).T Xerr = np.vstack([data_noisy[f + 'psfErr'] for f in 'ugriz']).T # extinction terms from Berry et al, arXiv 1111.4985 X -= (extinction_vector * data_noisy['rExtSFD'][:, None]) #---------------------------------------------------------------------- # Fetch and process the stacked imaging data data_stacked = fetch_sdss_S82standards() # cut to RA, DEC range of imaging sample RA = data_stacked['RA'] DEC = data_stacked['DEC'] data_stacked = data_stacked[(RA > 0) & (RA < 10) & (DEC > -1) & (DEC < 1)] # get stacked magnitudes for each band Y = np.vstack([data_stacked['mmu_' + f] for f in 'ugriz']).T Yerr = np.vstack([data_stacked['msig_' + f] for f in 'ugriz']).T # extinction terms from Berry et al, arXiv 1111.4985 Y -= (extinction_vector * data_stacked['A_r'][:, None]) # quality cuts g = Y[:, 1] mask = ((Yerr.max(1) < 0.05) & (g < 20)) data_stacked = data_stacked[mask] Y = Y[mask] Yerr = Yerr[mask] #---------------------------------------------------------------------- # cross-match # the imaging sample contains both standard and variable stars. We'll # perform a cross-match with the standard star catalog and choose objects # which are common to both. Xlocs = np.hstack((data_noisy['ra'][:, np.newaxis], data_noisy['dec'][:, np.newaxis])) Ylocs = np.hstack((data_stacked['RA'][:, np.newaxis], data_stacked['DEC'][:, np.newaxis])) print "number of noisy points: ", Xlocs.shape print "number of stacked points:", Ylocs.shape # find all points within 0.9 arcsec. This cutoff was selected # by plotting a histogram of the log(distances). dist, ind = crossmatch(Xlocs, Ylocs, max_distance=0.9 / 3600.) noisy_mask = (~np.isinf(dist)) stacked_mask = ind[noisy_mask] # select the data data_noisy = data_noisy[noisy_mask] X = X[noisy_mask] Xerr = Xerr[noisy_mask] data_stacked = data_stacked[stacked_mask] Y = Y[stacked_mask] Yerr = Yerr[stacked_mask] # double-check that our cross-match succeeded assert X.shape == Y.shape print "size after crossmatch:", X.shape #---------------------------------------------------------------------- # perform extreme deconvolution on the noisy sample # first define mixing matrix W W = np.array([[0, 1, 0, 0, 0], # g magnitude [1, -1, 0, 0, 0], # u-g color [0, 1, -1, 0, 0], # g-r color [0, 0, 1, -1, 0], # r-i color [0, 0, 0, 1, -1]]) # i-z color X = np.dot(X, W.T) Y = np.dot(Y, W.T) # compute error covariance from mixing matrix Xcov = np.zeros(Xerr.shape + Xerr.shape[-1:]) Xcov[:, range(Xerr.shape[1]), range(Xerr.shape[1])] = Xerr ** 2 # each covariance C = WCW^T # best way to do this is with a tensor dot-product Xcov = np.tensordot(np.dot(Xcov, W.T), W, (-2, -1)) #---------------------------------------------------------------------- # This is a long calculation: save results to file @pickle_results("XD_stellar.pkl") def compute_XD(n_clusters=12, rseed=0, n_iter=100, verbose=True): np.random.seed(rseed) clf = XDGMM(n_clusters, n_iter=n_iter, tol=1E-5, verbose=verbose) clf.fit(X, Xcov) return clf clf = compute_XD(12) #------------------------------------------------------------ # Fit and sample from the underlying distribution np.random.seed(42) X_sample = clf.sample(X.shape[0]) #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(10, 10)) fig.subplots_adjust(left=0.12, right=0.95, bottom=0.1, top=0.95, wspace=0.02, hspace=0.02) # only plot 1/10 of the stars for clarity ax1 = fig.add_subplot(221) ax1.scatter(Y[::10, 2], Y[::10, 3], s=9, lw=0, c='k') ax2 = fig.add_subplot(222) ax2.scatter(X[::10, 2], X[::10, 3], s=9, lw=0, c='k') ax3 = fig.add_subplot(223) ax3.scatter(X_sample[::10, 2], X_sample[::10, 3], s=9, lw=0, c='k') ax4 = fig.add_subplot(224) for i in range(clf.n_components): draw_ellipse(clf.mu[i, 2:4], clf.V[i, 2:4, 2:4], scales=[2], ec='k', fc='gray', alpha=0.2, ax=ax4) titles = ["Standard Stars", "Single Epoch", "Extreme Deconvolution\n resampling", "Extreme Deconvolution\n cluster locations"] ax = [ax1, ax2, ax3, ax4] for i in range(4): ax[i].set_xlim(-0.6, 1.8) ax[i].set_ylim(-0.6, 1.8) ax[i].xaxis.set_major_locator(plt.MultipleLocator(0.5)) ax[i].yaxis.set_major_locator(plt.MultipleLocator(0.5)) ax[i].text(0.05, 0.95, titles[i], ha='left', va='top', transform=ax[i].transAxes) if i in (0, 1): ax[i].xaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_xlabel('$g-r$') if i in (1, 3): ax[i].yaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_ylabel('$r-i$') #------------------------------------------------------------ # Second figure: the width of the locus fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111) labels = ['single epoch', 'standard stars', 'XD resampled'] linestyles = ['solid', 'dashed', 'dotted'] for data, label, ls in zip((X, Y, X_sample), labels, linestyles): g = data[:, 0] gr = data[:, 2] ri = data[:, 3] r = g - gr i = r - ri mask = (gr > 0.3) & (gr < 1.0) g = g[mask] r = r[mask] i = i[mask] w = -0.227 * g + 0.792 * r - 0.567 * i + 0.05 sigma = sigmaG(w) ax.hist(w, bins=np.linspace(-0.08, 0.08, 100), linestyle=ls, histtype='step', label=label + '\n\t' + r'$\sigma_G=%.3f$' % sigma, normed=True) ax.legend(loc=2) ax.text(0.95, 0.95, '$w = -0.227g + 0.792r$\n$ - 0.567i + 0.05$', transform=ax.transAxes, ha='right', va='top') ax.set_xlim(-0.07, 0.07) ax.set_ylim(0, 55) ax.set_xlabel('$w$') ax.set_ylabel('$N(w)$') # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from sklearn.neighbors import KNeighborsRegressor from astroML.datasets import fetch_sdss_galaxy_colors from astroML.plotting import scatter_contour n_neighbors = 1 data = fetch_sdss_galaxy_colors() N = len(data) # shuffle data np.random.seed(0) np.random.shuffle(data) # put colors in a matrix X = np.zeros((N, 4)) X[:, 0] = data['u'] - data['g'] X[:, 1] = data['g'] - data['r'] X[:, 2] = data['r'] - data['i'] X[:, 3] = data['i'] - data['z'] z = data['redshift'] # divide into training and testing data Ntrain = N / 2 Xtrain = X[:Ntrain] ztrain = z[:Ntrain] Xtest = X[Ntrain:] ztest = z[Ntrain:] knn = KNeighborsRegressor(n_neighbors, weights='uniform') zpred = knn.fit(Xtrain, ztrain).predict(Xtest) axis_lim = np.array([-0.1, 2.5]) rms = np.sqrt(np.mean((ztest - zpred) ** 2)) print "RMS error = %.2g" % rms ax = plt.axes() plt.scatter(ztest, zpred, c='k', lw=0, s=4) plt.plot(axis_lim, axis_lim, '--k') plt.plot(axis_lim, axis_lim + rms, ':k') plt.plot(axis_lim, axis_lim - rms, ':k') plt.xlim(axis_lim) plt.ylim(axis_lim) plt.text(0.99, 0.02, "RMS error = %.2g" % rms, ha='right', va='bottom', transform=ax.transAxes, bbox=dict(ec='w', fc='w'), fontsize=16) plt.title('Photo-z: Nearest Neigbor Regression', fontsize=20) plt.xlabel(r'$\mathrm{z_{spec}}$', fontsize=24) plt.ylabel(r'$\mathrm{z_{phot}}$', fontsize=24) plt.show() import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm from astroML.density_estimation import GaussianMixture1D #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=18, usetex=True) #------------------------------------------------------------ # Generate the data mu1_in = 0 sigma1_in = 0.3 mu2_in = 1 sigma2_in = 1 ratio_in = 1.5 N = 200 np.random.seed(10) gm = GaussianMixture1D([mu1_in, mu2_in], [sigma1_in, sigma2_in], [ratio_in, 1]) x_sample = gm.sample(N) #------------------------------------------------------------ # Get the MLE fit for a single gaussian sample_mu = np.mean(x_sample) sample_std = np.std(x_sample, ddof=1) #------------------------------------------------------------ # Plot the sampled data fig, ax = plt.subplots(figsize=(10, 6)) ax.hist(x_sample, 20, histtype='stepfilled', normed=True, fc='#CCCCCC') x = np.linspace(-2.1, 4.1, 1000) factor1 = ratio_in / (1. + ratio_in) factor2 = 1. / (1. + ratio_in) ax.plot(x, gm.pdf(x), '-k', label='true distribution') ax.plot(x, gm.pdf_individual(x), ':k') ax.plot(x, norm.pdf(x, sample_mu, sample_std), '--k', label='best fit normal') ax.legend(loc=1) ax.set_xlim(-2.1, 4.1) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') ax.set_title('Input pdf and sampled data') ax.text(0.95, 0.80, ('$\mu_1 = 0;\ \sigma_1=0.3$\n' '$\mu_2=1;\ \sigma_2=1.0$\n' '$\mathrm{ratio}=1.5$'), transform=ax.transAxes, ha='right', va='top') plt.show()