# Photometric-redshift demo, part 1: load SDSS magnitudes, build color
# features, and study decision-tree regression error vs. max_depth.
#
# NOTE(review): this file is a linearized IPython notebook. The original
# "%pylab inline" magic is not valid plain Python; it is kept commented and
# replaced with the explicit imports it provided (np, plt).
# %pylab inline
import numpy as np
import matplotlib.pyplot as plt

from figures import plot_sdss_filters
plot_sdss_filters()

from figures import plot_redshifts
plot_redshifts()

from datasets import fetch_sdss_galaxy_mags

# This will download a ~3MB file the first time you call the function
data = fetch_sdss_galaxy_mags()
print(data.shape)
print(data.dtype)

redshift = data['redshift']
mags = np.vstack([data[f] for f in 'ugriz']).transpose()
# Adjacent-band colors: u-g, g-r, r-i, i-z
colors = mags[:, :-1] - mags[:, 1:]
print(colors.shape)

# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# utilities live in model_selection. Aliased so existing references work.
from sklearn import model_selection as cross_validation

ctrain, ctest, ztrain, ztest = cross_validation.train_test_split(colors, redshift)

from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor()
print(cross_validation.cross_val_score(clf, colors, redshift, cv=4))


# We'll use this function several times below.
# NOTE(review): this deliberately shadows the plot_redshifts imported from
# `figures` above, which was only needed for the single call made earlier.
def plot_redshifts(ztrue, zpred):
    """Scatter-plot the true vs predicted redshifts.

    Parameters
    ----------
    ztrue : array-like
        True redshift values.
    zpred : array-like
        Predicted redshift values, same length as `ztrue`.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(ztrue, zpred, '.k')

    # plot trend lines, +/- 0.2 in z
    # (original comment said 0.1, but the plotted offsets are 0.2)
    ax.plot([0, 3], [0, 3], '--k')
    ax.plot([0, 3], [0.2, 3.2], ':k')
    ax.plot([0.2, 3.2], [0, 3], ':k')

    ax.text(2.9, 0.1,
            "RMS = %.2g" % np.sqrt(np.mean((ztrue - zpred) ** 2)),
            ha='right', va='bottom')

    ax.set_xlim(0, 3)
    ax.set_ylim(0, 3)
    ax.set_xlabel('True redshift')
    ax.set_ylabel('Predicted redshift')


clf = DecisionTreeRegressor()
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)

# we'll explore results for max_depth from 1 to 20
max_depth_array = np.arange(1, 21)
train_error = np.zeros(len(max_depth_array))
test_error = np.zeros(len(max_depth_array))

for i, max_depth in enumerate(max_depth_array):
    clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(ctrain, ztrain)

    ztrain_pred = clf.predict(ctrain)
    ztest_pred = clf.predict(ctest)

    # root-mean-square error on the training and held-out sets
    train_error[i] = np.sqrt(np.mean((ztrain_pred - ztrain) ** 2))
    test_error[i] = np.sqrt(np.mean((ztest_pred - ztest) ** 2))

plt.plot(max_depth_array, train_error, label='training')
plt.plot(max_depth_array, test_error, label='validation')
plt.grid()
plt.legend(loc=3)
# Photometric-redshift demo, part 2: grid-search max_depth, refit at the
# best depth, and study the catastrophic-outlier fraction vs. max_depth.
plt.xlabel('max_depth')
plt.ylabel('error')

# sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in model_selection.
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeRegressor()
grid = GridSearchCV(clf, param_grid=dict(max_depth=max_depth_array))
grid.fit(colors, redshift)
print(grid.best_params_)

# Refit with the best depth found by the grid search and visualize it.
clf = DecisionTreeRegressor(max_depth=7)
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)


def outlier_fraction(y_pred, y_true, cutoff=0.2):
    """Return the fraction of predictions more than `cutoff` from the truth.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and true values, same length.
    cutoff : float, optional
        Absolute-error threshold beyond which a prediction is an outlier.
    """
    return np.sum((abs(y_pred - y_true) > cutoff)) * 1. / len(y_pred)


train_outfrac = np.zeros(len(max_depth_array))
test_outfrac = np.zeros(len(max_depth_array))

for i, max_depth in enumerate(max_depth_array):
    clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(ctrain, ztrain)

    ztrain_pred = clf.predict(ctrain)
    ztest_pred = clf.predict(ctest)

    train_outfrac[i] = outlier_fraction(ztrain_pred, ztrain)
    test_outfrac[i] = outlier_fraction(ztest_pred, ztest)

plt.plot(max_depth_array, train_outfrac)
plt.plot(max_depth_array, test_outfrac)
plt.grid()

# A deep (max_depth=20) tree badly overfits; compare against the depth-7 fit.
clf = DecisionTreeRegressor(max_depth=20)
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)