# Photometric-redshift demo, part 1: load SDSS magnitudes, build color
# features, and study decision-tree regression error vs. max_depth.
#
# NOTE(review): this file is a linearized IPython notebook. The original
# "%pylab inline" magic is not valid plain Python; it is kept commented and
# replaced with the explicit imports it provided (np, plt).
# %pylab inline
import numpy as np
import matplotlib.pyplot as plt

from figures import plot_sdss_filters
plot_sdss_filters()

from figures import plot_redshifts
plot_redshifts()

from datasets import fetch_sdss_galaxy_mags

# This will download a ~3MB file the first time you call the function
data = fetch_sdss_galaxy_mags()
print(data.shape)
print(data.dtype)

redshift = data['redshift']
mags = np.vstack([data[f] for f in 'ugriz']).transpose()
# Adjacent-band colors: u-g, g-r, r-i, i-z
colors = mags[:, :-1] - mags[:, 1:]
print(colors.shape)

# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# utilities live in model_selection. Aliased so existing references work.
from sklearn import model_selection as cross_validation

ctrain, ctest, ztrain, ztest = cross_validation.train_test_split(colors, redshift)

from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor()
print(cross_validation.cross_val_score(clf, colors, redshift, cv=4))


# We'll use this function several times below.
# NOTE(review): this deliberately shadows the plot_redshifts imported from
# `figures` above, which was only needed for the single call made earlier.
def plot_redshifts(ztrue, zpred):
    """Scatter-plot the true vs predicted redshifts.

    Parameters
    ----------
    ztrue : array-like
        True redshift values.
    zpred : array-like
        Predicted redshift values, same length as `ztrue`.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(ztrue, zpred, '.k')

    # plot trend lines, +/- 0.2 in z
    # (original comment said 0.1, but the plotted offsets are 0.2)
    ax.plot([0, 3], [0, 3], '--k')
    ax.plot([0, 3], [0.2, 3.2], ':k')
    ax.plot([0.2, 3.2], [0, 3], ':k')

    ax.text(2.9, 0.1,
            "RMS = %.2g" % np.sqrt(np.mean((ztrue - zpred) ** 2)),
            ha='right', va='bottom')

    ax.set_xlim(0, 3)
    ax.set_ylim(0, 3)
    ax.set_xlabel('True redshift')
    ax.set_ylabel('Predicted redshift')


clf = DecisionTreeRegressor()
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)

# we'll explore results for max_depth from 1 to 20
max_depth_array = np.arange(1, 21)
train_error = np.zeros(len(max_depth_array))
test_error = np.zeros(len(max_depth_array))

for i, max_depth in enumerate(max_depth_array):
    clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(ctrain, ztrain)

    ztrain_pred = clf.predict(ctrain)
    ztest_pred = clf.predict(ctest)

    # root-mean-square error on the training and held-out sets
    train_error[i] = np.sqrt(np.mean((ztrain_pred - ztrain) ** 2))
    test_error[i] = np.sqrt(np.mean((ztest_pred - ztest) ** 2))

plt.plot(max_depth_array, train_error, label='training')
plt.plot(max_depth_array, test_error, label='validation')
plt.grid()
plt.legend(loc=3)
# Photometric-redshift demo, part 2: grid-search max_depth, refit at the
# best depth, and study the catastrophic-outlier fraction vs. max_depth.
plt.xlabel('max_depth')
plt.ylabel('error')

# sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in model_selection.
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeRegressor()
grid = GridSearchCV(clf, param_grid=dict(max_depth=max_depth_array))
grid.fit(colors, redshift)
print(grid.best_params_)

# Refit with the best depth found by the grid search and visualize it.
clf = DecisionTreeRegressor(max_depth=7)
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)


def outlier_fraction(y_pred, y_true, cutoff=0.2):
    """Return the fraction of predictions more than `cutoff` from the truth.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and true values, same length.
    cutoff : float, optional
        Absolute-error threshold beyond which a prediction is an outlier.
    """
    return np.sum((abs(y_pred - y_true) > cutoff)) * 1. / len(y_pred)


train_outfrac = np.zeros(len(max_depth_array))
test_outfrac = np.zeros(len(max_depth_array))

for i, max_depth in enumerate(max_depth_array):
    clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(ctrain, ztrain)

    ztrain_pred = clf.predict(ctrain)
    ztest_pred = clf.predict(ctest)

    train_outfrac[i] = outlier_fraction(ztrain_pred, ztrain)
    test_outfrac[i] = outlier_fraction(ztest_pred, ztest)

plt.plot(max_depth_array, train_outfrac)
plt.plot(max_depth_array, test_outfrac)
plt.grid()

# A deep (max_depth=20) tree badly overfits; compare against the depth-7 fit.
clf = DecisionTreeRegressor(max_depth=20)
clf.fit(ctrain, ztrain)
zpred = clf.predict(ctest)
plot_redshifts(ztest, zpred)