## get the data locally ... I put this on a gist
!curl -k -O https://gist.githubusercontent.com/anonymous/53781fe86383c435ff10/raw/4cc80a638e8e083775caec3005ae2feaf92b8d5b/qso10000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/964d4f52c989428628d42eb6faad5e212e79b665/star1000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/335cd1953e72f6c7cafa9ebb81b43c47cb757a9d/galaxy1000.csv

# For pretty plotting
!pip install --upgrade seaborn

import pandas as pd
pd.set_option('display.max_columns', None)
%pylab inline
import seaborn as sns
sns.set()
import copy

pd.read_csv("qso10000.csv", index_col=0).head()

qsos = pd.read_csv("qso10000.csv", index_col=0,
                   usecols=["objid", "dered_r", "spec_z", "u_g_color",
                            "g_r_color", "r_i_color", "i_z_color", "diff_u",
                            "diff_g1", "diff_i", "diff_z"])
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]
qso_features.head()

bins = hist(qso_redshifts.values, bins=100) ; xlabel("redshift") ; ylabel("N")

import matplotlib as mpl
import matplotlib.cm as cm

## truncate the color at z=2.5 just to keep some contrast.
norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
cmap = cm.jet_r
m = cm.ScalarMappable(norm=norm, cmap=cmap)
rez = pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                        color=m.to_rgba(qso_redshifts.values))

min(qso_features["dered_r"].values)

# re-read the QSO catalog and drop rows with sentinel/bogus values
qsos = pd.read_csv("qso10000.csv", index_col=0,
                   usecols=["objid", "dered_r", "spec_z", "u_g_color",
                            "g_r_color", "r_i_color", "i_z_color", "diff_u",
                            "diff_g1", "diff_i", "diff_z"])
qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]

rez = pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                        color=m.to_rgba(qso_redshifts.values))

qsos.to_csv("qsos.clean.csv")

X = qso_features.values   # 9-d feature space
Y = qso_redshifts.values  # redshifts
print "feature vector shape=", X.shape
print "class shape=", Y.shape

# split the data in half: first half for training, second half for testing
half = len(Y) // 2
train_X = X[:half]
train_Y = Y[:half]
test_X = X[half:]
test_Y = Y[half:]

from sklearn import linear_model
clf = linear_model.LinearRegression()

# fit the model
clf.fit(train_X, train_Y)

# now do the prediction
Y_lr_pred = clf.predict(test_X)

# how well did we do?
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_Y, Y_lr_pred) ; print "MSE", mse

plot(test_Y, Y_lr_pred - test_Y, 'o', alpha=0.2)
title("Linear Regression Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

# here's the MSE guessing the AVERAGE value
print "naive mse", ((1. / len(train_Y)) * (train_Y - train_Y.mean())**2).sum()

mean_squared_error?
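The "naive mse" above is just the variance of the training redshifts. As a sanity check, a baseline model that always predicts the training mean should give roughly the same number; a minimal sketch (not from the original notebook) using scikit-learn's DummyRegressor, assuming the train/test arrays defined above:

# Illustrative baseline (an assumption, not part of the original notebook):
# DummyRegressor by default always predicts the mean of train_Y.
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor()
dummy.fit(train_X, train_Y)
Y_dummy_pred = dummy.predict(test_X)
print "baseline MSE", mean_squared_error(test_Y, Y_dummy_pred)

# R^2 of the fitted linear model on the test set
# (1.0 is perfect; 0.0 is no better than predicting the mean)
print "linear regression R^2", clf.score(test_X, test_Y)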
from sklearn import neighbors
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)  # many methods work better on scaled X

clf1 = neighbors.KNeighborsRegressor(5)
train_X = X_scaled[:half]
test_X = X_scaled[half:]
clf1.fit(train_X, train_Y)
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y, Y_knn_pred) ; print "MSE (KNN)", mse

plot(test_Y, Y_knn_pred - test_Y, 'o', alpha=0.2)
title("k-NN Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

# same again, but with k=10 neighbors
from sklearn import neighbors
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)  # many methods work better on scaled X
train_X = X_scaled[:half]
train_Y = Y[:half]
test_X = X_scaled[half:]
test_Y = Y[half:]

clf1 = neighbors.KNeighborsRegressor(10)
clf1.fit(train_X, train_Y)
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y, Y_knn_pred) ; print mse

scatter(test_Y, Y_knn_pred - test_Y, alpha=0.2)
title("k-NN Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None,
                             min_samples_split=2, min_samples_leaf=1,
                             max_features='auto', max_leaf_nodes=None,
                             bootstrap=True, oob_score=False, n_jobs=1,
                             random_state=None, verbose=0)
clf2.fit(train_X, train_Y)
Y_rf_pred = clf2.predict(test_X)
mse = mean_squared_error(test_Y, Y_rf_pred) ; print mse

scatter(test_Y, Y_rf_pred - test_Y, alpha=0.2)
title("RF Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

from sklearn import cross_validation
from sklearn import linear_model
clf = linear_model.LinearRegression()

from sklearn.cross_validation import cross_val_score

def print_cv_score_summary(model, xx, yy, cv):
    scores = cross_val_score(model, xx, yy, cv=cv, n_jobs=1)
    print("mean: {:.3f}, stdev: {:.3f}".format(np.mean(scores), np.std(scores)))

print_cv_score_summary(clf, X, Y, cv=cross_validation.KFold(len(Y), 5))
print_cv_score_summary(clf, X, Y,
                       cv=cross_validation.KFold(len(Y), 10, shuffle=True, random_state=1))
print_cv_score_summary(clf2, X, Y,
                       cv=cross_validation.KFold(len(Y), 10, shuffle=True, random_state=1))

# build a combined QSO / star / galaxy catalog for classification
all_sources = pd.read_csv("qso10000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"])[:1000]
all_sources = all_sources.append(pd.read_csv("star1000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"]))
all_sources = all_sources.append(pd.read_csv("galaxy1000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"]))
all_sources = all_sources[(all_sources["dered_r"] > -9999) &
                          (all_sources["g_r_color"] > -10) & (all_sources["g_r_color"] < 10)]

all_features = copy.copy(all_sources)
all_label = all_sources["class"]
del all_features["class"]

X = copy.copy(all_features.values)
Y = copy.copy(all_label.values)

all_sources.tail()

print "feature vector shape=", X.shape
print "class shape=", Y.shape

# encode the string class labels as integers
Y[Y == "QSO"] = 0
Y[Y == "STAR"] = 1
Y[Y == "GALAXY"] = 2
Y = Y.astype(int)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, oob_score=True)
clf.fit(X, Y)
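The manual label mapping above works, but scikit-learn's LabelEncoder does the same job and remembers the mapping so it can be inverted later. A small illustrative sketch (not from the original notebook; note that LabelEncoder assigns codes alphabetically, so they would differ from the hand-coded 0/1/2 above):

# Hypothetical alternative to the manual Y[Y=="QSO"] = 0 ... mapping.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
Y_encoded = le.fit_transform(all_label.values)   # e.g. GALAXY/QSO/STAR -> 0/1/2 (alphabetical order)
print le.classes_                                # string label for each integer code
print le.inverse_transform([0, 1, 2])            # map codes back to class names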
sorted(zip(all_sources.columns.values, clf.feature_importances_),
       key=lambda q: q[1], reverse=True)

clf.oob_score_  ## "Out of Bag" score

import numpy as np
from sklearn import svm

cmap = cm.jet_r

# use only two features, for training and plotting purposes
X = all_features.values[:, 1:3]

h = .02  # step size in the mesh

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, Y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, Y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, Y)
lin_svc = svm.LinearSVC(C=C).fit(X, Y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# titles for the plots
titles = ['SVC with linear kernel',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel',
          'LinearSVC (linear kernel)']

norm = mpl.colors.Normalize(vmin=min(Y), vmax=max(Y))
m = cm.ScalarMappable(norm=norm, cmap=cmap)

for i, clf in enumerate((svc, rbf_svc, poly_svc, lin_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    subplot(2, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    contourf(xx, yy, Z, cmap=cm.Paired)
    axis('off')

    # Plot also the training points
    scatter(X[:, 0], X[:, 1], c=m.to_rgba(Y), cmap=cm.Paired)
    title(titles[i])

# fit a support vector machine classifier
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics

import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# instantiate the SVM object
sdss_svm = svm.SVC()
X = all_features.values
Y = all_label.values

# parameter values over which we will search
parameters = {'kernel': ('linear', 'rbf'),
              'gamma': [0.5, 0.3, 0.1, 0.01],
              'C': [0.1, 2, 4, 5, 10, 20, 30]}
#parameters = {'kernel':('linear', 'rbf')}

# do a grid search to find the highest 3-fold CV accuracy
svm_tune = grid_search.GridSearchCV(sdss_svm, parameters, scoring="accuracy",
                                    n_jobs=-1, cv=3, verbose=1)
svm_opt = svm_tune.fit(X, Y)

# print the best score and estimator
print(svm_opt.best_score_)
print(svm_opt.best_estimator_)

from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

classifier = svm.SVC(**svm_opt.best_estimator_.get_params())
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# instantiate the random forest object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values

# parameter values over which we will search
parameters = {'n_estimators': (10, 50, 200),
              'max_features': ["auto", 3, 5],
              'criterion': ["gini", "entropy"],
              'min_samples_leaf': [1, 2]}

# do a grid search to find the highest 3-fold CV accuracy
rf_tune = grid_search.GridSearchCV(sdss_rf, parameters, scoring="accuracy",
                                   n_jobs=-1, cv=3, verbose=1)
rf_opt = rf_tune.fit(X, Y)
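On the older sklearn.grid_search API used here, the fitted search object also keeps the mean cross-validation score for every parameter combination in grid_scores_. A quick sketch (an illustrative aside, assuming the rf_opt object just fitted) to see how sensitive the forest is to its settings, before printing the single best result below:

# Sketch: per-combination CV scores from the grid search just run.
# grid_scores_ is specific to the old sklearn.grid_search module used above.
for params, mean_score, cv_scores in rf_opt.grid_scores_:
    print "%0.3f (+/- %0.3f) for %r" % (mean_score, cv_scores.std(), params)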
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)

clf.get_params()

svm_opt.best_estimator_.get_params()

grid_search.GridSearchCV?
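As a rough end-to-end comparison (not in the original notebook), the two tuned classifiers can be scored on the same held-out split used for the SVM confusion matrix above; classification_report gives per-class precision and recall. The sketch below assumes the X_train/X_test/y_train/y_test arrays from the train_test_split cell:

# Illustrative comparison of the tuned SVM and tuned random forest
# on the same held-out split.
from sklearn.metrics import classification_report

for name, model in [("SVM", svm_opt.best_estimator_), ("RF", rf_opt.best_estimator_)]:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print name
    print classification_report(y_test, y_pred)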