## get the data locally ... I put this on a gist
!curl -k -O https://gist.githubusercontent.com/anonymous/53781fe86383c435ff10/raw/4cc80a638e8e083775caec3005ae2feaf92b8d5b/qso10000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/964d4f52c989428628d42eb6faad5e212e79b665/star1000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/335cd1953e72f6c7cafa9ebb81b43c47cb757a9d/galaxy1000.csv

# For pretty plotting
!pip install --upgrade seaborn

import pandas as pd
pd.set_option('display.max_columns', None)
%pylab inline
import seaborn as sns
sns.set()
import copy

pd.read_csv("qso10000.csv", index_col=0).head()

qsos = pd.read_csv("qso10000.csv", index_col=0,
                   usecols=["objid", "dered_r", "spec_z", "u_g_color",
                            "g_r_color", "r_i_color", "i_z_color", "diff_u",
                            "diff_g1", "diff_i", "diff_z"])
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]
qso_features.head()

bins = hist(qso_redshifts.values, bins=100) ; xlabel("redshift") ; ylabel("N")

import matplotlib as mpl
import matplotlib.cm as cm

## truncate the color at z=2.5 just to keep some contrast.
norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
cmap = cm.jet_r
m = cm.ScalarMappable(norm=norm, cmap=cmap)
rez = pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                        color=m.to_rgba(qso_redshifts.values))

min(qso_features["dered_r"].values)

# re-read the QSO catalog and drop rows with sentinel/bogus values
qsos = pd.read_csv("qso10000.csv", index_col=0,
                   usecols=["objid", "dered_r", "spec_z", "u_g_color",
                            "g_r_color", "r_i_color", "i_z_color", "diff_u",
                            "diff_g1", "diff_i", "diff_z"])
qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]

rez = pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                        color=m.to_rgba(qso_redshifts.values))

qsos.to_csv("qsos.clean.csv")

X = qso_features.values   # 9-d feature space
Y = qso_redshifts.values  # redshifts
print "feature vector shape=", X.shape
print "class shape=", Y.shape

# split the data in half: first half for training, second half for testing
half = len(Y) // 2
train_X = X[:half]
train_Y = Y[:half]
test_X = X[half:]
test_Y = Y[half:]

from sklearn import linear_model
clf = linear_model.LinearRegression()

# fit the model
clf.fit(train_X, train_Y)

# now do the prediction
Y_lr_pred = clf.predict(test_X)

# how well did we do?
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_Y, Y_lr_pred) ; print "MSE", mse

plot(test_Y, Y_lr_pred - test_Y, 'o', alpha=0.2)
title("Linear Regression Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

# here's the MSE guessing the AVERAGE value
print "naive mse", ((1. / len(train_Y)) * (train_Y - train_Y.mean())**2).sum()

mean_squared_error?
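The "naive mse" above is just the variance of the training redshifts. As a sanity check, a baseline model that always predicts the training mean should give roughly the same number; a minimal sketch (not from the original notebook) using scikit-learn's DummyRegressor, assuming the train/test arrays defined above:

# Illustrative baseline (an assumption, not part of the original notebook):
# DummyRegressor by default always predicts the mean of train_Y.
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor()
dummy.fit(train_X, train_Y)
Y_dummy_pred = dummy.predict(test_X)
print "baseline MSE", mean_squared_error(test_Y, Y_dummy_pred)

# R^2 of the fitted linear model on the test set
# (1.0 is perfect; 0.0 is no better than predicting the mean)
print "linear regression R^2", clf.score(test_X, test_Y)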
from sklearn import neighbors
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)  # many methods work better on scaled X

clf1 = neighbors.KNeighborsRegressor(5)
train_X = X_scaled[:half]
test_X = X_scaled[half:]
clf1.fit(train_X, train_Y)
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y, Y_knn_pred) ; print "MSE (KNN)", mse

plot(test_Y, Y_knn_pred - test_Y, 'o', alpha=0.2)
title("k-NN Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

# same again, but with k=10 neighbors
from sklearn import neighbors
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)  # many methods work better on scaled X
train_X = X_scaled[:half]
train_Y = Y[:half]
test_X = X_scaled[half:]
test_Y = Y[half:]

clf1 = neighbors.KNeighborsRegressor(10)
clf1.fit(train_X, train_Y)
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y, Y_knn_pred) ; print mse

scatter(test_Y, Y_knn_pred - test_Y, alpha=0.2)
title("k-NN Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None,
                             min_samples_split=2, min_samples_leaf=1,
                             max_features='auto', max_leaf_nodes=None,
                             bootstrap=True, oob_score=False, n_jobs=1,
                             random_state=None, verbose=0)
clf2.fit(train_X, train_Y)
Y_rf_pred = clf2.predict(test_X)
mse = mean_squared_error(test_Y, Y_rf_pred) ; print mse

scatter(test_Y, Y_rf_pred - test_Y, alpha=0.2)
title("RF Residuals - MSE = %.1f" % mse)
xlabel("Spectroscopic Redshift")
ylabel("Residual")
hlines(0, min(test_Y), max(test_Y), color="red")

from sklearn import cross_validation
from sklearn import linear_model
clf = linear_model.LinearRegression()

from sklearn.cross_validation import cross_val_score

def print_cv_score_summary(model, xx, yy, cv):
    scores = cross_val_score(model, xx, yy, cv=cv, n_jobs=1)
    print("mean: {:.3f}, stdev: {:.3f}".format(np.mean(scores), np.std(scores)))

print_cv_score_summary(clf, X, Y, cv=cross_validation.KFold(len(Y), 5))
print_cv_score_summary(clf, X, Y,
                       cv=cross_validation.KFold(len(Y), 10, shuffle=True, random_state=1))
print_cv_score_summary(clf2, X, Y,
                       cv=cross_validation.KFold(len(Y), 10, shuffle=True, random_state=1))

# build a combined QSO / star / galaxy catalog for classification
all_sources = pd.read_csv("qso10000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"])[:1000]
all_sources = all_sources.append(pd.read_csv("star1000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"]))
all_sources = all_sources.append(pd.read_csv("galaxy1000.csv", index_col=0,
                          usecols=["objid", "dered_r", "u_g_color",
                                   "g_r_color", "r_i_color", "i_z_color", "diff_u",
                                   "diff_g1", "diff_i", "diff_z", "class"]))
all_sources = all_sources[(all_sources["dered_r"] > -9999) &
                          (all_sources["g_r_color"] > -10) & (all_sources["g_r_color"] < 10)]

all_features = copy.copy(all_sources)
all_label = all_sources["class"]
del all_features["class"]

X = copy.copy(all_features.values)
Y = copy.copy(all_label.values)

all_sources.tail()

print "feature vector shape=", X.shape
print "class shape=", Y.shape

# encode the string class labels as integers
Y[Y == "QSO"] = 0
Y[Y == "STAR"] = 1
Y[Y == "GALAXY"] = 2
Y = Y.astype(int)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, oob_score=True)
clf.fit(X, Y)
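The manual label mapping above works, but scikit-learn's LabelEncoder does the same job and remembers the mapping so it can be inverted later. A small illustrative sketch (not from the original notebook; note that LabelEncoder assigns codes alphabetically, so they would differ from the hand-coded 0/1/2 above):

# Hypothetical alternative to the manual Y[Y=="QSO"] = 0 ... mapping.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
Y_encoded = le.fit_transform(all_label.values)   # e.g. GALAXY/QSO/STAR -> 0/1/2 (alphabetical order)
print le.classes_                                # string label for each integer code
print le.inverse_transform([0, 1, 2])            # map codes back to class names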
sorted(zip(all_sources.columns.values, clf.feature_importances_),
       key=lambda q: q[1], reverse=True)

clf.oob_score_  ## "Out of Bag" score

import numpy as np
from sklearn import svm

cmap = cm.jet_r

# use only two features, for training and plotting purposes
X = all_features.values[:, 1:3]

h = .02  # step size in the mesh

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, Y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, Y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, Y)
lin_svc = svm.LinearSVC(C=C).fit(X, Y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# titles for the plots
titles = ['SVC with linear kernel',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel',
          'LinearSVC (linear kernel)']

norm = mpl.colors.Normalize(vmin=min(Y), vmax=max(Y))
m = cm.ScalarMappable(norm=norm, cmap=cmap)

for i, clf in enumerate((svc, rbf_svc, poly_svc, lin_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    subplot(2, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    contourf(xx, yy, Z, cmap=cm.Paired)
    axis('off')

    # Plot also the training points
    scatter(X[:, 0], X[:, 1], c=m.to_rgba(Y), cmap=cm.Paired)
    title(titles[i])

# fit a support vector machine classifier
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics

import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# instantiate the SVM object
sdss_svm = svm.SVC()
X = all_features.values
Y = all_label.values

# parameter values over which we will search
parameters = {'kernel': ('linear', 'rbf'),
              'gamma': [0.5, 0.3, 0.1, 0.01],
              'C': [0.1, 2, 4, 5, 10, 20, 30]}
#parameters = {'kernel':('linear', 'rbf')}

# do a grid search to find the highest 3-fold CV accuracy
svm_tune = grid_search.GridSearchCV(sdss_svm, parameters, scoring="accuracy",
                                    n_jobs=-1, cv=3, verbose=1)
svm_opt = svm_tune.fit(X, Y)

# print the best score and estimator
print(svm_opt.best_score_)
print(svm_opt.best_estimator_)

from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

classifier = svm.SVC(**svm_opt.best_estimator_.get_params())
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# instantiate the random forest object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values

# parameter values over which we will search
parameters = {'n_estimators': (10, 50, 200),
              'max_features': ["auto", 3, 5],
              'criterion': ["gini", "entropy"],
              'min_samples_leaf': [1, 2]}

# do a grid search to find the highest 3-fold CV accuracy
rf_tune = grid_search.GridSearchCV(sdss_rf, parameters, scoring="accuracy",
                                   n_jobs=-1, cv=3, verbose=1)
rf_opt = rf_tune.fit(X, Y)
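On the older sklearn.grid_search API used here, the fitted search object also keeps the mean cross-validation score for every parameter combination in grid_scores_. A quick sketch (an illustrative aside, assuming the rf_opt object just fitted) to see how sensitive the forest is to its settings, before printing the single best result below:

# Sketch: per-combination CV scores from the grid search just run.
# grid_scores_ is specific to the old sklearn.grid_search module used above.
for params, mean_score, cv_scores in rf_opt.grid_scores_:
    print "%0.3f (+/- %0.3f) for %r" % (mean_score, cv_scores.std(), params)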
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)

clf.get_params()

svm_opt.best_estimator_.get_params()

grid_search.GridSearchCV?
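As a rough end-to-end comparison (not in the original notebook), the two tuned classifiers can be scored on the same held-out split used for the SVM confusion matrix above; classification_report gives per-class precision and recall. The sketch below assumes the X_train/X_test/y_train/y_test arrays from the train_test_split cell:

# Illustrative comparison of the tuned SVM and tuned random forest
# on the same held-out split.
from sklearn.metrics import classification_report

for name, model in [("SVM", svm_opt.best_estimator_), ("RF", rf_opt.best_estimator_)]:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print name
    print classification_report(y_test, y_pred)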