# Notebook-derived walkthrough: fit an SVC on the scikit-learn digits data and
# evaluate it with a hold-out split, shuffle-split cross-validation, a manual
# sweep over gamma and finally a grid search.
#
# Bare expressions (e.g. `svc.score(X, y)`) are notebook cell outputs: they
# are displayed when run cell-by-cell and are no-ops when run as a script.
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import Image, HTML
from sklearn import model_selection
from sklearn.datasets import load_digits
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.svm import SVC

# IPython-only setup, spelled as the plain-Python calls that the magics
# `%matplotlib inline`, `%load_ext load_style` and
# `%load_style ./styles/talk.css` expand to (requires an IPython session).
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'load_style')
get_ipython().run_line_magic('load_style', './styles/talk.css')

Image(url='http://scikit-learn.org/stable/_static/ml_map.png', width=900)
HTML('')

# --- Load the digits dataset -------------------------------------------------
digits = load_digits()
print(digits.DESCR)

X, y = digits.data, digits.target
print("data shape: %r, target shape: %r" % (X.shape, y.shape))
print("labels: %r" % list(np.unique(y)))


def plot_gallery(data, labels, shape, interpolation='nearest'):
    """Show every row of *data* as an image in a single row of subplots.

    Parameters
    ----------
    data : array
        One flattened image per row; row ``i`` is reshaped to *shape*.
    labels : sequence
        Title for each image; ``labels[i]`` is used for row ``i``.
    shape : tuple
        Target shape passed to ``reshape`` for each row.
    interpolation : str
        Forwarded to ``imshow``.

    The number of panels follows ``data.shape[0]`` (it used to be fixed at
    five, which failed for more than five images). Nothing is drawn when
    *data* has no rows.
    """
    n_images = data.shape[0]
    if n_images == 0:
        # plt.subplots() rejects a zero-column grid.
        return

    # squeeze=False keeps a 2-D array of axes even for a single image.
    f, axes = plt.subplots(1, n_images, figsize=(16, 5), squeeze=False)
    for i, panel in enumerate(axes[0]):
        panel.imshow(data[i].reshape(shape), interpolation=interpolation,
                     cmap=plt.cm.gray_r)
        panel.set_title(labels[i])
        panel.set_xticks(())
        panel.set_yticks(())


# Show five randomly chosen digits together with their true labels.
subsample = np.random.permutation(X.shape[0])[:5]
images = X[subsample]
labels = ['True label: %d' % l for l in y[subsample]]
plot_gallery(images, labels, shape=(8, 8))

# --- Fit and score on the very same data -------------------------------------
svc = SVC()
svc.fit(X, y)
svc.score(X, y)

y_hat = svc.predict(X)
np.all(y_hat == y)  # np.alltrue was removed in NumPy 2.0

Image(filename='Images/www.png')

# --- Hold-out evaluation -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)
print("train data shape: %r, train target shape: %r"
      % (X_train.shape, y_train.shape))
print("test data shape: %r, test target shape: %r"
      % (X_test.shape, y_test.shape))

svc = SVC().fit(X_train, y_train)
train_score = svc.score(X_train, y_train)
train_score
test_score = svc.score(X_test, y_test)
test_score

svc_2 = SVC(C=100, gamma=0.001).fit(X_train, y_train)
svc_2
svc_2.score(X_train, y_train)
svc_2.score(X_test, y_test)
# Accuracy computed by hand: the fraction of matching predictions.
np.mean(svc_2.predict(X_test) == y_test)

# --- Shuffle-split cross-validation ------------------------------------------
# In the notebook this spot demonstrated tab completion (`model_selection.<TAB>`)
# followed by the `?` help lookup below.
get_ipython().run_line_magic('pinfo', 'model_selection.ShuffleSplit')

cv = model_selection.ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for cv_index, (train, test) in enumerate(cv.split(X)):
    print("# Cross Validation Iteration #%d" % cv_index)
    print("train indices: {0}...".format(train[:10]))
    print("test indices: {0}...".format(test[:10]))
    svc = SVC(C=100, gamma=0.001).fit(X[train], y[train])
    print("train score: {0:.3f}, test score: {1:.3f}\n".format(
        svc.score(X[train], y[train]), svc.score(X[test], y[test])))

svc = SVC(C=100, gamma=0.001)
cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
# n_jobs = 4 if you have a quad-core machine ...
test_scores = cross_val_score(svc, X, y, cv=cv, n_jobs=4)
test_scores

# --- Manual sweep over gamma -------------------------------------------------
n_iter = 5  # the number of iterations should be more than that ...
gammas = np.logspace(-7, -1, 10)  # should be more fine grained ...
C = 1  # held fixed for the whole sweep; also used in the axis label below

# random_state is left unset, so every call to cv.split() draws new splits.
cv = model_selection.ShuffleSplit(n_splits=n_iter, test_size=0.2)
train_scores = np.zeros((len(gammas), n_iter))
test_scores = np.zeros((len(gammas), n_iter))
for i, gamma in enumerate(gammas):
    for j, (train, test) in enumerate(cv.split(X)):
        clf = SVC(C=C, gamma=gamma).fit(X[train], y[train])
        train_scores[i, j] = clf.score(X[train], y[train])
        test_scores[i, j] = clf.score(X[test], y[test])

f, ax = plt.subplots(figsize=(12, 8))
# Optional: draw the curve of each individual split as well.
# for i in range(n_iter):
#     ax.semilogx(gammas, train_scores[:, i], alpha=0.2, lw=2, c='b')
#     ax.semilogx(gammas, test_scores[:, i], alpha=0.2, lw=2, c='g')

# Mean score per gamma, with a band spanning the min/max over the splits.
ax.semilogx(gammas, test_scores.mean(1), lw=4, c='g', label='test score')
ax.semilogx(gammas, train_scores.mean(1), lw=4, c='b', label='train score')
ax.fill_between(gammas, train_scores.min(1), train_scores.max(1),
                color='b', alpha=0.2)
ax.fill_between(gammas, test_scores.min(1), test_scores.max(1),
                color='g', alpha=0.2)

# Raw strings keep the LaTeX backslashes without invalid-escape warnings.
ax.set_ylabel(r"score for SVC(C=%4.2f, $\gamma=\gamma$)" % (C), fontsize=16)
ax.set_xlabel(r"$\gamma$", fontsize=16)

# Annotate the gamma with the highest mean test score.
best_gamma = gammas[np.argmax(test_scores.mean(1))]
best_score = test_scores.mean(1).max()
ax.text(best_gamma, best_score + 0.05,
        r"$\gamma$ = %6.4f | score=%6.4f" % (best_gamma, best_score),
        fontsize=15, bbox=dict(facecolor='w', alpha=0.5))

for axis in (ax.xaxis, ax.yaxis):
    for tick_label in axis.get_ticklabels():
        tick_label.set_fontsize(16)
ax.legend(fontsize=16, loc=0)
ax.set_ylim(0, 1.1)

# --- Grid search over C and gamma --------------------------------------------
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
}
gs_svc = GridSearchCV(SVC(), svc_params, cv=3, n_jobs=4)
gs_svc.fit(X, y)
gs_svc.best_params_, gs_svc.best_score_