%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
# If this causes an error, you can comment it out.
import seaborn as sns
sns.set()

from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X, y = faces.data, faces.target

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestClassifier(), X, y, cv=10)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Use our plot_with_err utility routine from the lecture
def plot_with_err(x, data, color, **kwargs):
    """Plot the mean of `data` against x, with a +/- 1 std band."""
    mu, std = data.mean(1), data.std(1)
    plt.plot(x, mu, '-', c=color, **kwargs)
    plt.fill_between(x, mu - std, mu + std, edgecolor='none',
                     facecolor=color, alpha=0.2)

# Validation curve: score as a function of the forest's max_depth
from sklearn.model_selection import validation_curve

max_depths = np.arange(1, 20, 2)
val_train, val_test = validation_curve(RandomForestClassifier(), X, y,
                                       param_name='max_depth',
                                       param_range=max_depths, cv=5)
plt.figure()
plot_with_err(max_depths, val_train, 'red', label='train')
plot_with_err(max_depths, val_test, 'blue', label='test')
plt.xlabel('max_depth')
plt.ylabel('score')
plt.legend(loc='best');

# Learning curve for a forest at the depth suggested by the curve above
from sklearn.model_selection import learning_curve

clf = RandomForestClassifier(max_depth=9)
train_sizes = np.linspace(0.05, 1, 20)
N_train, val_train, val_test = learning_curve(clf, X, y,
                                              train_sizes=train_sizes)
plt.figure()
plot_with_err(N_train, val_train, 'r', label='training scores')
plot_with_err(N_train, val_test, 'b', label='validation scores')
plt.xlabel('Training Set Size')
plt.ylabel('score')
plt.legend();

# Try a support vector classifier on the raw pixels...
from sklearn.svm import SVC
scores = cross_val_score(SVC(), X, y, cv=3)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# ...and on a 100-dimensional PCA projection of the data
from sklearn.decomposition import PCA
X_proj = PCA(n_components=100).fit_transform(X)
scores = cross_val_score(SVC(), X_proj, y, cv=3)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Validation curve over the SVC regularization parameter C
C = 10 ** np.linspace(-9, -4, 20)
val_train, val_test = validation_curve(SVC(kernel='linear'), X_proj, y,
                                       param_name='C', param_range=C, cv=5)
plt.figure()
plt.axes(xscale='log')
plot_with_err(C, val_train, 'red', label='train')
plot_with_err(C, val_test, 'blue', label='test')
plt.xlabel('C')
plt.ylabel('score')
plt.legend(loc='best');

# Compute the score on the full dataset using the optimal value of C:
scores = cross_val_score(SVC(kernel='linear', C=1E-6), X_proj, y, cv=3)
print("projected score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

scores = cross_val_score(SVC(kernel='linear', C=1E-6), X, y, cv=3)
print("full score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Compute the learning curve for this value of C
train_sizes = np.linspace(0.05, 1, 20)
N_train, val_train, val_test = learning_curve(SVC(kernel='linear', C=1E-6),
                                              X_proj, y,
                                              train_sizes=train_sizes)
plt.figure()
plot_with_err(N_train, val_train, 'r', label='training scores')
plot_with_err(N_train, val_test, 'b', label='validation scores')
plt.xlabel('Training Set Size')
plt.ylabel('score')
plt.legend();
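
# Aside (a sketch, not part of the original exercise): rather than reading
# the best C off the validation curve by eye, GridSearchCV can search the
# same range automatically. This assumes X_proj and y from above.
from sklearn.model_selection import GridSearchCV

param_grid = {'C': 10 ** np.linspace(-9, -4, 20)}
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
grid.fit(X_proj, y)
print("best params:", grid.best_params_)
print("best CV score = {0:.2f}".format(grid.best_score_))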
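
# Aside (a sketch, not the original solution): the PCA above was fit on the
# full dataset before cross-validation, so the projection "sees" the test
# folds. Wrapping PCA and SVC in a Pipeline refits the projection inside
# each training fold only, giving a leakage-free estimate of the same model.
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PCA(n_components=100), SVC(kernel='linear', C=1E-6))
scores = cross_val_score(pipe, X, y, cv=3)
print("pipeline score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))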