%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
# If this causes an error, you can comment it out.
import seaborn as sns
sns.set()

from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X, y = faces.data, faces.target

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestClassifier(), X, y, cv=10)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Use our plot_with_err utility routine from the lecture
def plot_with_err(x, data, color, **kwargs):
    """Plot the mean of `data` against x, with a +/- 1 std band."""
    mu, std = data.mean(1), data.std(1)
    plt.plot(x, mu, '-', c=color, **kwargs)
    plt.fill_between(x, mu - std, mu + std, edgecolor='none',
                     facecolor=color, alpha=0.2)

# Validation curve: score as a function of the forest's max_depth
from sklearn.model_selection import validation_curve

max_depths = np.arange(1, 20, 2)
val_train, val_test = validation_curve(RandomForestClassifier(), X, y,
                                       param_name='max_depth',
                                       param_range=max_depths, cv=5)
plt.figure()
plot_with_err(max_depths, val_train, 'red', label='train')
plot_with_err(max_depths, val_test, 'blue', label='test')
plt.xlabel('max_depth')
plt.ylabel('score')
plt.legend(loc='best');

# Learning curve for a forest at the depth suggested by the curve above
from sklearn.model_selection import learning_curve

clf = RandomForestClassifier(max_depth=9)
train_sizes = np.linspace(0.05, 1, 20)
N_train, val_train, val_test = learning_curve(clf, X, y,
                                              train_sizes=train_sizes)
plt.figure()
plot_with_err(N_train, val_train, 'r', label='training scores')
plot_with_err(N_train, val_test, 'b', label='validation scores')
plt.xlabel('Training Set Size')
plt.ylabel('score')
plt.legend();

# Try a support vector classifier on the raw pixels...
from sklearn.svm import SVC
scores = cross_val_score(SVC(), X, y, cv=3)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# ...and on a 100-dimensional PCA projection of the data
from sklearn.decomposition import PCA
X_proj = PCA(n_components=100).fit_transform(X)
scores = cross_val_score(SVC(), X_proj, y, cv=3)
print("score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Validation curve over the SVC regularization parameter C
C = 10 ** np.linspace(-9, -4, 20)
val_train, val_test = validation_curve(SVC(kernel='linear'), X_proj, y,
                                       param_name='C', param_range=C, cv=5)
plt.figure()
plt.axes(xscale='log')
plot_with_err(C, val_train, 'red', label='train')
plot_with_err(C, val_test, 'blue', label='test')
plt.xlabel('C')
plt.ylabel('score')
plt.legend(loc='best');

# Compute the score on the full dataset using the optimal value of C:
scores = cross_val_score(SVC(kernel='linear', C=1E-6), X_proj, y, cv=3)
print("projected score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

scores = cross_val_score(SVC(kernel='linear', C=1E-6), X, y, cv=3)
print("full score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))

# Compute the learning curve for this value of C
train_sizes = np.linspace(0.05, 1, 20)
N_train, val_train, val_test = learning_curve(SVC(kernel='linear', C=1E-6),
                                              X_proj, y,
                                              train_sizes=train_sizes)
plt.figure()
plot_with_err(N_train, val_train, 'r', label='training scores')
plot_with_err(N_train, val_test, 'b', label='validation scores')
plt.xlabel('Training Set Size')
plt.ylabel('score')
plt.legend();
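
# Aside (a sketch, not part of the original exercise): rather than reading
# the best C off the validation curve by eye, GridSearchCV can search the
# same range automatically. This assumes X_proj and y from above.
from sklearn.model_selection import GridSearchCV

param_grid = {'C': 10 ** np.linspace(-9, -4, 20)}
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
grid.fit(X_proj, y)
print("best params:", grid.best_params_)
print("best CV score = {0:.2f}".format(grid.best_score_))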
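
# Aside (a sketch, not the original solution): the PCA above was fit on the
# full dataset before cross-validation, so the projection "sees" the test
# folds. Wrapping PCA and SVC in a Pipeline refits the projection inside
# each training fold only, giving a leakage-free estimate of the same model.
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PCA(n_components=100), SVC(kernel='linear', C=1E-6))
scores = cross_val_score(pipe, X, y, cv=3)
print("pipeline score = {0:.2f} +- {1:.2f}".format(scores.mean(), scores.std()))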