%pylab inline from sklearn import datasets lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4, data_home='datasets') lfw_people.data.shape !ls datasets !du -sh datasets/lfw_home fig = plt.figure(figsize=(8, 6)) # plot several images for i in range(15): ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[]) ax.imshow(lfw_people.images[i], cmap=plt.cm.bone) plt.figure(figsize=(10, 2)) unique_targets = np.unique(lfw_people.target) counts = [(lfw_people.target == i).sum() for i in unique_targets] plt.xticks(unique_targets, lfw_people.target_names[unique_targets]) locs, labels = plt.xticks() plt.setp(labels, rotation=45, size=14) _ = plt.bar(unique_targets, counts) from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(lfw_people.data, lfw_people.target, random_state=0) print X_train.shape, X_test.shape from sklearn import decomposition pca = decomposition.RandomizedPCA(n_components=150, whiten=True) pca.fit(X_train) plt.imshow(pca.mean_.reshape((50, 37)), cmap=plt.cm.bone) print pca.components_.shape fig = plt.figure(figsize=(16, 6)) for i in range(30): ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[]) ax.imshow(pca.components_[i].reshape((50, 37)), cmap=plt.cm.bone) X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print X_train_pca.shape print X_test_pca.shape from sklearn import svm clf = svm.SVC(C=5., gamma=0.001) clf.fit(X_train_pca, y_train) fig = plt.figure(figsize=(8, 6)) for i in range(15): ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[]) ax.imshow(X_test[i].reshape((50, 37)), cmap=plt.cm.bone) y_pred = clf.predict(X_test_pca[i])[0] color = 'black' if y_pred == y_test[i] else 'red' ax.set_title(lfw_people.target_names[y_pred], fontsize='small', color=color) from sklearn import metrics y_pred = clf.predict(X_test_pca) print(metrics.classification_report(y_test, y_pred, target_names=lfw_people.target_names)) print(metrics.confusion_matrix(y_test, y_pred)) print(metrics.f1_score(y_test, y_pred)) from sklearn.pipeline import Pipeline clf = Pipeline([('pca', decomposition.RandomizedPCA(n_components=150, whiten=True)), ('svm', svm.LinearSVC(C=1.0))]) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print metrics.confusion_matrix(y_pred, y_test)