Based on a combination of the SciPy 2013 scikit-learn tutorials and the SciPy lecture notes.
This example requires the PIL imaging library. Install it with `pip install Pillow`
(the maintained fork, recommended), or `pip install PIL` for the legacy package.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Score function from slides
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
def score(clf, X, Y, folds=2, verbose=False, metric=accuracy_score):
    """Cross-validate ``clf`` and gather its held-out predictions.

    The samples are split into ``folds`` shuffled K-fold partitions. For each
    partition the classifier is refit on the training part and its predictions
    for the held-out part are written into one array aligned with ``Y``.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X, Y : arrays indexable by integer index arrays (samples and labels).
    folds : number of K-fold partitions.
    verbose : when true, print the accuracy of every fold (always accuracy,
        independent of ``metric``).
    metric : callable ``metric(y_true, y_pred)``; pass ``None`` to get the
        raw labels and predictions back instead of a score.

    Returns
    -------
    ``metric(Y, predictions)`` when ``metric`` is given, otherwise the tuple
    ``(Y, predictions)``.
    """
    # np.zeros defaults to float, so predicted labels are stored as floats.
    held_out = np.zeros(len(Y))
    splitter = KFold(len(X), n_folds=folds, shuffle=True)

    for fold_no, (train_idx, test_idx) in enumerate(splitter, 1):
        clf.fit(X[train_idx], Y[train_idx])
        held_out[test_idx] = clf.predict(X[test_idx])
        if not verbose:
            continue
        fold_accuracy = accuracy_score(Y[test_idx], held_out[test_idx])
        print("Fold {}: {}".format(fold_no, fold_accuracy))

    if not metric:
        return Y, held_out
    return metric(Y, held_out)
# Display given faces in a grid
def show_faces(ims, grid_size=(16, 6), image_shape=(50, 37), n_rows=3, n_cols=10):
    """Draw the first images of ``ims`` as a grid of tick-less subplots.

    Parameters
    ----------
    ims : indexable collection of arrays; each one is reshaped to
        ``image_shape`` before drawing, so flattened rows work as well as
        images that already have that shape.
    grid_size : figure size passed to ``plt.figure(figsize=...)``.
    image_shape : (height, width) each image is reshaped to. The default
        (50, 37) is the value that was previously hard-coded.
    n_rows, n_cols : subplot grid layout. At most ``n_rows * n_cols`` images
        are drawn (30 with the defaults, as before).
    """
    fig = plt.figure(figsize=grid_size)
    # Never ask for more panels than the grid can hold or than images exist.
    n_shown = min(len(ims), n_rows * n_cols)
    for i in range(n_shown):
        # Subplot positions are 1-based.
        ax = fig.add_subplot(n_rows, n_cols, i + 1, xticks=[], yticks=[])
        ax.imshow(ims[i].reshape(image_shape), cmap=plt.cm.bone)
Load the Labeled Faces in the Wild (LFW) people dataset (note: the first call downloads about 200MB of data!)
from sklearn import datasets
# Fetch the LFW people dataset, keeping only people with at least 70 pictures;
# resize=0.4 is the ratio used to shrink each face picture.
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4)
# List the fields available on the returned dataset object.
lfw_people.keys()
['images', 'data', 'target_names', 'DESCR', 'target']
# Pull the feature matrix (PX), the labels (PY) and the images (PI) out of
# the dataset in a single unpacking step.
PX, PY, PI = (lfw_people[field] for field in ('data', 'target', 'images'))
# Preview the first faces of the dataset in a grid.
show_faces(PI)
# There are 7 different people in the dataset (class labels 0 through 6)
np.unique(PY)
array([0, 1, 2, 3, 4, 5, 6])
# Number of images per person, indexed by class label; the counts are far
# from balanced, which matters when reading the accuracy figures later on.
np.bincount(PY)
array([ 77, 236, 121, 530, 109, 71, 144])
from sklearn import decomposition
# Fit a randomized PCA with 150 whitened components on the face features.
pca = decomposition.RandomizedPCA(n_components=150, whiten=True)
pca.fit(PX)
# show_faces reshapes each component to the face image shape, so the
# components can be drawn like faces; only the first 30 are displayed.
show_faces(pca.components_)
We will use an SVM to classify the faces from their PCA projections (the output of the PCA step).
from sklearn import svm
from sklearn.pipeline import Pipeline
# whitening is very important!
pipeline = Pipeline(steps=[
    ('pca', decomposition.RandomizedPCA(n_components=150, whiten=True)),
    ('svm', svm.SVC()),
])
# metric=None makes score() return the true labels together with the
# held-out predictions instead of a single number, so that detailed reports
# can be computed from them.
y, pred = score(pipeline, PX, PY, folds=5, metric=None)
from sklearn import metrics
# Confusion matrix of the cross-validated predictions: rows are the true
# classes, columns are the predicted classes.
metrics.confusion_matrix(y, pred)
array([[ 32, 7, 0, 38, 0, 0, 0], [ 0, 198, 0, 37, 0, 0, 1], [ 0, 5, 67, 49, 0, 0, 0], [ 0, 9, 1, 520, 0, 0, 0], [ 0, 3, 0, 42, 57, 0, 7], [ 0, 6, 0, 29, 0, 36, 0], [ 0, 4, 0, 46, 0, 0, 94]])
# Per-person precision / recall / F1 of the cross-validated predictions.
# print is called in function form (as in score() above) so the line runs on
# both Python 2 and Python 3.
print(metrics.classification_report(y, pred, target_names=lfw_people.target_names))
precision recall f1-score support Ariel Sharon 1.00 0.42 0.59 77 Colin Powell 0.85 0.84 0.85 236 Donald Rumsfeld 0.99 0.55 0.71 121 George W Bush 0.68 0.98 0.81 530 Gerhard Schroeder 1.00 0.52 0.69 109 Hugo Chavez 1.00 0.51 0.67 71 Tony Blair 0.92 0.65 0.76 144 avg / total 0.83 0.78 0.77 1288