#!/usr/bin/env python
# coding: utf-8

# [Sebastian Raschka](http://sebastianraschka.com), 2015
#
# https://github.com/rasbt/python-machine-learning-book

# # Python Machine Learning - Code Examples

# # Bonus Material - An Extended Nested Cross-Validation Example

# For an explanation of nested cross-validation, please see:
#
# - Chapter 6, section "Algorithm-selection-with-nested-cross-validation" (open the code example via [nbviewer](http://nbviewer.ipython.org/github/rasbt/python-machine-learning-book/blob/master/code/ch06/ch06.ipynb#Algorithm-selection-with-nested-cross-validation))
# - FAQ, section: [How do I evaluate a model?](https://github.com/rasbt/python-machine-learning-book/blob/master/faq/evaluate-a-model.md)
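# Before diving into the scikit-learn tooling below, it may help to see the two loops of nested cross-validation written out by hand. The following is a minimal, self-contained sketch (it is not part of the book's code; it uses plain permutation splits with a fixed `gamma`, no stratification, and no feature scaling, purely to make the structure explicit): the outer loop estimates generalization performance, and the inner loop selects a hyperparameter.

# In[ ]:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
X, y = iris.data, iris.target

rng = np.random.RandomState(1)
outer_folds = np.array_split(rng.permutation(len(X)), 5)  # 5 outer folds
candidate_Cs = [1, 10, 100, 1000]                         # the inner-loop "grid"

outer_scores = []
for k, outer_test in enumerate(outer_folds):              # outer loop: performance estimation
    outer_train = np.hstack([f for i, f in enumerate(outer_folds) if i != k])

    inner_folds = np.array_split(outer_train, 5)          # 5 inner folds on the outer-training part
    best_C, best_score = None, -np.inf
    for C in candidate_Cs:                                # inner loop: hyperparameter selection
        inner_scores = []
        for j, inner_test in enumerate(inner_folds):
            inner_train = np.hstack([f for i, f in enumerate(inner_folds) if i != j])
            clf = SVC(C=C, kernel='rbf', gamma=0.001).fit(X[inner_train], y[inner_train])
            inner_scores.append(clf.score(X[inner_test], y[inner_test]))
        if np.mean(inner_scores) > best_score:
            best_C, best_score = C, np.mean(inner_scores)

    # refit the inner-loop winner on the whole outer-training fold,
    # then score it on the held-out outer-test fold
    clf = SVC(C=best_C, kernel='rbf', gamma=0.001).fit(X[outer_train], y[outer_train])
    outer_scores.append(clf.score(X[outer_test], y[outer_test]))

print('Average Accuracy %.2f +/- %.2f' % (np.mean(outer_scores), np.std(outer_scores)))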
# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code examples reproducible. You can just skip the following line(s).

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scikit-learn")


# ### Dataset and Estimator Setup

# (The `sklearn.grid_search` and `sklearn.cross_validation` imports below reflect the scikit-learn version this notebook was written for; in scikit-learn >= 0.18 the same classes live in `sklearn.model_selection`, as shown in the appendix at the end.)

# In[2]:

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

# load and split data
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=42)

# pipeline setup
cls = SVC(C=10.0, kernel='rbf', gamma=0.1, decision_function_shape='ovr')
kernel_svm = Pipeline([('std', StandardScaler()),
                       ('svc', cls)])

# gridsearch setup
param_grid = [
    {'svc__C': [1, 10, 100, 1000],
     'svc__gamma': [0.001, 0.0001],
     'svc__kernel': ['rbf']},
]

# set up the GridSearchCV object (with multiple algorithms, you'd create one per algorithm)
gs_svm = GridSearchCV(estimator=kernel_svm,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=-1,
                      cv=5,
                      verbose=0,
                      refit=True,
                      pre_dispatch='2*n_jobs')


# ## A. Nested Cross-Validation - Quick Version

# Here, the `cross_val_score` function runs the 5 outer loops, and the `GridSearchCV` object (`gs_svm`) performs the hyperparameter optimization during the 5 inner loops.

# In[5]:

import numpy as np
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(gs_svm, X_train, y_train, scoring='accuracy', cv=5)
print('\nAverage Accuracy %.2f +/- %.2f' % (np.mean(scores), np.std(scores)))


# ## B. Nested Cross-Validation - Manual Approach, Printing the Model Parameters

# In[10]:

from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

params = []
scores = []

# outer 5-fold loop for performance estimation;
# gs_svm runs the inner 5-fold loop for hyperparameter selection
skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=False)

for train_idx, test_idx in skfold:
    gs_svm.fit(X_train[train_idx], y_train[train_idx])
    y_pred = gs_svm.predict(X_train[test_idx])
    acc = accuracy_score(y_true=y_train[test_idx], y_pred=y_pred)
    params.append(gs_svm.best_params_)
    scores.append(acc)


# In[11]:

print('SVM models:')
for idx, m in enumerate(zip(params, scores)):
    print('%s. Acc: %.2f Params: %s' % (idx + 1, m[1], m[0]))

print('\nAverage Accuracy %.2f +/- %.2f' % (np.mean(scores), np.std(scores)))


# ## Regular K-fold CV to Optimize the Model on the Complete Training Set

# Repeat the nested cross-validation for different algorithms. Then, pick the "best" algorithm (not the best model!). Next, use the complete training set to tune the best algorithm via grid search:

# In[12]:

gs_svm.fit(X_train, y_train)
print('Best parameters %s' % gs_svm.best_params_)


# In[14]:

train_acc = accuracy_score(y_true=y_train, y_pred=gs_svm.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=gs_svm.predict(X_test))

print('Training accuracy: %.2f' % train_acc)
print('Test accuracy: %.2f' % test_acc)
print('Parameters: %s' % gs_svm.best_params_)
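# ## Appendix: The Same Workflow with the `model_selection` API

# The `sklearn.grid_search` and `sklearn.cross_validation` modules used above were deprecated in scikit-learn 0.18 and removed in 0.20; their contents moved to `sklearn.model_selection`. The following is a minimal sketch of how sections A and B translate (assuming scikit-learn >= 0.18 and the `kernel_svm`, `param_grid`, and data variables defined above). Note that the newer `StratifiedKFold` takes `n_splits` and exposes a `split(X, y)` method instead of being constructed with the labels directly.

# In[ ]:

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

gs_svm_new = GridSearchCV(estimator=kernel_svm,
                          param_grid=param_grid,
                          scoring='accuracy',
                          n_jobs=-1,
                          cv=5,
                          refit=True)

# A. quick version: cross_val_score runs the outer loop
scores = cross_val_score(gs_svm_new, X_train, y_train, scoring='accuracy', cv=5)
print('\nAverage Accuracy %.2f +/- %.2f' % (np.mean(scores), np.std(scores)))

# B. manual version: split(X, y) replaces iterating over the old StratifiedKFold
skfold = StratifiedKFold(n_splits=5, shuffle=False)
for train_idx, test_idx in skfold.split(X_train, y_train):
    gs_svm_new.fit(X_train[train_idx], y_train[train_idx])
    acc = accuracy_score(y_true=y_train[test_idx],
                         y_pred=gs_svm_new.predict(X_train[test_idx]))
    print('Acc: %.2f Params: %s' % (acc, gs_svm_new.best_params_))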