#!/usr/bin/env python
# coding: utf-8

# # Cross Validation
#
# As a first step we need a classifier.

import numpy as np
from sklearn.linear_model import LogisticRegression
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# everything it provided now lives in sklearn.model_selection.
from sklearn import model_selection
from sklearn import metrics

# Scikit-learn has datasets that are already ready for use:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

# A scikit-learn data object is a container object whose interesting
# attributes are:
# * 'data': the data to learn,
# * 'target': the classification labels,
# * 'target_names': the meaning of the labels,
# * 'feature_names': the meaning of the features, and
# * 'DESCR': the full description of the dataset.

X = data.data
y = data.target
print(data.target_names)
print(data.feature_names)

# Hold out 40% of the samples for testing; random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Let's compute the accuracy of our predictions (in two different ways):
print(len(np.where(np.equal(y_pred, y_test))[0]) / len(y_test))
print(np.sum(y_pred == y_test) / len(y_test))

# We can do the same using scikit-learn:
print(metrics.accuracy_score(y_test, y_pred))

# Now let's compute accuracy using 5-fold cross-validation instead:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='accuracy'))

# You can obtain scores for other metrics, such as area under the ROC curve:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='roc_auc'))

# It's a good idea to first obtain the predictions, and then compute accuracy:
y_predict = model_selection.cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))

# Here's an alternative way of doing cross-validation.
# We first construct the fold object explicitly.
# NOTE: the modern StratifiedKFold API takes n_splits at construction time;
# the labels are passed only to .split(). The old (y, n_folds) constructor
# was removed along with sklearn.cross_validation.
cv = model_selection.StratifiedKFold(n_splits=5)

# Using this division of data into folds we can run cross-validation:
y_predict = model_selection.cross_val_predict(classifier, X, y, cv=cv)
print(metrics.accuracy_score(y, y_predict))


def fold_assignments(cv, X, y):
    """Return an int array mapping each sample index to its test-fold number.

    Replacement for the removed StratifiedKFold.test_folds attribute:
    the modern API exposes fold membership only through .split().
    """
    folds = np.empty(len(y), dtype=int)
    for fold_number, (_, test_index) in enumerate(cv.split(X, y)):
        folds[test_index] = fold_number
    return folds


# We can see how examples were divided into folds:
print(fold_assignments(cv, X, y))

# Hmm... unshuffled folds follow the order of the data, which is not ideal,
# so let's shuffle things a bit...
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# If you run division into folds multiple times you will get a
# different answer:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# If you want to consistently get the same division into folds, set
# random_state — it seeds the random number generator used for shuffling:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)