#!/usr/bin/env python
# coding: utf-8

# # Cross Validation
#
# As a first step we need a classifier.

import numpy as np
from sklearn.linear_model import LogisticRegression
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# everything it provided now lives in sklearn.model_selection.
from sklearn import model_selection
from sklearn import metrics

# Scikit-learn has datasets that are already ready for use:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

# A scikit-learn data object is a container object whose interesting
# attributes are:
# * 'data': the data to learn,
# * 'target': the classification labels,
# * 'target_names': the meaning of the labels,
# * 'feature_names': the meaning of the features, and
# * 'DESCR': the full description of the dataset.

X = data.data
y = data.target
print(data.target_names)
print(data.feature_names)

# Hold out 40% of the samples for testing; random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Let's compute the accuracy of our predictions (in two different ways):
print(len(np.where(np.equal(y_pred, y_test))[0]) / len(y_test))
print(np.sum(y_pred == y_test) / len(y_test))

# We can do the same using scikit-learn:
print(metrics.accuracy_score(y_test, y_pred))

# Now let's compute accuracy using 5-fold cross-validation instead:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='accuracy'))

# You can obtain scores for other metrics, such as area under the ROC curve:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='roc_auc'))

# It's a good idea to first obtain the predictions, and then compute accuracy:
y_predict = model_selection.cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))

# Here's an alternative way of doing cross-validation.
# We first construct the fold object explicitly.
# NOTE: the modern StratifiedKFold API takes n_splits at construction time;
# the labels are passed only to .split(). The old (y, n_folds) constructor
# was removed along with sklearn.cross_validation.
cv = model_selection.StratifiedKFold(n_splits=5)

# Using this division of data into folds we can run cross-validation:
y_predict = model_selection.cross_val_predict(classifier, X, y, cv=cv)
print(metrics.accuracy_score(y, y_predict))


def fold_assignments(cv, X, y):
    """Return an int array mapping each sample index to its test-fold number.

    Replacement for the removed StratifiedKFold.test_folds attribute:
    the modern API exposes fold membership only through .split().
    """
    folds = np.empty(len(y), dtype=int)
    for fold_number, (_, test_index) in enumerate(cv.split(X, y)):
        folds[test_index] = fold_number
    return folds


# We can see how examples were divided into folds:
print(fold_assignments(cv, X, y))

# Hmm... unshuffled folds follow the order of the data, which is not ideal,
# so let's shuffle things a bit...
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# If you run division into folds multiple times you will get a
# different answer:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# If you want to consistently get the same division into folds, set
# random_state — it seeds the random number generator used for shuffling:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)