#!/usr/bin/env python
# coding: utf-8

# # Principal Components Analysis
#
# In this notebook we'll explore the digits data with PCA.

# In[10]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the digits data and center it (with `with_std=False`, each feature has its mean removed but is not scaled to unit variance):

# In[26]:

digits = datasets.load_digits()
X = digits.data
y = digits.target

scaler = StandardScaler(with_std=False).fit(X)
X = scaler.transform(X)
X.shape

# Let's apply PCA to the data and keep the first 20 principal components:

# In[31]:

pca = PCA(n_components=20)
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
X_reduced.shape

# Now, let's visualize the first two principal components, coloring each point by its digit class:

# In[16]:

plt.scatter(X_reduced[:, 0], X_reduced[:, 1], alpha=0.5, c=y, cmap=plt.cm.Paired)

# Let's take a look at the next two principal components:

# In[17]:

plt.scatter(X_reduced[:, 2], X_reduced[:, 3], alpha=0.5, c=y, cmap=plt.cm.Paired)

# Let's go higher and look at the last two of the 20 components we kept, where the classes are much less separated:

# In[32]:

plt.scatter(X_reduced[:, 18], X_reduced[:, 19], alpha=0.5, c=y, cmap=plt.cm.Paired)

# Let's compute baseline accuracy with all the features. We use a linear SVM whose C is tuned by an inner grid search, and estimate accuracy with an outer stratified 5-fold cross-validation:

# In[24]:

cv_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
param_grid = {'C': np.logspace(-2, 1, 4)}
print(param_grid)

classifier = GridSearchCV(SVC(kernel='linear', C=1), param_grid, cv=cv_generator)
cv_results = cross_validate(classifier, X, y, cv=cv_generator,
                            scoring='accuracy', return_train_score=False)
np.mean(cv_results['test_score'])

# And now with the 20 PCA components instead of the original 64 features:

# In[30]:

cv_results = cross_validate(classifier, X_reduced, y, cv=cv_generator,
                            scoring='accuracy', return_train_score=False)
np.mean(cv_results['test_score'])
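
# Note that above we fit PCA on the full dataset before cross-validating, so the reduction step "sees" the test folds.
# A minimal sketch of one way to avoid that is to chain the centering, the PCA, and the grid-searched SVM in a
# Pipeline, so that every step is fit only on the training portion of each fold. The step names ('center', 'pca',
# 'svm') are arbitrary labels chosen here, not anything prescribed by scikit-learn:

# In[ ]:

from sklearn.pipeline import Pipeline

# Reuse param_grid and cv_generator from the cells above; start from the raw
# digits.data so the pipeline handles the centering itself on each fold.
pipeline = Pipeline([
    ('center', StandardScaler(with_std=False)),
    ('pca', PCA(n_components=20)),
    ('svm', GridSearchCV(SVC(kernel='linear'), param_grid, cv=cv_generator)),
])
cv_results = cross_validate(pipeline, digits.data, y, cv=cv_generator,
                            scoring='accuracy', return_train_score=False)
np.mean(cv_results['test_score'])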
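
# We kept 20 components somewhat arbitrarily. A common heuristic is to keep enough components to explain a chosen
# fraction of the total variance; the sketch below uses a 95% threshold, which is an illustrative choice rather than
# a tuned value. scikit-learn's PCA also accepts a float in (0, 1) as n_components and picks the number for you:

# In[ ]:

# Cumulative explained variance of the centered digits data
pca_full = PCA().fit(X)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(np.arange(1, len(cumulative) + 1), cumulative)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')

# Smallest number of components reaching the 95% threshold
n_95 = np.argmax(cumulative >= 0.95) + 1
print(n_95)

# Equivalent shortcut: let PCA choose the number of components itself
pca_95 = PCA(n_components=0.95).fit(X)
print(pca_95.n_components_)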