Principal components analysis and singular value decomposition¶

In [3]:

import pandas as pd
import numpy as np

To plot dendrograms together with a heatmap we'll need to define our own function:

In [4]:

# a simple function to compute hierarchical cluster on both rows and columns, and plot heatmaps
def heatmap(dm):
    from scipy.cluster.hierarchy import linkage, dendrogram
    from scipy.spatial.distance import pdist, squareform
    
    D1 = squareform(pdist(dm, metric='euclidean'))
    D2 = squareform(pdist(dm.T, metric='euclidean'))
    
    f = figure(figsize=(8, 8))

    # add first dendrogram
    ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = linkage(D1, method='complete')
    Z1 = dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # add second dendrogram
    ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2])
    Y = linkage(D2, method='complete')
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # add matrix plot
    axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = dm[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', cmap='hot')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    
    return {'ordered' : D, 'rorder' : Z1['leaves'], 'corder' : Z2['leaves']}

Here again we'll use rpy2 to generate the same data as in the course video.

In [5]:

%load_ext rmagic

In [6]:

%%R -o dataMatrix
set.seed(12345); par(mar=rep(0.2,4))
dataMatrix <- matrix(rnorm(400),nrow=40)

set.seed(678910)
for(i in 1:40){
   coinFlip <- rbinom(1,size=1,prob=0.5)
   if(coinFlip){
     dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,3),each=5)
   }
}

In [8]:

# dataMatrix is generated and pattern-added using the same R commands as in the video
f, ax = subplots(ncols=1)

ax.matshow(dataMatrix, aspect='auto', cmap='hot');

In [10]:

# the heatmap after a pattern is added
# note: the resulting cluster pattern is somehow mirrored compared to the one in the video
hh = heatmap(dataMatrix)

In [11]:

# patterns in rows and columns; compute and plot column and row means
f, (ax1, ax2, ax3) = subplots(ncols=3)

dataMatrixOrdered = hh['ordered']

ax1.matshow(dataMatrixOrdered, cmap='hot')

ax2.plot(np.mean(dataMatrixOrdered, axis=1), range(39, -1, -1), 'ok')
ax2.set_xlabel('Row Mean')
ax2.set_ylabel('Row')

ax3.plot(np.mean(dataMatrixOrdered, axis=0), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Column Mean')

f.tight_layout();

SVD

To perform SVD we need to normalize the matrix. Here we'll define a function to do that, however the functionality is actually available in scikit-learn.

In [12]:

# a simple scale function to normalize a matrix
def scale(matrix):
    from numpy import mean, std
    return (matrix - mean(matrix, axis=0)) / std(matrix, axis=0)

In [14]:

# compute SVD
# note: the U and V matrices computed in numpy are different from R
# the resulting plots below aren't thus the same as the ones in video
U, D, V = np.linalg.svd(scale(dataMatrixOrdered))

f, (ax1, ax2, ax3) = subplots(ncols=3)

ax1.matshow(dataMatrixOrdered, cmap='hot')

ax2.plot(U[:,0], range(39, -1, -1), 'ok')
ax2.set_xlabel('First left singular vector')
ax2.set_ylabel('Row')
ax2.set_xticks([-0.2, 0.0, 0.2])

ax3.plot(V[:,0], 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('First right singular vector')

f.tight_layout();

In [15]:

# the D matrix, however, is the same
f, (ax1, ax2) = subplots(ncols=2)

ax1.plot(D, 'ok')
ax1.set_xlabel('Column')
ax1.set_ylabel('Singular value')

ax2.plot(D**2 / np.sum(D**2), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Percent of variance explained')

f.tight_layout();

The video talks about the relation between SVD and PCA. There is a PCA functionality provided in scikit-learn, but I don't include it here.

Continuing with SVD:

In [17]:

# components of the SVD - variance explained
constantMatrix = dataMatrixOrdered * 0
constantMatrix[:,5:] = 1

U1, D1, V1 = np.linalg.svd(constantMatrix)

f, (ax1, ax2, ax3) = subplots(ncols=3)

ax1.matshow(constantMatrix, cmap='autumn')

ax2.plot(D1, 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Singular value')

ax3.plot(D1**2 / np.sum(D1**2), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Percent of variance explained')

f.tight_layout();

In [19]:

# again generate a new dataMatrix with R

In [20]:

%%R -o dataMatrix
set.seed(12345); par(mar=rep(0.2,4))
dataMatrix <- matrix(rnorm(400),nrow=40)

set.seed(678910)
for(i in 1:40){
   coinFlip1 <- rbinom(1,size=1,prob=0.5)
   coinFlip2 <- rbinom(1,size=1,prob=0.5)
   
   if(coinFlip1){
     dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),each=5)
   }
   if(coinFlip2){
     dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),5)
   }
}

In [21]:

# cluster the dataMatrix
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

hh = dendrogram(linkage(pdist(dataMatrix)))

dataMatrixOrdered = dataMatrix[hh['leaves'], :]

In [22]:

# SVD - true pattern that was added to the dataMatrix
f, (ax1, ax2, ax3) = subplots(ncols=3)

ax1.matshow(dataMatrixOrdered, cmap='hot', origin='lower')

x = np.array([0, 1])

ax2.plot(np.repeat(x, 5), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Pattern 1')

ax3.plot(np.tile(x, 5), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Pattern 2')

f.tight_layout();

In [23]:

# V and patterns of variance in rows
# again, the SVD results here are not comparable to the video
U, D, V = np.linalg.svd(scale(dataMatrixOrdered))

f, (ax1, ax2, ax3) = subplots(ncols=3)

ax1.matshow(dataMatrixOrdered, cmap='hot')

ax2.plot(V[:,0], 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('First right singular vector')
ax2.set_ylim([-0.8, 0.6])

ax3.plot(V[:,1], 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Second right singular vector')
ax3.set_ylim([-0.8, 0.6])

f.tight_layout();

In [24]:

# D and variance explained
f, (ax1, ax2) = subplots(ncols=2)

ax1.plot(D, 'ok')
ax1.set_xlabel('Column')
ax1.set_ylabel('Singular value')

ax2.plot(D**2 / np.sum(D**2), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Percent of variance explained')

f.tight_layout();

In [34]:

# SVD performance
bigMatrix = np.reshape(np.random.normal(size=1e4*40), (1e4, -1))

# this computes full U matrix; warning: slow
print 'timing for computing full SVD matrices:'
%timeit numpy.linalg.svd(scale(bigMatrix))
print '-----------------------'

# this won't produce complete U and V matrices, but way faster
# I believe this is the comparable result compared to R
# by looking at the dimension of the U matrix from R's SVD, 
# R by default doesn't compute full U matrix
print 'timing for computing incomplete SVD matrices:'
%timeit numpy.linalg.svd(scale(bigMatrix), full_matrices=False)

timing for computing full SVD matrices:
1 loops, best of 3: 30.7 s per loop
-----------------------
timing for computing incomplete SVD matrices:
10 loops, best of 3: 86.5 ms per loop

In [35]:

# for comparison, this is how R performs on my machine
from IPython.core.display import Image
Image('https://dl.dropbox.com/u/12210898/Rscreenshot.png')

Out[35]:

In [27]:

# SVD and NaN values
import random

dataMatrix2 = dataMatrixOrdered.ravel()
dataMatrix2[random.sample(range(100), 40)] = NaN
dataMatrix2 = dataMatrix2.reshape(dataMatrixOrdered.shape)

# ERROR
np.linalg.svd(scale(dataMatrix2))

---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
<ipython-input-27-e1d0ccd9dff7> in <module>()
      7 
      8 # error
----> 9 np.linalg.svd(scale(dataMatrix2))

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in svd(a, full_matrices, compute_uv)
   1319                                  work, lwork, iwork, 0)
   1320     if results['info'] > 0:
-> 1321         raise LinAlgError, 'SVD did not converge'
   1322     s = s.astype(_realType(result_t))
   1323     if compute_uv:

LinAlgError: SVD did not converge

In [28]:

# interpolate NaN values and compare the SVD results
def nan_helper(y):
    return np.isnan(y), lambda z: z.nonzero()[0]
    
idnan, x = nan_helper(dataMatrix2)
dataMatrix2[idnan] = np.interp(x(idnan), x(~idnan), dataMatrix2[~idnan])

U1, D1, V1 = np.linalg.svd(scale(dataMatrixOrdered))
U2, D2, V2 = np.linalg.svd(scale(dataMatrix2))

f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(V1[:,0], 'ok')
ax1.set_ylim([-0.8, 0.8])
ax2.plot(V2[:,0], 'ok')
ax2.set_ylim([-0.8, 0.8])
f.tight_layout();

In [29]:

%%R -o faceData
download.file("https://spark-public.s3.amazonaws.com/dataanalysis/face.rda",destfile="./data/face.rda", method="curl")
load("../data/face.rda")

In [30]:

# face example
matshow(faceData, cmap='hot');

In [32]:

# face example - variance explained
U, D, V = np.linalg.svd(scale(faceData))

plot(D**2 / np.sum(D**2), 'ok')
xlabel('Singular vector')
ylabel('Variance explained');

In [33]:

# face example - create approximation
# note: the V matrix from numpy SVD is transposed compared to R
# therefore there's no need to transpose it again for the reconstruction
approx1 = np.dot(U[:,:1], np.dot(np.diag(D)[:1,:1], V[:1,:]))
approx5 = np.dot(U[:,:5], np.dot(np.diag(D)[:5,:5], V[:5,:]))
approx10 = np.dot(U[:,:10], np.dot(np.diag(D)[:10,:10], V[:10,:]))

f, (ax1, ax2, ax3, ax4) = subplots(ncols=4)

ax1.matshow(faceData, cmap='hot')
ax2.matshow(approx10, cmap='hot')
ax3.matshow(approx5, cmap='hot')
ax4.matshow(approx1, cmap='hot');

In [ ]: