import pandas as pd
import numpy as np
To plot dendrograms together with a heatmap we'll need to define our own function:
# a simple function to compute hierarchical cluster on both rows and columns, and plot heatmaps
def heatmap(dm):
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
D1 = squareform(pdist(dm, metric='euclidean'))
D2 = squareform(pdist(dm.T, metric='euclidean'))
f = figure(figsize=(8, 8))
# add first dendrogram
ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6])
Y = linkage(D1, method='complete')
Z1 = dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
# add second dendrogram
ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2])
Y = linkage(D2, method='complete')
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])
# add matrix plot
axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = dm[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', cmap='hot')
axmatrix.set_xticks([])
axmatrix.set_yticks([])
return {'ordered' : D, 'rorder' : Z1['leaves'], 'corder' : Z2['leaves']}
Here again we'll use rpy2
to generate the same data as in the course video.
%load_ext rmagic
%%R -o dataMatrix
set.seed(12345); par(mar=rep(0.2,4))
dataMatrix <- matrix(rnorm(400),nrow=40)
set.seed(678910)
for(i in 1:40){
coinFlip <- rbinom(1,size=1,prob=0.5)
if(coinFlip){
dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,3),each=5)
}
}
# dataMatrix is generated and pattern-added using the same R commands as in the video
f, ax = subplots(ncols=1)
ax.matshow(dataMatrix, aspect='auto', cmap='hot');
# the heatmap after a pattern is added
# note: the resulting cluster pattern is somehow mirrored compared to the one in the video
hh = heatmap(dataMatrix)
# patterns in rows and columns; compute and plot column and row means
f, (ax1, ax2, ax3) = subplots(ncols=3)
dataMatrixOrdered = hh['ordered']
ax1.matshow(dataMatrixOrdered, cmap='hot')
ax2.plot(np.mean(dataMatrixOrdered, axis=1), range(39, -1, -1), 'ok')
ax2.set_xlabel('Row Mean')
ax2.set_ylabel('Row')
ax3.plot(np.mean(dataMatrixOrdered, axis=0), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Column Mean')
f.tight_layout();
SVD
To perform SVD we need to normalize the matrix. Here we'll define a function to do that, however the functionality is actually available in scikit-learn
.
# a simple scale function to normalize a matrix
def scale(matrix):
from numpy import mean, std
return (matrix - mean(matrix, axis=0)) / std(matrix, axis=0)
# compute SVD
# note: the U and V matrices computed in numpy are different from R
# the resulting plots below aren't thus the same as the ones in video
U, D, V = np.linalg.svd(scale(dataMatrixOrdered))
f, (ax1, ax2, ax3) = subplots(ncols=3)
ax1.matshow(dataMatrixOrdered, cmap='hot')
ax2.plot(U[:,0], range(39, -1, -1), 'ok')
ax2.set_xlabel('First left singular vector')
ax2.set_ylabel('Row')
ax2.set_xticks([-0.2, 0.0, 0.2])
ax3.plot(V[:,0], 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('First right singular vector')
f.tight_layout();
# the D matrix, however, is the same
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(D, 'ok')
ax1.set_xlabel('Column')
ax1.set_ylabel('Singular value')
ax2.plot(D**2 / np.sum(D**2), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Percent of variance explained')
f.tight_layout();
The video talks about the relation between SVD and PCA. There is a PCA functionality provided in scikit-learn
, but I don't include it here.
Continuing with SVD:
# components of the SVD - variance explained
constantMatrix = dataMatrixOrdered * 0
constantMatrix[:,5:] = 1
U1, D1, V1 = np.linalg.svd(constantMatrix)
f, (ax1, ax2, ax3) = subplots(ncols=3)
ax1.matshow(constantMatrix, cmap='autumn')
ax2.plot(D1, 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Singular value')
ax3.plot(D1**2 / np.sum(D1**2), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Percent of variance explained')
f.tight_layout();
# again generate a new dataMatrix with R
%%R -o dataMatrix
set.seed(12345); par(mar=rep(0.2,4))
dataMatrix <- matrix(rnorm(400),nrow=40)
set.seed(678910)
for(i in 1:40){
coinFlip1 <- rbinom(1,size=1,prob=0.5)
coinFlip2 <- rbinom(1,size=1,prob=0.5)
if(coinFlip1){
dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),each=5)
}
if(coinFlip2){
dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),5)
}
}
# cluster the dataMatrix
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
hh = dendrogram(linkage(pdist(dataMatrix)))
dataMatrixOrdered = dataMatrix[hh['leaves'], :]
# SVD - true pattern that was added to the dataMatrix
f, (ax1, ax2, ax3) = subplots(ncols=3)
ax1.matshow(dataMatrixOrdered, cmap='hot', origin='lower')
x = np.array([0, 1])
ax2.plot(np.repeat(x, 5), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Pattern 1')
ax3.plot(np.tile(x, 5), 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Pattern 2')
f.tight_layout();
# V and patterns of variance in rows
# again, the SVD results here are not comparable to the video
U, D, V = np.linalg.svd(scale(dataMatrixOrdered))
f, (ax1, ax2, ax3) = subplots(ncols=3)
ax1.matshow(dataMatrixOrdered, cmap='hot')
ax2.plot(V[:,0], 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('First right singular vector')
ax2.set_ylim([-0.8, 0.6])
ax3.plot(V[:,1], 'ok')
ax3.set_xlabel('Column')
ax3.set_ylabel('Second right singular vector')
ax3.set_ylim([-0.8, 0.6])
f.tight_layout();
# D and variance explained
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(D, 'ok')
ax1.set_xlabel('Column')
ax1.set_ylabel('Singular value')
ax2.plot(D**2 / np.sum(D**2), 'ok')
ax2.set_xlabel('Column')
ax2.set_ylabel('Percent of variance explained')
f.tight_layout();
# SVD performance
bigMatrix = np.reshape(np.random.normal(size=1e4*40), (1e4, -1))
# this computes full U matrix; warning: slow
print 'timing for computing full SVD matrices:'
%timeit numpy.linalg.svd(scale(bigMatrix))
print '-----------------------'
# this won't produce complete U and V matrices, but way faster
# I believe this is the comparable result compared to R
# by looking at the dimension of the U matrix from R's SVD,
# R by default doesn't compute full U matrix
print 'timing for computing incomplete SVD matrices:'
%timeit numpy.linalg.svd(scale(bigMatrix), full_matrices=False)
timing for computing full SVD matrices: 1 loops, best of 3: 30.7 s per loop ----------------------- timing for computing incomplete SVD matrices: 10 loops, best of 3: 86.5 ms per loop
# for comparison, this is how R performs on my machine
from IPython.core.display import Image
Image('https://dl.dropbox.com/u/12210898/Rscreenshot.png')
# SVD and NaN values
import random
dataMatrix2 = dataMatrixOrdered.ravel()
dataMatrix2[random.sample(range(100), 40)] = NaN
dataMatrix2 = dataMatrix2.reshape(dataMatrixOrdered.shape)
# ERROR
np.linalg.svd(scale(dataMatrix2))
--------------------------------------------------------------------------- LinAlgError Traceback (most recent call last) <ipython-input-27-e1d0ccd9dff7> in <module>() 7 8 # error ----> 9 np.linalg.svd(scale(dataMatrix2)) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in svd(a, full_matrices, compute_uv) 1319 work, lwork, iwork, 0) 1320 if results['info'] > 0: -> 1321 raise LinAlgError, 'SVD did not converge' 1322 s = s.astype(_realType(result_t)) 1323 if compute_uv: LinAlgError: SVD did not converge
# interpolate NaN values and compare the SVD results
def nan_helper(y):
return np.isnan(y), lambda z: z.nonzero()[0]
idnan, x = nan_helper(dataMatrix2)
dataMatrix2[idnan] = np.interp(x(idnan), x(~idnan), dataMatrix2[~idnan])
U1, D1, V1 = np.linalg.svd(scale(dataMatrixOrdered))
U2, D2, V2 = np.linalg.svd(scale(dataMatrix2))
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(V1[:,0], 'ok')
ax1.set_ylim([-0.8, 0.8])
ax2.plot(V2[:,0], 'ok')
ax2.set_ylim([-0.8, 0.8])
f.tight_layout();
%%R -o faceData
download.file("https://spark-public.s3.amazonaws.com/dataanalysis/face.rda",destfile="./data/face.rda", method="curl")
load("../data/face.rda")
# face example
matshow(faceData, cmap='hot');
# face example - variance explained
U, D, V = np.linalg.svd(scale(faceData))
plot(D**2 / np.sum(D**2), 'ok')
xlabel('Singular vector')
ylabel('Variance explained');
# face example - create approximation
# note: the V matrix from numpy SVD is transposed compared to R
# therefore there's no need to transpose it again for the reconstruction
approx1 = np.dot(U[:,:1], np.dot(np.diag(D)[:1,:1], V[:1,:]))
approx5 = np.dot(U[:,:5], np.dot(np.diag(D)[:5,:5], V[:5,:]))
approx10 = np.dot(U[:,:10], np.dot(np.diag(D)[:10,:10], V[:10,:]))
f, (ax1, ax2, ax3, ax4) = subplots(ncols=4)
ax1.matshow(faceData, cmap='hot')
ax2.matshow(approx10, cmap='hot')
ax3.matshow(approx5, cmap='hot')
ax4.matshow(approx1, cmap='hot');