import pandas as pd import numpy as np # a simple function to compute hierarchical cluster on both rows and columns, and plot heatmaps def heatmap(dm): from scipy.cluster.hierarchy import linkage, dendrogram from scipy.spatial.distance import pdist, squareform D1 = squareform(pdist(dm, metric='euclidean')) D2 = squareform(pdist(dm.T, metric='euclidean')) f = figure(figsize=(8, 8)) # add first dendrogram ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6]) Y = linkage(D1, method='complete') Z1 = dendrogram(Y, orientation='right') ax1.set_xticks([]) ax1.set_yticks([]) # add second dendrogram ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2]) Y = linkage(D2, method='complete') Z2 = dendrogram(Y) ax2.set_xticks([]) ax2.set_yticks([]) # add matrix plot axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6]) idx1 = Z1['leaves'] idx2 = Z2['leaves'] D = dm[idx1, :] D = D[:, idx2] im = axmatrix.matshow(D, aspect='auto', cmap='hot') axmatrix.set_xticks([]) axmatrix.set_yticks([]) return {'ordered' : D, 'rorder' : Z1['leaves'], 'corder' : Z2['leaves']} %load_ext rmagic %%R -o dataMatrix set.seed(12345); par(mar=rep(0.2,4)) dataMatrix <- matrix(rnorm(400),nrow=40) set.seed(678910) for(i in 1:40){ coinFlip <- rbinom(1,size=1,prob=0.5) if(coinFlip){ dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,3),each=5) } } # dataMatrix is generated and pattern-added using the same R commands as in the video f, ax = subplots(ncols=1) ax.matshow(dataMatrix, aspect='auto', cmap='hot'); # the heatmap after a pattern is added # note: the resulting cluster pattern is somehow mirrored compared to the one in the video hh = heatmap(dataMatrix) # patterns in rows and columns; compute and plot column and row means f, (ax1, ax2, ax3) = subplots(ncols=3) dataMatrixOrdered = hh['ordered'] ax1.matshow(dataMatrixOrdered, cmap='hot') ax2.plot(np.mean(dataMatrixOrdered, axis=1), range(39, -1, -1), 'ok') ax2.set_xlabel('Row Mean') ax2.set_ylabel('Row') ax3.plot(np.mean(dataMatrixOrdered, axis=0), 'ok') ax3.set_xlabel('Column') ax3.set_ylabel('Column Mean') f.tight_layout(); # a simple scale function to normalize a matrix def scale(matrix): from numpy import mean, std return (matrix - mean(matrix, axis=0)) / std(matrix, axis=0) # compute SVD # note: the U and V matrices computed in numpy are different from R # the resulting plots below aren't thus the same as the ones in video U, D, V = np.linalg.svd(scale(dataMatrixOrdered)) f, (ax1, ax2, ax3) = subplots(ncols=3) ax1.matshow(dataMatrixOrdered, cmap='hot') ax2.plot(U[:,0], range(39, -1, -1), 'ok') ax2.set_xlabel('First left singular vector') ax2.set_ylabel('Row') ax2.set_xticks([-0.2, 0.0, 0.2]) ax3.plot(V[:,0], 'ok') ax3.set_xlabel('Column') ax3.set_ylabel('First right singular vector') f.tight_layout(); # the D matrix, however, is the same f, (ax1, ax2) = subplots(ncols=2) ax1.plot(D, 'ok') ax1.set_xlabel('Column') ax1.set_ylabel('Singular value') ax2.plot(D**2 / np.sum(D**2), 'ok') ax2.set_xlabel('Column') ax2.set_ylabel('Percent of variance explained') f.tight_layout(); # components of the SVD - variance explained constantMatrix = dataMatrixOrdered * 0 constantMatrix[:,5:] = 1 U1, D1, V1 = np.linalg.svd(constantMatrix) f, (ax1, ax2, ax3) = subplots(ncols=3) ax1.matshow(constantMatrix, cmap='autumn') ax2.plot(D1, 'ok') ax2.set_xlabel('Column') ax2.set_ylabel('Singular value') ax3.plot(D1**2 / np.sum(D1**2), 'ok') ax3.set_xlabel('Column') ax3.set_ylabel('Percent of variance explained') f.tight_layout(); # again generate a new dataMatrix with R %%R -o dataMatrix set.seed(12345); par(mar=rep(0.2,4)) dataMatrix <- matrix(rnorm(400),nrow=40) set.seed(678910) for(i in 1:40){ coinFlip1 <- rbinom(1,size=1,prob=0.5) coinFlip2 <- rbinom(1,size=1,prob=0.5) if(coinFlip1){ dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),each=5) } if(coinFlip2){ dataMatrix[i,] <- dataMatrix[i,] + rep(c(0,5),5) } } # cluster the dataMatrix from scipy.cluster.hierarchy import linkage, dendrogram from scipy.spatial.distance import pdist, squareform hh = dendrogram(linkage(pdist(dataMatrix))) dataMatrixOrdered = dataMatrix[hh['leaves'], :] # SVD - true pattern that was added to the dataMatrix f, (ax1, ax2, ax3) = subplots(ncols=3) ax1.matshow(dataMatrixOrdered, cmap='hot', origin='lower') x = np.array([0, 1]) ax2.plot(np.repeat(x, 5), 'ok') ax2.set_xlabel('Column') ax2.set_ylabel('Pattern 1') ax3.plot(np.tile(x, 5), 'ok') ax3.set_xlabel('Column') ax3.set_ylabel('Pattern 2') f.tight_layout(); # V and patterns of variance in rows # again, the SVD results here are not comparable to the video U, D, V = np.linalg.svd(scale(dataMatrixOrdered)) f, (ax1, ax2, ax3) = subplots(ncols=3) ax1.matshow(dataMatrixOrdered, cmap='hot') ax2.plot(V[:,0], 'ok') ax2.set_xlabel('Column') ax2.set_ylabel('First right singular vector') ax2.set_ylim([-0.8, 0.6]) ax3.plot(V[:,1], 'ok') ax3.set_xlabel('Column') ax3.set_ylabel('Second right singular vector') ax3.set_ylim([-0.8, 0.6]) f.tight_layout(); # D and variance explained f, (ax1, ax2) = subplots(ncols=2) ax1.plot(D, 'ok') ax1.set_xlabel('Column') ax1.set_ylabel('Singular value') ax2.plot(D**2 / np.sum(D**2), 'ok') ax2.set_xlabel('Column') ax2.set_ylabel('Percent of variance explained') f.tight_layout(); # SVD performance bigMatrix = np.reshape(np.random.normal(size=1e4*40), (1e4, -1)) # this computes full U matrix; warning: slow print 'timing for computing full SVD matrices:' %timeit numpy.linalg.svd(scale(bigMatrix)) print '-----------------------' # this won't produce complete U and V matrices, but way faster # I believe this is the comparable result compared to R # by looking at the dimension of the U matrix from R's SVD, # R by default doesn't compute full U matrix print 'timing for computing incomplete SVD matrices:' %timeit numpy.linalg.svd(scale(bigMatrix), full_matrices=False) # for comparison, this is how R performs on my machine from IPython.core.display import Image Image('https://dl.dropbox.com/u/12210898/Rscreenshot.png') # SVD and NaN values import random dataMatrix2 = dataMatrixOrdered.ravel() dataMatrix2[random.sample(range(100), 40)] = NaN dataMatrix2 = dataMatrix2.reshape(dataMatrixOrdered.shape) # ERROR np.linalg.svd(scale(dataMatrix2)) # interpolate NaN values and compare the SVD results def nan_helper(y): return np.isnan(y), lambda z: z.nonzero()[0] idnan, x = nan_helper(dataMatrix2) dataMatrix2[idnan] = np.interp(x(idnan), x(~idnan), dataMatrix2[~idnan]) U1, D1, V1 = np.linalg.svd(scale(dataMatrixOrdered)) U2, D2, V2 = np.linalg.svd(scale(dataMatrix2)) f, (ax1, ax2) = subplots(ncols=2) ax1.plot(V1[:,0], 'ok') ax1.set_ylim([-0.8, 0.8]) ax2.plot(V2[:,0], 'ok') ax2.set_ylim([-0.8, 0.8]) f.tight_layout(); %%R -o faceData download.file("https://spark-public.s3.amazonaws.com/dataanalysis/face.rda",destfile="./data/face.rda", method="curl") load("../data/face.rda") # face example matshow(faceData, cmap='hot'); # face example - variance explained U, D, V = np.linalg.svd(scale(faceData)) plot(D**2 / np.sum(D**2), 'ok') xlabel('Singular vector') ylabel('Variance explained'); # face example - create approximation # note: the V matrix from numpy SVD is transposed compared to R # therefore there's no need to transpose it again for the reconstruction approx1 = np.dot(U[:,:1], np.dot(np.diag(D)[:1,:1], V[:1,:])) approx5 = np.dot(U[:,:5], np.dot(np.diag(D)[:5,:5], V[:5,:])) approx10 = np.dot(U[:,:10], np.dot(np.diag(D)[:10,:10], V[:10,:])) f, (ax1, ax2, ax3, ax4) = subplots(ncols=4) ax1.matshow(faceData, cmap='hot') ax2.matshow(approx10, cmap='hot') ax3.matshow(approx5, cmap='hot') ax4.matshow(approx1, cmap='hot');