import numpy as np
As always, we create the same data as in the video via rpy2.
%load_ext rmagic
%%R -o x,y
# Simulate 12 points: three groups of four, centred at (1, 1), (2, 2), (3, 1),
# each coordinate drawn with standard deviation 0.2. x and y are exported (-o).
set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)

# Scatter plot with each point labelled 0..11, offset slightly from the marker.
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(0:11))
Perform k-means clustering. We'll use SciPy's `scipy.cluster.vq` module here, but k-means is also available in scikit-learn.
from scipy.cluster.vq import kmeans, vq

# Build the observation matrix: one row per point, columns are (x, y).
# np.column_stack takes a real sequence, unlike np.vstack(zip(x, y)), which
# hands NumPy a bare iterator on Python 3 and is rejected by current NumPy.
# Assumes x and y arrive from the R cell as 1-D numeric arrays of equal length.
data = np.column_stack((x, y))

# Fit 3 centroids, then assign every observation to its nearest centroid.
# kmeans starts from randomly chosen observations, so the label numbering
# (which cluster is called 0, 1 or 2) can differ between runs.
centers, _ = kmeans(data, 3)
cluster, _ = vq(data, centers)
cluster
array([2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0])
# Plot the clusters: green, red and black circles for labels 0, 1 and 2,
# followed by the fitted centroids drawn as large '+' markers.
for label, marker_fmt in enumerate(('og', 'or', 'ok')):
    members = data[cluster == label]
    plot(members[:, 0], members[:, 1], marker_fmt)
plot(centers[:, 0], centers[:, 1], '+', markersize=15);
Again, generate the data using rpy2.
%%R -i x,y -o dataMatrix
# Rebuild the (x, y) table inside R from the imported vectors (-i), then
# export (-o) a matrix whose 12 rows are shuffled reproducibly.
dataFrame <- data.frame(x = x, y = y)
set.seed(1234)
rowOrder <- sample(1:12)
dataMatrix <- as.matrix(dataFrame)[rowOrder, ]
The 'heatmap' plot from the course video is generated here using matplotlib's `matshow` function.
# Show heatmaps of the shuffled matrix as-is (left) and with its rows
# regrouped by k-means cluster label (right).

# Cluster the rows, then stack the rows of cluster 0, 1 and 2 in that order.
centers, _ = kmeans(dataMatrix, 3)
cluster, _ = vq(dataMatrix, centers)
dataMatrixOrdered = np.vstack(
    [dataMatrix[cluster == label, :] for label in range(3)]
)

# Both panels share the same rendering options.
heat_opts = dict(aspect='auto', cmap='hot')
f, (ax1, ax2) = subplots(ncols=2)
ax1.matshow(dataMatrix, **heat_opts)
ax2.matshow(dataMatrixOrdered, **heat_opts)
f.tight_layout();