from __future__ import division import pandas as pd import numpy as np import scipy as sp import scipy.sparse as ss import matplotlib.pyplot as plt import pylab as pl from sklearn.decomposition import PCA from sklearn.cluster import KMeans from sklearn.cluster import Ward from sklearn.preprocessing import Binarizer %matplotlib inline # Dataset imported from R using write.csv(USArrests, "/tmp/USArrests.csv", row.names=FALSE) # Each row in this dataset corresponds to one of the 50 US states. usa_df = pd.read_csv("../data/USArrests.csv") usa_df.head() usa_df.describe() collist = usa_df.columns[1:] X = usa_df[collist].values Xdiv = np.linalg.norm(X, ord=2, axis=1) usa_df[collist] = X / Xdiv[:, None] usa_df.head() usa_df.describe() X = usa_df[collist].values cosim = np.matrix(X) * np.matrix(X.T) cosim def heatmap(X, labels): """ Based on this stack overflow discussion """ """ http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor """ fig, ax = plt.subplots() heatmap = ax.pcolor(X, cmap=plt.cm.Blues, alpha=0.8) # Format fig = plt.gcf() fig.set_size_inches(8, 11) # turn off the frame ax.set_frame_on(False) # put the major ticks at the middle of each cell ax.set_yticks(np.arange(X.shape[0]) + 0.5, minor=False) ax.set_xticks(np.arange(X.shape[1]) + 0.5, minor=False) # want a more natural, table-like display ax.invert_yaxis() ax.xaxis.tick_top() # Set the labels ax.set_xticklabels(labels, minor=False) ax.set_yticklabels(labels, minor=False) # rotate the xticks plt.xticks(rotation=90) ax.grid(False) # Turn off all the ticks ax = plt.gca() for t in ax.xaxis.get_major_ticks(): t.tick1On = False t.tick2On = False for t in ax.yaxis.get_major_ticks(): t.tick1On = False t.tick2On = False heatmap(np.array(cosim), usa_df["State"].values) # PCA seems to consider each row as a component. So doing a PCA.fit(X) will decompose # our 50x4 matrix to 2x4. So we need to fit the transpose of X (4x50) and transpose the # resulting principal components 2x50. 
# NOTE(review): fitting on X.T and reading components_.T gives one 2-vector
# per state, but these are component loadings rather than projected samples.
# The conventional way to project the states is pca.fit_transform(X); confirm
# which one is intended before relying on the plot below.
pca = PCA(n_components=2)
pca.fit(X.T)
# A single pre-formatted argument prints the same under Python 2 and Python 3.
print("Explained variance: %s" % (pca.explained_variance_ratio_,))
X_pr = pca.components_.T
X_pr[0:5, :]

# Calculate cosine similarity using X_pr and plot a heat map
X_pdiv = np.linalg.norm(X_pr, ord=2, axis=1)
X_prn = X_pr / X_pdiv[:, None]
cosim_p = X_prn.dot(X_prn.T)
cosim_p

heatmap(cosim_p, usa_df["State"].values)

# Simulated data: 100 points of uniform noise on [0, 1)^2, each shifted by the
# offset of one of four randomly assigned groups.
X = np.random.rand(100, 2)
X_mean = 8 + (4 * np.random.rand(4, 2))  # offsets are uniform on [8, 12), not normal
which = np.random.choice(np.array([0, 1, 2, 3]), size=100, replace=True)
# Fancy indexing picks each point's group offset in one step.
X += X_mean[which]

# Plot the points, one colour per generating group
fig, ax = plt.subplots()
for group, colour in enumerate(('blue', 'green', 'red', 'cyan')):
    members = which == group
    ax.scatter(X[members, 0], X[members, 1], c=colour)

kmeans = KMeans(n_clusters=4, n_init=15)
kmeans.fit(X)
ypred = kmeans.predict(X)

# Print confusion matrix. Note that the matrix is not aligned because we don't know
# the correspondence between the assigned cluster and the generated cluster, but the
# matrix should show one high value per row and/or column.
# Rows are the generating group, columns the K-Means cluster id.
confusion_matrix = np.zeros((4, 4))
for actual, predicted in zip(which, ypred):
    confusion_matrix[actual, predicted] += 1
# The parenthesised single argument prints the same under Python 2 and Python 3.
print(confusion_matrix)

# Plot points with cluster centers (marked with +)
fig, ax = plt.subplots()
for group, colour in enumerate(('blue', 'green', 'red', 'cyan')):
    members = which == group
    ax.scatter(X[members, 0], X[members, 1], c=colour)
for cc in kmeans.cluster_centers_:
    ax.plot(cc[0], cc[1], marker='+', color='black', markersize=20)

# produce a connectivity matrix based on cosine similarity
norms = np.linalg.norm(X, ord=2, axis=1)
X_n = X / norms[:, None]
cosim = X_n.dot(X_n.T)
# ``binarizer`` holds the thresholded matrix: 1.0 where similarity > 0.5,
# otherwise 0.0. The connectivity must be built from it; building it from the
# raw ``cosim`` (as before) silently ignored the threshold.
binarizer = Binarizer(threshold=0.5).fit_transform(cosim)
cosim_sparse = ss.csr_matrix(binarizer)

# run the clustering
ward = Ward(n_clusters=4, connectivity=cosim_sparse)
ypred = ward.fit_predict(X)

# compute the confusion matrix for hierarchical clustering
confusion_matrix = np.zeros((4, 4))
for actual, predicted in zip(which, ypred):
    confusion_matrix[actual, predicted] += 1
confusion_matrix