from __future__ import division
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as ss
import matplotlib.pyplot as plt
import pylab as pl
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import Ward
from sklearn.preprocessing import Binarizer
%matplotlib inline

# Dataset imported from R using write.csv(USArrests, "/tmp/USArrests.csv", row.names=FALSE)
# Each row in this dataset corresponds to one of the 50 US states.
usa_df = pd.read_csv("../data/USArrests.csv")
usa_df.head()

usa_df.describe()

# Scale every row to unit L2 norm so that the dot product of two rows is
# their cosine similarity. The first column is skipped (presumably the
# "State" label column used for the plot labels further down).
collist = usa_df.columns[1:]
X = usa_df[collist].values
Xdiv = np.linalg.norm(X, ord=2, axis=1)
# Xdiv[:, None] turns the per-row norms into a column so the division
# broadcasts across each row.
usa_df[collist] = X / Xdiv[:, None]
usa_df.head()

usa_df.describe()

# Pairwise cosine similarity between rows: entry (i, j) is the dot product
# of the unit-length rows i and j. Plain ndarrays are used rather than
# np.matrix, which NumPy no longer recommends.
X = usa_df[collist].values
cosim = X.dot(X.T)
cosim

def heatmap(X, labels):
    """Draw a 2-D array as a labelled, table-like heat map on a new figure.

    Based on this Stack Overflow discussion:
    http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor

    Parameters
    ----------
    X : 2-D array
        Values to colour-code; one cell is drawn per entry.
    labels : sequence
        Tick labels. The same labels are applied to both axes, so X is
        expected to be square (e.g. a pairwise similarity matrix).

    Returns
    -------
    None
        The plot is drawn on a freshly created figure.
    """
    fig, ax = plt.subplots()
    ax.pcolor(X, cmap=plt.cm.Blues, alpha=0.8)
    fig.set_size_inches(8, 11)

    # turn off the frame
    ax.set_frame_on(False)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(X.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(X.shape[1]) + 0.5, minor=False)

    # want a more natural, table-like display: first row at the top,
    # column labels above the cells
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # Set the labels
    ax.set_xticklabels(labels, minor=False)
    ax.set_yticklabels(labels, minor=False)

    # rotate the xticks
    plt.xticks(rotation=90)
    ax.grid(False)

    # Hide the tick marks themselves while keeping the labels. The tick
    # line artists are toggled directly: the Tick.tick1On/tick2On flags were
    # deprecated in Matplotlib 3.1 and assigning them no longer hides ticks.
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.tick1line.set_visible(False)
            tick.tick2line.set_visible(False)

# Heat map of the cosine similarities computed from the normalised features.
heatmap(np.asarray(cosim), usa_df["State"].values)

# Reduce the feature matrix (one row per state) to 2 principal components.
# PCA expects samples in rows and features in columns, so X is fitted as is:
# fit_transform() returns one 2-component score vector per row, whereas
# pca.components_ holds the principal axes in feature space.
pca = PCA(n_components=2)
X_pr = pca.fit_transform(X)
# Single-argument print call: same output under Python 2 and Python 3.
print("Explained variance: %s" % (pca.explained_variance_ratio_,))
X_pr[0:5, :]

# Calculate cosine similarity using X_pr and plot a heat map.
# Rows are scaled to unit L2 norm first so the dot product is the cosine.
X_pdiv = np.linalg.norm(X_pr, ord=2, axis=1)
X_prn = X_pr / X_pdiv[:, None]
cosim_p = X_prn.dot(X_prn.T)
cosim_p

heatmap(np.asarray(cosim_p), usa_df["State"].values)

# Synthetic data: 100 two-dimensional points, each drawn uniformly from the
# unit square and then shifted by the centre of a randomly chosen group.
X = np.random.rand(100, 2)
# 4 group centres; each coordinate is uniform on [8, 12)
X_mean = 8 + (4 * np.random.rand(4, 2))
# Group index (0-3) assigned to every point
which = np.random.choice(np.array([0, 1, 2, 3]), size=100, replace=True)
# X_mean[which] picks the centre row for every point in one step
X += X_mean[which]

# Plot the points, one colour per generated group
fig, ax = plt.subplots()
for label, colour in enumerate(['blue', 'green', 'red', 'cyan']):
    members = X[which == label]
    ax.scatter(members[:, 0], members[:, 1], c=colour)

# Cluster the points into 4 groups; n_init=15 restarts k-means 15 times
kmeans = KMeans(n_clusters=4, n_init=15)
kmeans.fit(X)
ypred = kmeans.predict(X)

# Print confusion matrix. Note that the matrix is not aligned because we don't know
# the correspondence between the assigned cluster and the generated cluster, but the
# matrix should show one high value per row and/or column.
confusion_matrix = np.zeros((4, 4))
# Unbuffered add: every (actual, predicted) pair increments its cell, even
# when the same pair occurs many times.
np.add.at(confusion_matrix, (which, ypred), 1)
# Parenthesised single argument: same output under Python 2 and Python 3.
print(confusion_matrix)

# Plot points with cluster centers (marked with +)
fig, ax = plt.subplots()
for label, colour in enumerate(['blue', 'green', 'red', 'cyan']):
    members = X[which == label]
    ax.scatter(members[:, 0], members[:, 1], c=colour)
for cc in kmeans.cluster_centers_:
    ax.plot(cc[0], cc[1], marker='+', color='black', markersize=20)

# produce a connectivity matrix based on cosine similarity:
# scale every point to unit L2 norm so the dot product is the cosine
norms = np.linalg.norm(X, ord=2, axis=1)
X_n = X / norms[:, None]
cosim = X_n.dot(X_n.T)
# Threshold the similarities: entries above 0.5 become 1, the rest 0.
# NOTE: despite its name, `binarizer` holds the binarized matrix, not the
# Binarizer transformer object.
binarizer = Binarizer(threshold=0.5).fit_transform(cosim)
# The thresholded matrix (not the raw similarities) is what gets passed on
# as the connectivity structure.
cosim_sparse = ss.csr_matrix(binarizer)

# run the clustering
ward = Ward(n_clusters=4, connectivity=cosim_sparse)
ypred = ward.fit_predict(X)

# compute the confusion matrix for hierarchical clustering
# (rows: generated group, columns: assigned cluster; labels are not aligned)
confusion_matrix = np.zeros((4, 4))
# Unbuffered add so repeated (actual, predicted) pairs are all counted
np.add.at(confusion_matrix, (which, ypred), 1)
confusion_matrix