#!/usr/bin/env python
# coding: utf-8

# # Chapter 10 - Unsupervised Learning

# - [Lab 1: Principal Component Analysis](#Lab-1:-Principal-Component-Analysis)
# - [Lab 2: K-Means Clustering](#Lab-2:-Clustering)
# - [Lab 2: Hierarchical Clustering](#10.5.3-Hierarchical-Clustering)
# - [Lab 3: NCI60 Data Example](#Lab-3:-NCI60-Data-Example)

# In[1]:

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')

# ## Lab 1: Principal Component Analysis

# In[2]:

# In R, I exported the dataset to a csv file. It is part of the base R distribution.
df = pd.read_csv('Data/USArrests.csv', index_col=0)
df.info()

# In[3]:

df.mean()

# In[4]:

df.var()

# In[5]:

X = pd.DataFrame(scale(df), index=df.index, columns=df.columns)

# In[6]:

# The loading vectors
pca_loadings = pd.DataFrame(PCA().fit(X).components_.T, index=df.columns,
                            columns=['V1', 'V2', 'V3', 'V4'])
pca_loadings

# In[7]:

# Fit the PCA model and transform X to get the principal components
pca = PCA()
df_plot = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2', 'PC3', 'PC4'], index=X.index)
df_plot

# In[8]:

fig, ax1 = plt.subplots(figsize=(9,7))

ax1.set_xlim(-3.5,3.5)
ax1.set_ylim(-3.5,3.5)

# Plot Principal Components 1 and 2. The sign of PC2 is flipped to match the
# orientation of the figure in the book (loadings are only determined up to sign).
for i in df_plot.index:
    ax1.annotate(i, (df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')

# Plot reference lines
ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')

# Plot Principal Component loading vectors, using a second pair of axes.
ax2 = ax1.twinx().twiny()

ax2.set_ylim(-1,1)
ax2.set_xlim(-1,1)
ax2.tick_params(axis='y', colors='orange')
ax2.set_xlabel('Principal Component loading vectors', color='orange')

# Plot labels for vectors. Variable 'a' is a small offset parameter to separate arrow tip and text.
a = 1.07
for i in pca_loadings[['V1', 'V2']].index:
    ax2.annotate(i, (pca_loadings.V1.loc[i]*a, -pca_loadings.V2.loc[i]*a), color='orange')

# Plot vectors
for k in range(pca_loadings.shape[0]):
    ax2.arrow(0, 0, pca_loadings.V1.iloc[k], -pca_loadings.V2.iloc[k])

# In[9]:

# Standard deviation of the four principal components
np.sqrt(pca.explained_variance_)

# In[10]:

pca.explained_variance_

# In[11]:

pca.explained_variance_ratio_

# In[12]:

plt.figure(figsize=(7,5))

plt.plot([1,2,3,4], pca.explained_variance_ratio_, '-o', label='Individual component')
plt.plot([1,2,3,4], np.cumsum(pca.explained_variance_ratio_), '-s', label='Cumulative')

plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.75,4.25)
plt.ylim(0,1.05)
plt.xticks([1,2,3,4])
plt.legend(loc=2);

# ## Lab 2: Clustering

# ### 10.5.1 K-Means Clustering

# In[13]:

# Generate data
np.random.seed(2)
X = np.random.standard_normal((50,2))
X[:25,0] = X[:25,0]+3
X[:25,1] = X[:25,1]-4

# #### K = 2

# In[14]:

km1 = KMeans(n_clusters=2, n_init=20)
km1.fit(X)

# In[15]:

km1.labels_

# See plot for K=2 below.
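
# The data were generated with the first 25 observations shifted, so we can check how well the
# K=2 solution recovers the two generating groups. This cell is a small added sketch (not part of
# the original lab); it assumes `X` and `km1` from the cells above are still in scope.

# In[ ]:

# Cross-tabulate the K-means labels against the true generating group
# (rows 0-24 were shifted by (+3, -4), rows 25-49 are standard normal).
true_group = np.repeat(['shifted', 'standard'], 25)
print(pd.crosstab(pd.Series(km1.labels_, name='cluster'),
                  pd.Series(true_group, name='group')))

# The total within-cluster sum of squares that scikit-learn reports as `inertia_`
# can be recomputed by hand from the fitted cluster centers.
wss = sum(((X[km1.labels_ == k] - center)**2).sum()
          for k, center in enumerate(km1.cluster_centers_))
print(wss, km1.inertia_)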

# #### K = 3

# In[16]:

np.random.seed(4)
km2 = KMeans(n_clusters=3, n_init=20)
km2.fit(X)

# In[17]:

pd.Series(km2.labels_).value_counts()

# In[18]:

km2.cluster_centers_

# In[19]:

km2.labels_

# In[20]:

# Sum of squared distances of samples to their closest cluster center.
km2.inertia_

# In[21]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,5))

ax1.scatter(X[:,0], X[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism)
ax1.set_title('K-Means Clustering Results with K=2')
ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)

ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism)
ax2.set_title('K-Means Clustering Results with K=3')
ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2);

# ### 10.5.3 Hierarchical Clustering

# #### scipy

# In[22]:

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15,18))

# Complete, average and single linkage on the same data, one dendrogram per axis.
for linkage, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
                       [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');

# ## Lab 3: NCI60 Data Example

# ### § 10.6.1 PCA

# In[23]:

# In R, I exported the two elements of this ISLR dataset to csv files.
# There is one file for the features and another file for the classes/types.
df2 = pd.read_csv('Data/NCI60_X.csv').drop('Unnamed: 0', axis=1)
df2.columns = np.arange(df2.columns.size)
df2.info()

# In[24]:

X = pd.DataFrame(scale(df2))
X.shape

# In[25]:

y = pd.read_csv('Data/NCI60_y.csv', usecols=[1], skiprows=1, names=['type'])
y.shape

# In[26]:

y.type.value_counts()

# In[27]:

# Fit the PCA model and transform X to get the principal components
pca2 = PCA()
df2_plot = pd.DataFrame(pca2.fit_transform(X))
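
# As a quick sanity check (an added sketch, not part of the original lab): the scores returned by
# `fit_transform` are simply the centred data projected onto the loading vectors, so recomputing
# them by hand should reproduce `df2_plot`. Assumes `X`, `pca2` and `df2_plot` from the cells above.

# In[ ]:

# Project the centred data onto the loading vectors (rows of pca2.components_)
# and compare with the scores returned by fit_transform.
scores_by_hand = (np.asarray(X) - pca2.mean_) @ pca2.components_.T
print(np.allclose(scores_by_hand, df2_plot.values))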

# In[28]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))

color_idx = pd.factorize(y.type)[0]
cmap = plt.cm.hsv

# Left plot (PC2 sign flipped to match the book's figure)
ax1.scatter(df2_plot.iloc[:,0], -df2_plot.iloc[:,1], c=color_idx, cmap=cmap, alpha=0.5, s=50)
ax1.set_ylabel('Principal Component 2')

# Right plot
ax2.scatter(df2_plot.iloc[:,0], df2_plot.iloc[:,2], c=color_idx, cmap=cmap, alpha=0.5, s=50)
ax2.set_ylabel('Principal Component 3')

# Custom legend for the classes (y), since we do not create a separate scatter plot
# per class (which would have supplied its own label).
handles = []
labels = pd.factorize(y.type.unique())
norm = mpl.colors.Normalize(vmin=0.0, vmax=14.0)

for i, v in zip(labels[0], labels[1]):
    handles.append(mpl.patches.Patch(color=cmap(norm(i)), label=v, alpha=0.5))

ax2.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# xlabel for both plots
for ax in fig.axes:
    ax.set_xlabel('Principal Component 1')

# In[29]:

# Summary table in the style of R's summary() of a prcomp fit: standard deviation,
# proportion of variance explained and cumulative proportion for the first five components.
pd.DataFrame([df2_plot.iloc[:,:5].std(axis=0, ddof=0).values,
              pca2.explained_variance_ratio_[:5],
              np.cumsum(pca2.explained_variance_ratio_[:5])],
             index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],
             columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])

# In[30]:

df2_plot.iloc[:,:10].var(axis=0, ddof=0).plot(kind='bar', rot=0)
plt.ylabel('Variances');

# In[31]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

# Left plot
ax1.plot(pca2.explained_variance_ratio_, '-o')
ax1.set_ylabel('Proportion of Variance Explained')
ax1.set_ylim(ymin=-0.01)

# Right plot
ax2.plot(np.cumsum(pca2.explained_variance_ratio_), '-ro')
ax2.set_ylabel('Cumulative Proportion of Variance Explained')
ax2.set_ylim(ymax=1.05)

for ax in fig.axes:
    ax.set_xlabel('Principal Component')
    ax.set_xlim(-1,65)

# ### § 10.6.2 Clustering

# In[32]:

X = pd.DataFrame(scale(df2), index=y.type, columns=df2.columns)

# In[33]:

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,20))

for linkage, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
                       [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage, labels=X.index, orientation='right',
                         color_threshold=0, leaf_font_size=10, ax=ax)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');

# In[34]:

plt.figure(figsize=(10,20))
cut4 = hierarchy.dendrogram(hierarchy.complete(X), labels=X.index, orientation='right',
                            color_threshold=140, leaf_font_size=10)
plt.vlines(140, 0, plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed');

# ##### KMeans

# In[35]:

np.random.seed(2)
km4 = KMeans(n_clusters=4, n_init=50)
km4.fit(X)

# In[36]:

km4.labels_

# In[37]:

# Observations per KMeans cluster
pd.Series(km4.labels_).value_counts().sort_index()

# ##### Hierarchical

# In[38]:

# Observations per Hierarchical cluster
cut4b = hierarchy.dendrogram(hierarchy.complete(X), truncate_mode='lastp', p=4, show_leaf_counts=True)

# In[39]:

# Hierarchy based on Principal Components 1 to 5
plt.figure(figsize=(10,20))
pca_cluster = hierarchy.dendrogram(hierarchy.complete(df2_plot.iloc[:,:5]), labels=y.type.values,
                                   orientation='right', color_threshold=100, leaf_font_size=10)

# In[40]:

cut4c = hierarchy.dendrogram(hierarchy.complete(df2_plot), truncate_mode='lastp', p=4, show_leaf_counts=True)
# See also color coding in plot above.
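
# To mirror the comparison in the book's R lab between the hierarchical and K-means results, we can
# cut the complete-linkage tree into four clusters and cross-tabulate against the K=4 K-means labels.
# This is an added sketch (not part of the original notebook); it assumes `X` and `km4` from the cells
# above, and the cluster numbers themselves are arbitrary.

# In[ ]:

# Cut the complete-linkage dendrogram into exactly four clusters and compare with K-means.
hc4_labels = hierarchy.fcluster(hierarchy.complete(X), t=4, criterion='maxclust')
pd.crosstab(pd.Series(hc4_labels, name='Hierarchical'), pd.Series(km4.labels_, name='KMeans'))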