from IPython.core.display import Image
Image(filename='Mor Consulting CMYK.jpg')

Clustering can be used in two ways:

1. As a preprocessing step prior to some other data analysis technique
2. As an exploratory data analysis technique in its own right

Typical applications include:

- Data summarisation
- Recommendation systems - collaborative filtering
- Customer segmentation
- Document clustering - topic modelling
- Biological data analysis - gene networks
- Social network analysis

%pylab inline
import numpy as np
from sklearn.datasets import make_blobs, make_moons

# Generate sample data: three isotropic blobs
np.random.seed(0)
centres = [[1, 1], [-0.5, 0], [1, -1]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.3)

figure(figsize=(10, 10))
colors = ['r', 'b', 'g']
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

##############################################################################
# Compute clustering with 3 clusters
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
# Match each fitted centre to the nearest true centre so the colours line up
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    # Mark points assigned to this cluster that came from a different true cluster
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')

# Generate sample data: three elongated (anisotropic) blobs
np.random.seed(0)
centres = [[1, 0.75], [1, -0.75], [0, 0]]
# make_blobs only draws isotropic clusters, so sample the anisotropic noise
# directly: standard deviation 0.6 along x, 0.1 along y
X0 = centres[0] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))
X1 = centres[1] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))
X2 = centres[2] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))
X = np.concatenate((X0, X1, X2))
labels_true = np.repeat([0, 1, 2], 300)

figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')

##############################################################################
# Compute clustering with 3 clusters
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
axis('equal')
title('KMeans 3')
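The black crosses mark the misassigned points. To put a single number on the agreement between the k-means labels and the true labels, one option (a sketch, not part of the original notebook) is the adjusted Rand index from sklearn.metrics: it is 1.0 for perfect agreement and close to 0.0 for a random labelling.

from sklearn.metrics import adjusted_rand_score

# Compare the fitted labels against the generating labels
print(adjusted_rand_score(labels_true, k_means_3_labels))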
# Generate sample data: the same three blobs embedded in 3-d
np.random.seed(0)
centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)
colors = ['r', 'b', 'g']

figure(figsize=(20, 6.666))
subplot(1, 3, 1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
subplot(1, 3, 2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
axis('equal')
subplot(1, 3, 3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
axis('equal')

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')

# Generate sample data: the same three clusters plus seven irrelevant dimensions
np.random.seed(0)
centres = [[1, 1, 0, 0, 0, 0, 0, 0, 0],
           [-0.5, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, -1, 0, 0, 0, 0, 0, 0, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result (first two dimensions only)
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')
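Here only the first two of the nine dimensions carry signal, so plotting coordinates 0 and 1 is enough. When the informative directions are not known in advance, a common trick (a sketch, not in the original notebook, using sklearn.decomposition.PCA) is to project onto the top principal components before plotting:

from sklearn.decomposition import PCA

# Project the 9-d points onto their two directions of largest variance
X_2d = PCA(n_components=2).fit_transform(X)
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    scatter(X_2d[labels_true == k, 0], X_2d[labels_true == k, 1], c=col, marker='o', s=20)
title('PCA projection of the 9-d data')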
# Generate sample data: three blobs plus a noisy, irrelevant third dimension
np.random.seed(0)
centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)
# make_blobs only draws isotropic clusters; every centre sits at 0 in the third
# dimension, so rescale that coordinate to give it a standard deviation of 1.5
X[:, 2] *= 1.5 / 0.4
colors = ['r', 'b', 'g']

figure(figsize=(12, 4))
subplot(1, 3, 1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
subplot(1, 3, 2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
axis('equal')
subplot(1, 3, 3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
axis('equal')

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')

# k-means cannot separate the two moons: it assigns each point to the nearest
# centroid, so the boundary between the two clusters is a straight line
X, true_labels = make_moons(n_samples=1000, noise=.05)
scatter(X[:, 0], X[:, 1], marker='o', s=20)
axis('equal')

k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_

for k, col in zip(range(2), colors):
    my_members = k_means_2_labels == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_2_cluster_centers[k]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)

# Generate sample data in 7 dimensions
np.random.seed(1)
centres = [[1, -1, 1.1, 1, 0, 2, 1],
           [1, -1, 1, 2, 0, 0, 0],
           [1, -1, 1.5, 0, 0, 0, 0]]
X, labels_true = make_blobs(n_samples=300, centers=centres, cluster_std=0.4)

# Run 1: k-means++ initialisation, best of 10 restarts
k_means_3a = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3a.fit(X)
k_means_3a_labels = k_means_3a.labels_
k_means_3a_cluster_centres = k_means_3a.cluster_centers_

# Run 2: a single random initialisation, stopped after one iteration
k_means_3b = KMeans(init='random', n_clusters=3, n_init=1, max_iter=1)
k_means_3b.fit(X)
k_means_3b_labels = k_means_3b.labels_
k_means_3b_cluster_centres = k_means_3b.cluster_centers_

distance_a = euclidean_distances(k_means_3a_cluster_centres, centres, squared=True)
order_a = distance_a.argmin(axis=0)
distance_b = euclidean_distances(k_means_3b_cluster_centres, centres, squared=True)
order_b = distance_b.argmin(axis=0)

# Count correct and incorrect assignments for each run
figure(figsize=(10, 4))
wrong_a = 0
correct_a = 0
wrong_b = 0
correct_b = 0
for k in range(3):
    wrong_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true != k))
    correct_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true == k))
    wrong_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true != k))
    correct_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true == k))

subplot(1, 2, 1)
bar(range(2), [wrong_a, correct_a])
xticks(range(2), ('Wrong', 'Correct'))
title('Run 1')
subplot(1, 2, 2)
bar(range(2), [wrong_b, correct_b])
xticks(range(2), ('Wrong', 'Correct'))
title('Run 2')

# Inertia (the within-cluster sum of squares) is lower for the better run
bar(range(2), [k_means_3a.inertia_, k_means_3b.inertia_])
xticks(range(2), ('Run 1', 'Run 2'))
title('Inertia')
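This is exactly what n_init automates: fit from several initialisations and keep the solution with the lowest inertia. As a sketch (not from the original notebook), the variability across single random initialisations can be made explicit by collecting the inertia of several one-shot runs:

# Fit ten single-shot k-means runs and collect their final inertia
inertias = []
for seed in range(10):
    km = KMeans(init='random', n_clusters=3, n_init=1, random_state=seed)
    km.fit(X)
    inertias.append(km.inertia_)
print('best: %.1f  worst: %.1f' % (min(inertias), max(inertias)))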
# Generate sample data
np.random.seed(1)
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.5)

# Train three different models
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_

k_means_4 = KMeans(init='k-means++', n_clusters=4, n_init=10)
k_means_4.fit(X)
k_means_4_labels = k_means_4.labels_
k_means_4_cluster_centers = k_means_4.cluster_centers_

# Inertia always decreases as k grows, so it cannot choose k by itself
figure(figsize=(10, 5))
bar(range(3), [k_means_2.inertia_, k_means_3.inertia_, k_means_4.inertia_])
xticks(range(3), ('k=2', 'k=3', 'k=4'))
title('Inertia')

from sklearn import metrics

# Determine the silhouette scores
k_means_2_silhouette_score = metrics.silhouette_score(X, k_means_2_labels, metric='euclidean')
k_means_3_silhouette_score = metrics.silhouette_score(X, k_means_3_labels, metric='euclidean')
k_means_4_silhouette_score = metrics.silhouette_score(X, k_means_4_labels, metric='euclidean')

figure(figsize=(10, 5))
bar(range(3), [k_means_2_silhouette_score, k_means_3_silhouette_score, k_means_4_silhouette_score])
xticks(range(3), ('k=2', 'k=3', 'k=4'))
title('Silhouette Coefficient')

Notes on k-means:

- Good if the data are well described by centroids
- Be careful if the data have a different scale in different dimensions
- Remove irrelevant dimensions if possible
- Measuring similarity in high dimensions can get difficult
- Will not work well for non-isotropic data
- Use inertia (SSE) to choose between different clustering results with the same number of clusters
- Use the silhouette coefficient to choose the number of clusters

# Calculate the squared distance between each pair of points
distance = euclidean_distances(X, X, squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance / sig)
figure(figsize=(10, 10))
imshow(similarity)

# A good clustering shows up as bright blocks on the diagonal once the rows
# and columns are reordered according to the cluster labels
figure(figsize=(12, 4))
ind_2 = np.argsort(k_means_2_labels)
subplot(1, 3, 1)
imshow(similarity[ind_2].T[ind_2])
title('K=2 \n' + str(k_means_2_silhouette_score))
ind_3 = np.argsort(k_means_3_labels)
subplot(1, 3, 2)
imshow(similarity[ind_3].T[ind_3])
title('K=3 \n' + str(k_means_3_silhouette_score))
ind_4 = np.argsort(k_means_4_labels)
subplot(1, 3, 3)
imshow(similarity[ind_4].T[ind_4])
title('K=4 \n' + str(k_means_4_silhouette_score))

Spectral clustering:

- Finds clusters by looking for connectivity between cluster members
- Not restricted to similarity based on Euclidean distance
- Works for non-isotropic data
- Provides a way to estimate the number of clusters

scikit-learn ships a ready-made implementation, sketched below; after that we construct the similarity graph by hand to see how the method works.
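A minimal sketch (not part of the original notebook) using sklearn.cluster.SpectralClustering with a nearest-neighbour affinity; it recovers the two-moons structure that defeated k-means earlier:

from sklearn.cluster import SpectralClustering

# Cluster the two moons through graph connectivity rather than centroids
X_moons, _ = make_moons(n_samples=1000, noise=.05)
spectral = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=10)
moon_labels = spectral.fit_predict(X_moons)
figure(figsize=(10, 10))
scatter(X_moons[:, 0], X_moons[:, 1], c=moon_labels, marker='o', s=20)
axis('equal')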
################################################################################
# Construct the similarity graph

# Calculate the squared distance between each pair of points
distance = euclidean_distances(X, X, squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance / sig)

# Calculate the adjacency matrix: zero the diagonal (no self-loops) ...
A = (np.ones(similarity.shape) - np.eye(similarity.shape[0])) * similarity
# ... and sparsify by only keeping the n most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1]  # reverse order, so s[0] is the largest similarity
    # The notebook breaks off mid-statement here; zeroing everything below the
    # num_neighbours-th largest value is the natural completion
    A[i][A[i] < s[num_neighbours - 1]] = 0
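The notebook ends abruptly at this point. A typical continuation (an assumption, not recovered text) is unnormalised spectral clustering: symmetrise A, form the graph Laplacian, embed each point using the eigenvectors belonging to the smallest eigenvalues, and run k-means in that embedding. The eigenvalue spectrum also gives the promised estimate of the number of clusters, via the number of near-zero eigenvalues:

# Hypothetical continuation: unnormalised spectral clustering on A
A = np.maximum(A, A.T)        # symmetrise: keep an edge if either point chose the other
D = np.diag(A.sum(axis=1))    # degree matrix
L = D - A                     # unnormalised graph Laplacian
eigenvalues, eigenvectors = np.linalg.eigh(L)  # eigh: L is symmetric, values ascending

# The number of near-zero eigenvalues estimates the number of well-connected
# components, i.e. the number of clusters
print(eigenvalues[:5])

n_clusters = 3
embedding = eigenvectors[:, :n_clusters]  # eigenvectors of the smallest eigenvalues
spectral_labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(embedding)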