Measuring Similarity And Clustering Data

Bart Baddeley - PyData 2014

Why cluster data?

Introducing K-means clustering

Some tips on using K-means

Choosing between solutions and deciding on K

Spectral clustering

Why cluster data?

1. As a preprocessing step prior to some other data analysis technique
2. As an exploratory data analysis technique in its own right

Applications

Data summarisation
Recommendation systems - collaborative filtering
Customer segmentation
Document clustering - topic modelling
Biological data analysis - gene networks
Social network analysis
In [62]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['colors', 'eig']
`%pylab --no-import-all` prevents importing * from pylab and numpy
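
Note: %pylab is discouraged in modern IPython, partly because of exactly the namespace clobbering warned about above. A minimal modern equivalent (an assumption on my part, not part of the original talk) is:

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

The bare figure(), scatter() and title() calls in the cells below rely on the %pylab namespace; with explicit imports they become plt.figure(), plt.scatter() and plt.title().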

K-Means Clustering

In [98]:
import numpy as np
from sklearn.datasets import make_blobs, make_moons  # samples_generator module was removed in later scikit-learn releases

# Generate sample data
np.random.seed(0)

centres = [[1, 1], [-0.5, 0], [1, -1]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.3)  # isotropic std of 0.3 for every cluster

figure(figsize=(10, 10))
colors = ['r','b','g']
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)

Run K-means with K set to 3

In [64]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
##############################################################################
# Compute clustering with 3 Clusters

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
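# (Not in the original talk: the objective K-means minimises is exposed as
# k_means_3.inertia_, the within-cluster sum of squared distances; it is
# useful later when comparing solutions and choosing K.)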

##############################################################################
# Plot result
# For each true centre, find the index of the nearest fitted centre, so the
# plot below colours fitted cluster order[k] the same as true cluster k
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    # Black crosses mark points assigned to this cluster that truly belong to another
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    # Large dots mark the fitted centre (matched via `order`) and the true centre
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')
[Figure: K-means with K=3 on the three blobs; misassigned points shown as black crosses, large dots mark the fitted and true centres]
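
Since the true labels are known here, the match can be quantified rather than just eyeballed. A minimal sketch, not in the original talk, using scikit-learn's adjusted Rand index:

from sklearn.metrics import adjusted_rand_score

# 1.0 means the clustering reproduces the true labels exactly (up to
# relabelling of the clusters); values near 0.0 mean no better than chance
print(adjusted_rand_score(labels_true, k_means_3_labels))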

K-means may not work well when different dimensions of the data have different scales

In [65]:
# Generate sample data
np.random.seed(0)

centres = [[1, 0.75], [1, -0.75], [0, 0]]

# Each cluster is deliberately anisotropic: std 0.6 along x but only 0.1 along y.
# Per-dimension stds are not reliably supported by make_blobs across
# scikit-learn versions, so sample directly with numpy instead.
X0 = centres[0] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))
X1 = centres[1] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))
X2 = centres[2] + np.random.normal(scale=[0.6, 0.1], size=(300, 2))

X = np.concatenate((X0, X1, X2))
labels_true = np.repeat([0, 1, 2], 300)


figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
[Figure: three anisotropic clusters (std 0.6 along x, 0.1 along y) with their true centres marked]
In [66]:
##############################################################################
# Compute clustering with 3 Clusters

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
  
axis('equal')
title('KMeans 3')
[Figure: K-means with K=3 on the anisotropic data; misassigned points shown as black crosses]
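
Standardising the features (e.g. with sklearn.preprocessing.StandardScaler) is the usual first remedy when dimensions sit on different scales, but here the stretch is within each cluster, so a model that fits a covariance per cluster is a better match. A minimal sketch, assuming a recent scikit-learn, using a Gaussian mixture in place of K-means:

from sklearn.mixture import GaussianMixture

# covariance_type='full' lets each component elongate along x, matching
# the (0.6, 0.1) per-dimension stds used to generate the data
gmm = GaussianMixture(n_components=3, covariance_type='full',
                      n_init=10, random_state=0)
gmm_labels = gmm.fit_predict(X)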

We can cope with irrelevant dimensions, so long as they do not vary too much

In [71]:
# Generate sample data
np.random.seed(0)

centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)  # isotropic std of 0.4, including the irrelevant third dimension

colors = ['r','b','g']

figure(figsize=(20, 6.666))
subplot(1,3,1)  # dimensions 0 vs 1: the two informative dimensions
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,2)  # dimension 0 vs the irrelevant dimension 2
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,3)  # the irrelevant dimension 2 vs dimension 1
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')
In [72]:
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')
[Figure: K-means with K=3 on the 3-D data, plotted in the first two dimensions; misassigned points shown as black crosses]
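
To make the "so long as they do not vary too much" caveat concrete, here is a small sketch, not from the original talk, that inflates the standard deviation of the irrelevant third dimension and scores each K-means fit against the true labels:

from sklearn.metrics import adjusted_rand_score

# As the irrelevant dimension's spread grows it dominates the Euclidean
# distances, and the adjusted Rand index should fall away from 1.0
for noise_std in [0.4, 1.0, 2.0, 4.0]:
    X_noisy = X.copy()
    X_noisy[:, 2] = np.random.normal(scale=noise_std, size=len(X))
    labels = KMeans(init='k-means++', n_clusters=3, n_init=10).fit_predict(X_noisy)
    print(noise_std, adjusted_rand_score(labels_true, labels))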