import matplotlib.pyplot as plt
import numpy as np
import sklearn.mixture
# Seed the RNG so the K-means and GMM runs below are reproducible.
np.random.seed(42)
# Two-dimensional exon data, one sample per row.
exon_data = np.loadtxt('data/Twoexondata.txt')
# Parenthesized single-argument print works identically under Python 2 and 3.
print(exon_data.shape)
plt.scatter(exon_data[:, 0], exon_data[:, 1])
# Output from the original notebook run (kept as comments; the raw output
# lines were not valid Python):
# (3000, 2)
# <matplotlib.collections.PathCollection at 0xb4ef72c>
class Clustering(object):
    """Holds the result of one K-means run: the centroid locations, the cluster
    index assigned to each sample, and the within-cluster sum of squares."""

    def __init__(self, centroids, assignments, error):
        self.centroids = centroids      # one row per cluster centroid
        self.assignments = assignments  # per-sample cluster index
        self.error = error              # within-cluster sum of squares

    def __repr__(self):
        return "<Clustering error={0}>".format(self.error)
class KMeans(object):
    def __init__(self, num_clusters=3, num_random_inits=10, max_iters=100, error_delta=10e-5):
        """Implementation of K-means clustering.

        Clusters the data into `num_clusters` clusters. Does `num_random_inits` random
        restarts and tracks the clustering that minimizes the error (within-cluster sum
        of squares). Each random restart goes for a maximum of `max_iters` iterations
        or until the error decreases by less than `error_delta` in a single iteration.
        """
        self.num_clusters = num_clusters
        self.num_random_inits = num_random_inits
        self.max_iters = max_iters
        self.error_delta = error_delta
        self.data = None

    def cluster(self, data):
        """Run all random restarts on `data`, store the resulting clusterings sorted
        best (lowest error) first, and return self for chaining."""
        self.data = data
        # `range` (not `xrange`) so this also runs under Python 3.
        clusterings = [self._do_single_cluster(self.data) for _ in range(self.num_random_inits)]
        clusterings.sort(key=lambda c: c.error)
        self.clusterings = clusterings
        return self

    def get_all_errors(self):
        """Return an array of the errors, one for each random restart."""
        return np.array([c.error for c in self.clusterings])

    def get_best_clustering(self):
        """Get the Clustering with the smallest error."""
        return self.clusterings[0]

    def get_error_coef_variation(self):
        """Get the coefficient of variation (std / mean) of the errors. The smaller
        the result, the less sensitive the clustering quality is to starting guesses."""
        errors = self.get_all_errors()
        return np.std(errors) / np.mean(errors)

    def get_plot(self, clustering):
        """Return a figure of the data colored by `clustering`, with centroids starred."""
        fig, ax = plt.subplots()
        ax.scatter(self.data[:, 0], self.data[:, 1], c=clustering.assignments)
        x, y = clustering.centroids[:, 0], clustering.centroids[:, 1]
        ax.scatter(x, y, s=1000, marker='*', label='Centroid')
        ax.legend()
        return fig

    def get_best_plot(self):
        """Return a plot of the data colored by the best clustering."""
        return self.get_plot(self.get_best_clustering())

    def _do_single_cluster(self, data):
        """Do a single clustering from one random initialization.

        Returns a Clustering holding the final centroids, per-sample cluster
        assignments, and the within-cluster sum of squares.
        """
        num_samples = data.shape[0]
        # Start each centroid on a distinct random data point, so no cluster begins
        # empty. Fancy indexing copies, so `data` itself is never mutated.
        centroids = data[np.random.choice(num_samples, size=self.num_clusters, replace=False)]
        assignments = np.zeros(num_samples, dtype=int)
        error = float('inf')
        for _ in range(self.max_iters):
            # E-step: assign every point to its nearest centroid. Vectorized with
            # broadcasting ((n, 1, d) - (1, k, d)) instead of a Python loop over samples.
            dists = ((data[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
            assignments = dists.argmin(axis=1)
            # M-step: move each centroid to the mean of its assigned points and
            # accumulate the within-cluster sum of squares.
            new_error = 0.0
            for c in range(self.num_clusters):
                members = data[assignments == c]
                if len(members) == 0:
                    # A cluster that has emptied keeps its previous centroid rather
                    # than producing a NaN mean.
                    continue
                centroids[c, :] = members.mean(axis=0)
                new_error += ((members - centroids[c]) ** 2).sum()
            # K-means error is monotonically non-increasing across iterations.
            assert new_error <= error
            error_delta = abs(error - new_error)
            error = new_error
            # Quit early once the improvement is negligible.
            if error_delta < self.error_delta:
                break
        return Clustering(centroids, assignments, error)
def cluster_and_plot(data, num_clusters=3):
    """Run K-means on `data` with 10 random restarts, print summary statistics of
    the per-restart errors, and return a plot of the best clustering found."""
    kmeans = KMeans(num_clusters=num_clusters, num_random_inits=10, max_iters=1000)
    kmeans.cluster(data)
    errors = np.array(sorted(kmeans.get_all_errors()))
    # Single-string print calls so this works under both Python 2 and Python 3
    # (the original multi-argument print statements were Python-2-only).
    print('{0} clusters:'.format(num_clusters))
    print(' Error: {0} +/- {1}'.format(errors.mean(), errors.std()))
    print(' Coefficient of variation: {0}'.format(kmeans.get_error_coef_variation()))
    return kmeans.get_best_plot()
# Cluster the exon data with several values of K and plot the best clustering.
# The printed statistics from the original notebook run are preserved below as
# comments (the raw output lines were not valid Python).
_ = cluster_and_plot(exon_data, num_clusters=3)
# 3 clusters: Error: 1525.98594799 +/- 0.133725778612 Coefficient of variation: 8.76323787831e-05
_ = cluster_and_plot(exon_data, num_clusters=5)
# 5 clusters: Error: 1006.88059977 +/- 28.1882006481 Coefficient of variation: 0.0279955743059
_ = cluster_and_plot(exon_data, num_clusters=8)
# 8 clusters: Error: 669.252603213 +/- 7.79066356633 Coefficient of variation: 0.0116408416328
# For each number of clusters I tried, I printed the mean error +/- one standard
# deviation (the sum of squared distances of all points from their centroids) as
# well as the coefficient of variation of these errors. With 3 clusters, the
# coefficient of variation is very small, which shows K-means is not very
# sensitive to the initial guesses in this case. The variation is higher for the
# examples with more clusters, but not by much.
def plot_error_ellipse(mu, sigma, std=2, ax=None, **kwargs):
    """Draw the `std`-standard-deviation error ellipse of a 2-D Gaussian.

    The unit circle is scaled by `std`, mapped through the Cholesky factor of
    the covariance `sigma`, and shifted to the mean `mu`. Plots on `ax` (or the
    current axes when `ax` is None); extra keyword args pass through to `plot`.
    """
    if ax is None:
        ax = plt.gca()
    num_points = 1000
    t = np.linspace(0, 1, num=num_points)
    # Points on a circle of radius `std`, stacked as a (2, num_points) array.
    unit_circle = np.array([np.cos(2 * np.pi * t),
                            np.sin(2 * np.pi * t)]) * std
    chol = np.linalg.cholesky(sigma)
    ellipse = chol.dot(unit_circle) + np.tile(mu, (num_points, 1)).T
    ax.plot(ellipse[0, :], ellipse[1, :], **kwargs)
# Re-seed so the GMM fits below are reproducible regardless of how many random
# draws the K-means runs above consumed.
np.random.seed(42)
def gmm_and_plot(data, n_components=3):
    """Fit a full-covariance Gaussian mixture to `data`, scatter the points
    colored by their most likely component, and draw each component's 2-sigma
    error ellipse.

    NOTE(review): this uses the pre-0.18 scikit-learn API (`GMM`, `cvtype`,
    `gmm.means`, `gmm.covars`); modern scikit-learn replaced it with
    `GaussianMixture(covariance_type='full')` and `means_`/`covariances_` —
    confirm which sklearn version this project pins before modernizing.
    """
    gmm = sklearn.mixture.GMM(n_components=n_components, cvtype='full')
    gmm.fit(data)
    plt.scatter(data[:, 0], data[:, 1], c=gmm.predict(data), alpha=0.7)
    # `range` (not `xrange`) so this also runs under Python 3.
    for c in range(gmm.n_components):
        plot_error_ellipse(gmm.means[c], gmm.covars[c], linewidth=2, color='black')
    plt.title("GMM with {0} Components".format(n_components))
# Fit and plot Gaussian mixtures with 3 and 8 components on the exon data.
gmm_and_plot(exon_data, n_components=3)
gmm_and_plot(exon_data, n_components=8)