import matplotlib.pyplot as plt
import numpy as np
import sklearn.mixture
# Seed the RNG so the K-means and GMM runs below are reproducible.
np.random.seed(42)
# Two-dimensional exon data, one sample per row.
exon_data = np.loadtxt('data/Twoexondata.txt')
# Parenthesized single-argument print works identically under Python 2 and 3.
print(exon_data.shape)
plt.scatter(exon_data[:, 0], exon_data[:, 1])
# Output from the original notebook run (kept as comments; the raw output
# lines were not valid Python):
# (3000, 2)
# <matplotlib.collections.PathCollection at 0xb4ef72c>
class Clustering(object):
    """Holds the result of one K-means run: the centroid locations, the cluster
    index assigned to each sample, and the within-cluster sum of squares."""

    def __init__(self, centroids, assignments, error):
        self.centroids = centroids      # one row per cluster centroid
        self.assignments = assignments  # per-sample cluster index
        self.error = error              # within-cluster sum of squares

    def __repr__(self):
        return "<Clustering error={0}>".format(self.error)
class KMeans(object):
    def __init__(self, num_clusters=3, num_random_inits=10, max_iters=100, error_delta=10e-5):
        """Implementation of K-means clustering.

        Clusters the data into `num_clusters` clusters. Does `num_random_inits` random
        restarts and tracks the clustering that minimizes the error (within-cluster sum
        of squares). Each random restart goes for a maximum of `max_iters` iterations
        or until the error decreases by less than `error_delta` in a single iteration.
        """
        self.num_clusters = num_clusters
        self.num_random_inits = num_random_inits
        self.max_iters = max_iters
        self.error_delta = error_delta
        self.data = None

    def cluster(self, data):
        """Run all random restarts on `data`, store the resulting clusterings sorted
        best (lowest error) first, and return self for chaining."""
        self.data = data
        # `range` (not `xrange`) so this also runs under Python 3.
        clusterings = [self._do_single_cluster(self.data) for _ in range(self.num_random_inits)]
        clusterings.sort(key=lambda c: c.error)
        self.clusterings = clusterings
        return self

    def get_all_errors(self):
        """Return an array of the errors, one for each random restart."""
        return np.array([c.error for c in self.clusterings])

    def get_best_clustering(self):
        """Get the Clustering with the smallest error."""
        return self.clusterings[0]

    def get_error_coef_variation(self):
        """Get the coefficient of variation (std / mean) of the errors. The smaller
        the result, the less sensitive the clustering quality is to starting guesses."""
        errors = self.get_all_errors()
        return np.std(errors) / np.mean(errors)

    def get_plot(self, clustering):
        """Return a figure of the data colored by `clustering`, with centroids starred."""
        fig, ax = plt.subplots()
        ax.scatter(self.data[:, 0], self.data[:, 1], c=clustering.assignments)
        x, y = clustering.centroids[:, 0], clustering.centroids[:, 1]
        ax.scatter(x, y, s=1000, marker='*', label='Centroid')
        ax.legend()
        return fig

    def get_best_plot(self):
        """Return a plot of the data colored by the best clustering."""
        return self.get_plot(self.get_best_clustering())

    def _do_single_cluster(self, data):
        """Do a single clustering from one random initialization.

        Returns a Clustering holding the final centroids, per-sample cluster
        assignments, and the within-cluster sum of squares.
        """
        num_samples = data.shape[0]
        # Start each centroid on a distinct random data point, so no cluster begins
        # empty. Fancy indexing copies, so `data` itself is never mutated.
        centroids = data[np.random.choice(num_samples, size=self.num_clusters, replace=False)]
        assignments = np.zeros(num_samples, dtype=int)
        error = float('inf')
        for _ in range(self.max_iters):
            # E-step: assign every point to its nearest centroid. Vectorized with
            # broadcasting ((n, 1, d) - (1, k, d)) instead of a Python loop over samples.
            dists = ((data[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
            assignments = dists.argmin(axis=1)
            # M-step: move each centroid to the mean of its assigned points and
            # accumulate the within-cluster sum of squares.
            new_error = 0.0
            for c in range(self.num_clusters):
                members = data[assignments == c]
                if len(members) == 0:
                    # A cluster that has emptied keeps its previous centroid rather
                    # than producing a NaN mean.
                    continue
                centroids[c, :] = members.mean(axis=0)
                new_error += ((members - centroids[c]) ** 2).sum()
            # K-means error is monotonically non-increasing across iterations.
            assert new_error <= error
            error_delta = abs(error - new_error)
            error = new_error
            # Quit early once the improvement is negligible.
            if error_delta < self.error_delta:
                break
        return Clustering(centroids, assignments, error)
def cluster_and_plot(data, num_clusters=3):
    """Run K-means on `data` with 10 random restarts, print summary statistics of
    the per-restart errors, and return a plot of the best clustering found."""
    kmeans = KMeans(num_clusters=num_clusters, num_random_inits=10, max_iters=1000)
    kmeans.cluster(data)
    errors = np.array(sorted(kmeans.get_all_errors()))
    # Single-string print calls so this works under both Python 2 and Python 3
    # (the original multi-argument print statements were Python-2-only).
    print('{0} clusters:'.format(num_clusters))
    print(' Error: {0} +/- {1}'.format(errors.mean(), errors.std()))
    print(' Coefficient of variation: {0}'.format(kmeans.get_error_coef_variation()))
    return kmeans.get_best_plot()
# Cluster the exon data with several values of K and plot the best clustering.
# The printed statistics from the original notebook run are preserved below as
# comments (the raw output lines were not valid Python).
_ = cluster_and_plot(exon_data, num_clusters=3)
# 3 clusters: Error: 1525.98594799 +/- 0.133725778612 Coefficient of variation: 8.76323787831e-05
_ = cluster_and_plot(exon_data, num_clusters=5)
# 5 clusters: Error: 1006.88059977 +/- 28.1882006481 Coefficient of variation: 0.0279955743059
_ = cluster_and_plot(exon_data, num_clusters=8)
# 8 clusters: Error: 669.252603213 +/- 7.79066356633 Coefficient of variation: 0.0116408416328
# For each number of clusters I tried, I printed the mean error +/- one standard
# deviation (the sum of squared distances of all points from their centroids) as
# well as the coefficient of variation of these errors. With 3 clusters, the
# coefficient of variation is very small, which shows K-means is not very
# sensitive to the initial guesses in this case. The variation is higher for the
# examples with more clusters, but not by much.
def plot_error_ellipse(mu, sigma, std=2, ax=None, **kwargs):
    """Draw the `std`-standard-deviation error ellipse of a 2-D Gaussian.

    The unit circle is scaled by `std`, mapped through the Cholesky factor of
    the covariance `sigma`, and shifted to the mean `mu`. Plots on `ax` (or the
    current axes when `ax` is None); extra keyword args pass through to `plot`.
    """
    if ax is None:
        ax = plt.gca()
    num_points = 1000
    t = np.linspace(0, 1, num=num_points)
    # Points on a circle of radius `std`, stacked as a (2, num_points) array.
    unit_circle = np.array([np.cos(2 * np.pi * t),
                            np.sin(2 * np.pi * t)]) * std
    chol = np.linalg.cholesky(sigma)
    ellipse = chol.dot(unit_circle) + np.tile(mu, (num_points, 1)).T
    ax.plot(ellipse[0, :], ellipse[1, :], **kwargs)
# Re-seed so the GMM fits below are reproducible regardless of how many random
# draws the K-means runs above consumed.
np.random.seed(42)
def gmm_and_plot(data, n_components=3):
    """Fit a full-covariance Gaussian mixture to `data`, scatter the points
    colored by their most likely component, and draw each component's 2-sigma
    error ellipse.

    NOTE(review): this uses the pre-0.18 scikit-learn API (`GMM`, `cvtype`,
    `gmm.means`, `gmm.covars`); modern scikit-learn replaced it with
    `GaussianMixture(covariance_type='full')` and `means_`/`covariances_` —
    confirm which sklearn version this project pins before modernizing.
    """
    gmm = sklearn.mixture.GMM(n_components=n_components, cvtype='full')
    gmm.fit(data)
    plt.scatter(data[:, 0], data[:, 1], c=gmm.predict(data), alpha=0.7)
    # `range` (not `xrange`) so this also runs under Python 3.
    for c in range(gmm.n_components):
        plot_error_ellipse(gmm.means[c], gmm.covars[c], linewidth=2, color='black')
    plt.title("GMM with {0} Components".format(n_components))
# Fit and plot Gaussian mixtures with 3 and 8 components on the exon data.
gmm_and_plot(exon_data, n_components=3)
gmm_and_plot(exon_data, n_components=8)