"""Demo: sklearn's ward_tree on two Gaussian blobs with k-NN connectivity.

Shows that a sparse k-nearest-neighbors connectivity graph (k=5) leaves the
two blobs as 2 disconnected components, making ward_tree warn and complete
the graph internally, while a dense graph (k = npoints) yields a single
connected component and no warning.

NOTE(review): this file was a raw IPython-notebook dump. The %pylab magic
and pasted cell outputs were not valid Python; they are preserved below as
comments. Python 2 print statements were modernized.
"""
import numpy as np
# pylab is deprecated as a top-level import; use matplotlib.pyplot directly.
import matplotlib.pyplot as pl
# ward_tree lived in the private module sklearn.cluster.hierarchical, which
# was removed in modern scikit-learn; the public import path is used instead.
from sklearn.cluster import ward_tree
from sklearn.neighbors import kneighbors_graph

dim = 10        # dimensionality of each point
npoints = 100   # number of points per blob

# Very simple simulation: two well-separated isotropic Gaussian blobs.
mean1 = -5 * np.ones(dim)
cov1 = np.diag(np.ones(dim))
mean2 = 5 * np.ones(dim)
cov2 = cov1
X1 = np.random.multivariate_normal(mean1, cov1, npoints)
X2 = np.random.multivariate_normal(mean2, cov2, npoints)
X = np.vstack((X1, X2))

# Sparse connectivity: with only 5 neighbors, each blob is its own connected
# component, so ward_tree emits a warning and completes the graph, e.g.:
#   UserWarning: the number of connected components of the connectivity
#   matrix is 2 > 1. Completing it to avoid stopping the tree early.
connectivity = kneighbors_graph(X, n_neighbors=5)
pl.figure(figsize=(5, 5))
pl.imshow(connectivity.todense(), interpolation='nearest')
out = ward_tree(X, connectivity=connectivity)

# Dense connectivity (k = npoints): one connected component, no warning.
connectivity_1 = kneighbors_graph(X, n_neighbors=npoints)
pl.figure(figsize=(5, 5))
pl.imshow(connectivity_1.todense(), interpolation='nearest')
out = ward_tree(X, connectivity=connectivity_1)

# First ten merges of the hierarchical tree (children array); the original
# notebook printed this twice — once is enough. Example output:
#   [[195 169] [164 113] [ 91  24] [ 20  11] [158 133]
#    [197 185] [120 116] [162 144] [193 125] [ 99  15]]
print(out[0][:10])