Iris（アヤメ）データセットに対してクラスタリングを行う。このデータはラベル数が3であるとあらかじめ分かっているので、HyperKEstimatorでk=3を推定できるかを確認する。
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# Cluster the Iris data twice with k-means -- once with the known k=3 and once
# with the k reported by HyperKEstimator -- and draw each result as a 3D
# scatter plot in its own figure.

# Seed NumPy's global RNG so repeated runs give the same result (KMeans draws
# from it when no random_state is passed).
np.random.seed(5)

# Not referenced anywhere in the visible code; kept in case later cells use it.
centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

# NOTE(review): the meaning of the positional arguments (1, 5, hash function,
# 20) is defined by kHLL.kestimate.HyperKEstimator and is not visible here.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {
    'k_means_iris_3': KMeans(n_clusters=3),
    'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK()),
}

# fignum is advanced by hand (rather than via enumerate) because its value
# after the loop is reused by the ground-truth plot that follows.
fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # Ask the figure for a 3D axes instead of calling Axes3D(fig, ...):
    # newer Matplotlib releases no longer attach an Axes3D built that way to
    # the figure, so nothing would be drawn. On older releases the existing
    # mpl_toolkits.mplot3d import is what registers the '3d' projection.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
    plt.cla()

    est.fit(X)
    labels = est.labels_

    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # The w_xaxis/w_yaxis/w_zaxis aliases are gone from newer Matplotlib;
    # xaxis/yaxis/zaxis refer to the same axis objects.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')

    fignum = fignum + 1
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# NOTE(review): this cell repeats the preceding k-means clustering cell
# statement for statement; it is kept (with the same fixes) rather than
# dropped, but one of the two copies is probably redundant.
#
# It clusters the Iris data with k-means for k=3 and for the k reported by
# HyperKEstimator, drawing each result as a 3D scatter plot.

# Reseed so this run of KMeans starts from the same global RNG state.
np.random.seed(5)

# Not referenced anywhere in the visible code; kept in case later cells use it.
centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

# NOTE(review): constructor arguments are opaque from this file -- see
# kHLL.kestimate.HyperKEstimator for their meaning.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {
    'k_means_iris_3': KMeans(n_clusters=3),
    'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK()),
}

# Keep the explicit counter: the ground-truth plot further down opens
# figure number `fignum` as left by this loop.
fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # fig.add_axes(..., projection='3d') replaces Axes3D(fig, ...), which on
    # newer Matplotlib no longer adds itself to the figure.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
    plt.cla()

    est.fit(X)
    labels = est.labels_

    # Builtin float instead of np.float (removed in NumPy 1.24).
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # xaxis/yaxis/zaxis instead of the removed w_xaxis/w_yaxis/w_zaxis.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')

    fignum = fignum + 1
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
# Plot the true Iris species labels as a 3D scatter so they can be compared
# visually with the k-means figures drawn earlier.

np.random.seed(5)

# Not referenced anywhere in the visible code; kept in case later cells use it.
centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Plot the ground truth.
# `fignum` is not set in this cell: it carries over from the clustering loop
# above, so this figure gets the next free figure number.
fig = plt.figure(fignum, figsize=(9, 5))
plt.clf()
# fig.add_axes(..., projection='3d') replaces Axes3D(fig, ...), which on newer
# Matplotlib no longer adds itself to the figure.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
plt.cla()

# Write each species name at the mean position of its samples, shifted by 1.5
# along the sepal-length axis.
for name, label in [('Setosa', 0),
                    ('Versicolour', 1),
                    ('Virginica', 2)]:
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))

# Reorder the labels to have colors matching the cluster results.
# Builtin float replaces np.float, which was removed in NumPy 1.24.
y = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)

# xaxis/yaxis/zaxis instead of the removed w_xaxis/w_yaxis/w_zaxis aliases.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')

plt.show()
DPGMMはディリクレ過程を用いて、クラスタ数も含めて推定する手法である。このDPGMMと、HyperKEstimatorでkを推定してからk-meansを実行する方法とで、実行時間を比較した。
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import DPGMM
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# Time DPGMM fitting against the HyperKEstimator + k-means pipeline on the
# Iris data, for 1 .. try_count - 1 repeated fits, and plot both curves.

np.random.seed(5)

# Not referenced anywhere in the visible code; kept in case later cells use it.
centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

# NOTE(review): DPGMM was removed from scikit-learn in 0.20; on newer releases
# the import fails and BayesianGaussianMixture (with
# weight_concentration_prior_type='dirichlet_process') is the replacement.
# NOTE(review): this dict is not used by the timing loop below, and it reads
# `kestimator` as left behind by an earlier cell.
estimators = {
    'dpgmm': DPGMM(),
    'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK()),
}

## DPGMM
# Despite the name, try_count is an exclusive upper bound: the loop measures
# batches of 1, 2, ..., try_count - 1 fits.
try_count = 5
dpgmm_elapsed_times = []
hyper_kmeans_elapsed_times = []

# range instead of xrange so the cell also runs on Python 3; materialised as
# a list so it can be passed straight to plot().
x = list(range(1, try_count))
for i in x:
    # Wall-clock time for i consecutive DPGMM fits.
    start = time.time()
    for j in range(i):
        dpgmm_model = DPGMM()
        dpgmm_model.fit(X)
    dpgmm_elapsed_times.append(time.time() - start)

    # Wall-clock time for i consecutive "estimate k, then k-means" runs.
    start = time.time()
    for j in range(i):
        kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
        kestimator.train(X)
        # Cluster with the k that was just estimated. The original hard-coded
        # n_clusters=3 here, which threw away the estimate it had just paid
        # for and did not match how the estimator is used elsewhere.
        kmeans_model = KMeans(n_clusters=kestimator.getK())
        kmeans_model.fit(X)
    hyper_kmeans_elapsed_times.append(time.time() - start)

# Red: DPGMM, green: HyperKEstimator + k-means.
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(x, dpgmm_elapsed_times, 'r')
axes.plot(x, hyper_kmeans_elapsed_times, 'g')
axes.set_xlabel('x')
axes.set_ylabel('y')
axes.set_title('Comparison between DPGMM and HyperKMeans')