%matplotlib inline
import matplotlib.pyplot as plt
# Seven sample points stored as parallel lists: a[i] is passed as the
# x-coordinate and b[i] as the y-coordinate of point i.
a = [1, 1.5, 3, 5, 3.5, 4.5, 3.5]
b = [1, 2, 4, 7, 5, 5, 4.5]
# Scatter plot of the raw points.
plt.scatter(a, b)
<matplotlib.collections.PathCollection at 0x1082f2490>
# Mean of the x-values from index 2 onward. The divisor is taken from the
# slice itself so it stays correct if points are added to or removed from a.
# NOTE(review): presumably a by-hand check of one cluster's centroid - confirm.
sum(a[2:]) / len(a[2:])
3.9
# Mean of the y-values from index 2 onward. The divisor is taken from the
# slice itself so it stays correct if points are added to or removed from b.
# NOTE(review): presumably a by-hand check of one cluster's centroid - confirm.
sum(b[2:]) / len(b[2:])
5.1
from scipy.cluster.vq import kmeans, vq
import numpy as np
# Observation matrix for clustering: seven rows, one (x, y) pair per row.
data = np.array(
    [
        [1.0, 1.0],
        [1.5, 2.0],
        [3.0, 4.0],
        [5.0, 7.0],
        [3.5, 5.0],
        [4.5, 5.0],
        [3.5, 4.5],
    ],
    dtype=np.float64,
)
data
array([[ 1. , 1. ], [ 1.5, 2. ], [ 3. , 4. ], [ 5. , 7. ], [ 3.5, 5. ], [ 4.5, 5. ], [ 3.5, 4.5]])
# Number of clusters to request. It is bound here explicitly so that a
# top-to-bottom run does not depend on a `k` left over from another cell.
# NOTE(review): 2 matches the two-cluster labels shown further down; the
# recorded outputs directly below this cell came from a different k - confirm.
k = 2
# kmeans returns the fitted centroids (one per row) and the distortion of the
# fit. SciPy may return fewer than k centroids, so use len(centroids) for the
# actual count.
centroids, distortion = kmeans(data, k)
centroids
array([[ 3.5 , 4.75], [ 5. , 7. ], [ 1.5 , 2. ], [ 1. , 1. ], [ 3. , 4. ], [ 4.5 , 5. ]])
distortion
0.071428571428571425
# Assign every row of data to its nearest centroid: idx[i] is the row index
# into centroids for observation i, and distort2[i] is the distance from
# observation i to that centroid.
idx, distort2 = vq(data, centroids)
idx
array([3, 2, 4, 1, 0, 5, 0])
distort2
array([ 0. , 0. , 0. , 0. , 0.25, 0. , 0.25])
# Mean distance from each observation to its assigned centroid. The divisor
# comes from distort2 itself rather than a hard-coded observation count.
sum(distort2) / len(distort2)
0.50370721518339312
# Draw each cluster's points as its own series so matplotlib gives every
# cluster a different colour. Labels in idx are row indices into centroids,
# so iterate over the centroids actually present rather than the requested k
# (kmeans may return fewer than k centroids).
for i in range(len(centroids)):
    members = idx == i  # boolean mask selecting the rows assigned to cluster i
    plt.plot(data[members, 0], data[members, 1], 'o')
# Overlay the centroids once, as larger green squares.
plt.plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
# Fixed axis limits that leave a margin around the sample points.
plt.xlim(0, 6)
plt.ylim(0, 8)
(0, 8)
idx
array([0, 0, 1, 1, 1, 1, 1])
idx==1
array([False, False, True, True, True, True, True], dtype=bool)
# ndarray copy of the x-values so they can be indexed with a boolean mask
# such as idx==1 (a plain Python list does not support mask indexing).
abetter = np.array(a)
abetter
array([ 1. , 1.5, 3. , 5. , 3.5, 4.5, 3.5])
abetter[idx==1]
array([ 3. , 5. , 3.5, 4.5, 3.5])
# Fit k-means once for every cluster count from 1 up to the number of
# observations and record the distortion that kmeans reports for each fit.
# Note that this rebinds the module-level names k, centroids and distortion.
kvals = []
dists = []
for k in range(1, len(data) + 1):
    centroids, distortion = kmeans(data, k)
    kvals.append(k)
    dists.append(distortion)
# Distortion as a function of the requested number of clusters.
plt.plot(kvals, dists)
[<matplotlib.lines.Line2D at 0x10954d6d0>]