"""K-means clustering walkthrough on a seven-point 2-D toy data set.

Steps, in order:
1. Scatter-plot the raw points.
2. Print the mean of the last five points (a hand check of one centroid).
3. Cluster the points with SciPy's ``kmeans`` and assign each point to its
   nearest centroid with ``vq``.
4. Plot the clusters together with their centroids.
5. Plot distortion against the number of clusters (an "elbow" plot).
"""
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.vq import kmeans, vq

# ``%matplotlib inline`` is IPython-only syntax and a SyntaxError in plain
# Python. Issue the equivalent call only when an IPython shell is present.
try:
    _shell = get_ipython()  # noqa: F821 - injected by IPython at runtime
except NameError:
    _shell = None
if _shell is not None:
    _shell.run_line_magic("matplotlib", "inline")

# x and y coordinates of the seven sample points.
a = [1, 1.5, 3, 5, 3.5, 4.5, 3.5]
b = [1, 2, 4, 7, 5, 5, 4.5]

plt.figure()
plt.scatter(a, b)

# Mean of every point except the first two. The divisor comes from the slice
# length rather than a hard-coded 5, so it stays right if the data changes.
tail_a = a[2:]
tail_b = b[2:]
print("mean of a[2:]:", sum(tail_a) / len(tail_a))
print("mean of b[2:]:", sum(tail_b) / len(tail_b))

# One row per point, columns (x, y). Built from ``a`` and ``b`` so the
# coordinates are written down only once.
data = np.column_stack((a, b))
print("data:")
print(data)

# NOTE(review): the original script used ``k`` here before ever assigning it
# (NameError). 2 is chosen because the hand-computed means above split the
# points into the first two and the last five - confirm the intended value.
k = 2

# NOTE: per the SciPy docs kmeans picks its starting centroids at random, so
# results may differ between runs.
centroids, distortion = kmeans(data, k)
print("centroids:")
print(centroids)
print("distortion:", distortion)

# idx[i] is the row of ``centroids`` nearest to data[i]; distort2[i] is the
# distance between that pair.
idx, distort2 = vq(data, centroids)
print("idx:", idx)
print("distort2:", distort2)
# Average per-point distance, for comparison with ``distortion`` above.
print("mean of distort2:", distort2.mean())

plt.figure()
# Iterate over the centroids actually returned rather than ``range(k)``.
for i in range(len(centroids)):
    members = data[idx == i]
    plt.plot(members[:, 0], members[:, 1], 'o')
plt.plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
plt.xlim(0, 6)
plt.ylim(0, 8)

# Boolean-mask indexing needs an ndarray; a plain list cannot be indexed by
# ``idx == 1``.
abetter = np.array(a)
print("idx == 1:", idx == 1)
print("abetter:", abetter)
print("abetter[idx == 1]:", abetter[idx == 1])

# Elbow plot: distortion for every cluster count from 1 up to the number of
# points. A separate loop variable keeps ``k``, ``centroids`` and
# ``distortion`` from being overwritten by the sweep.
kvals = list(range(1, len(data) + 1))
dists = [kmeans(data, n_clusters)[1] for n_clusters in kvals]

plt.figure()
plt.plot(kvals, dists)

# Needed for the figures to appear when run as a plain script.
plt.show()