Importamos las librerías necesarias:
%matplotlib inline
# importamos las librerías básicas
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
A continuación se muestran las diferentes funciones que implementan el algoritmo:
# Assign every sample to a cluster according to its distance to the centroids.
def cluster_assignment(X, centroids):
    """Assign every sample to the cluster of its nearest centroid.

    Args:
        X: Iterable of samples (for example the rows of a 2-D NumPy array).
        centroids: Sequence of centroid vectors. The position of a centroid
            in this sequence is used as its cluster key.

    Returns:
        dict mapping a centroid index to the list of samples assigned to it.
        A centroid that attracts no sample has no entry in the dict.
    """
    clusters = {}
    for x in X:
        # (distance, index) pairs: on a distance tie the lowest index wins.
        distances = [
            (np.linalg.norm(x - centroid), index)
            for index, centroid in enumerate(centroids)
        ]
        cluster_key = min(distances)[1]
        # dict.has_key() was removed in Python 3; setdefault() behaves the
        # same on Python 2 and Python 3.
        clusters.setdefault(cluster_key, []).append(x)
    return clusters
# Recompute the centres of the new clusters.
def move_centroids(clusters):
    """Recompute each centroid as the mean of the samples in its cluster.

    Args:
        clusters: dict mapping a cluster key to a non-empty list of samples.

    Returns:
        list of centroid vectors, ordered by ascending cluster key.
    """
    # Iterate in sorted key order: dict iteration order is not guaranteed to
    # be the key order (it is insertion order on Python 3.7+), while the
    # rest of the file indexes the centroid list by cluster key.
    return [np.mean(clusters[key], axis=0) for key in sorted(clusters)]
# K-means algorithm.
def cluster_data(X, K):
    """Run K-means once from a random initialisation until it converges.

    Args:
        X: Samples to cluster (for example the rows of a 2-D NumPy array).
        K: Number of initial centroids, drawn at random from the samples.

    Returns:
        Tuple (centroids, clusters): the centroids returned by
        move_centroids() and the assignment returned by
        cluster_assignment() for the final iteration.

    Raises:
        ValueError: If K is larger than the number of samples
            (raised by random.sample()).
    """
    # Python 3's random.sample() only accepts real sequences and raises
    # TypeError for a NumPy array, so sample from a list of the rows.
    centroids = random.sample(list(X), K)
    while True:
        clusters = cluster_assignment(X, centroids)
        new_centroids = move_centroids(clusters)
        # Converged once an update step leaves the centroids unchanged.
        if np.array_equal(new_centroids, centroids):
            return (new_centroids, clusters)
        centroids = new_centroids
# Compute the cost function.
def cost_function(centroids, clusters):
    """Return the mean distance between each sample and its cluster centroid.

    Args:
        centroids: Sequence of centroid vectors, indexed by cluster key.
        clusters: dict mapping a cluster key to the list of its samples.

    Returns:
        Sum of the Euclidean distances from every sample to the centroid of
        its cluster, divided by the total number of samples.
    """
    total_distance = 0
    sample_count = 0
    for key, members in clusters.items():
        sample_count += len(members)
        center = centroids[key]
        for sample in members:
            total_distance += np.linalg.norm(sample - center)
    return total_distance / sample_count
# Optimised K-means.
def kmeans(X, K, repeat = 10):
    """Run K-means several times and keep the run with the lowest cost.

    Args:
        X: Samples to cluster.
        K: Number of clusters requested from cluster_data().
        repeat: Number of independent runs; must be at least 1.

    Returns:
        Tuple (centroids, clusters, F) of the run whose cost F, as computed
        by cost_function(), is the lowest.

    Raises:
        ValueError: If repeat is smaller than 1.
    """
    if repeat < 1:
        raise ValueError("repeat must be at least 1")
    # Start from +inf rather than a fixed threshold: with a hard-coded
    # bound, data whose cost never drops below it would leave no best run.
    best_cost = float("inf")
    best_run = None
    for _ in range(repeat):
        centroids, clusters = cluster_data(X, K)
        cost = cost_function(centroids, clusters)
        # "best_run is None" guarantees a result even if the cost is NaN.
        if best_run is None or cost < best_cost:
            best_cost = cost
            best_run = (centroids, clusters, cost)
    return best_run
# K-means with the Elbow method.
def kmeans_elbow(X, Kmax = 6, repeat = 10):
    """Run K-means for several K, plot the costs and let the user pick K.

    kmeans() is run for every k in 1 .. Kmax - 1, the cost of each k is
    plotted, and the user is then asked on standard input which k to keep.

    Args:
        X: Samples to cluster.
        Kmax: Exclusive upper bound of the tested numbers of clusters.
        repeat: Number of K-means runs per tested k (passed to kmeans()).

    Returns:
        Tuple (centroids, clusters, K) for the chosen number of clusters K.

    Raises:
        ValueError: If Kmax is smaller than 2, so that no k can be tested.
    """
    data = []
    for k in range(1, Kmax):
        centroids, clusters, F = kmeans(X, k, repeat)
        data.append([k, F, centroids, clusters])
    if not data:
        raise ValueError("Kmax must be at least 2")
    max_k = len(data)

    # Pull the two plotted columns out directly: wrapping the ragged rows
    # (lists of centroids, dicts of clusters) in np.array() fails on
    # recent NumPy versions.
    tested_k = [row[0] for row in data]
    costs = [row[1] for row in data]
    plt.figure(1)
    ax = plt.gca()
    ax.plot(tested_k, costs, "b-")
    ax.set_title("Elbow method")
    ax.set_xlabel("K (num. of clusters)")
    ax.set_ylabel("F (Cost function)")
    plt.show()

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    # The prompt and the accepted range follow the values actually tested
    # instead of a hard-coded 6.
    answer = read_line("Enter clusters number K (max: " + str(max_k) + "): ")
    try:
        K = int(answer)
    except ValueError:
        K = 0  # non-numeric input is handled as an invalid choice below
    if 1 <= K <= max_k:
        print("Correct input. Cluster number is " + str(K))
    else:
        K = min(4, max_k)
        print("Invalid input. Default K = " + str(K))
    return (data[K - 1][2], data[K - 1][3], K)
# Build the data: four 2-D Gaussian blobs.
X1 = np.random.multivariate_normal([0, 0], [[.6, 0], [0, .6]], 30)
X2 = np.random.multivariate_normal([-3, 3], [[.7, 0], [0, .7]], 30)
X3 = np.random.multivariate_normal([3, -3], [[.5, 0], [0, .5]], 30)
X4 = np.random.multivariate_normal([-4, -4], [[1.5, 0], [0, 1.5]], 40)
X = np.concatenate((X1, X2, X3, X4), axis=0)

# Cluster the data; K is the number of clusters chosen by the user.
centroids, clusters, K = kmeans_elbow(X, 10, 20)

# Plot the clustered data, one colour/marker pair per cluster.
markers = ["o", "o", "o", "o", "o", "o", ".", ".", "."]
colors = ["b", "g", "r", "c", "m", "y", "b", "g", "r"]
plt.figure()
plt.axis([-8, 5, -8, 6])
ax = plt.gca()
for k, clr, mkr in zip(range(K), colors, markers):
    # np.asarray keeps the (n_samples, 2) shape even for a single-sample
    # cluster, which np.squeeze would collapse to 1-D and break the
    # column indexing below.
    cluster = np.asarray(clusters[k])
    ax.plot(cluster[:, 0], cluster[:, 1], linestyle="None", markerfacecolor=clr, marker=mkr, markersize=4)
    ax.plot(centroids[k][0], centroids[k][1], linestyle="None", markerfacecolor=clr, marker="*", markersize=15)
ax.set_title("Kmeans algorithm")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
plt.show()
Enter clusters number K (max: 6): 4 Correct input. Cluster number is 4
# Build the data: four 3-D Gaussian blobs.
X1 = np.random.multivariate_normal([0, 0, 0], [[.6, 0, 0], [0, .6, 0], [0, 0, .6]], 30)
X2 = np.random.multivariate_normal([-3, 3, 3], [[.7, 0, 0], [0, .7, 0], [0, 0, .7]], 30)
X3 = np.random.multivariate_normal([3, -3, -3], [[.5, 0, 0], [0, .5, 0], [0, 0, .5]], 30)
X4 = np.random.multivariate_normal([-4, -4, -4], [[1.5, 0, 0], [0, 1.5, 0], [0, 0, 1.5]], 40)
X = np.concatenate((X1, X2, X3, X4), axis=0)

# Cluster the data. The third value returned by kmeans_elbow() is the chosen
# number of clusters, so it must be bound to K (not F) for the loop below.
centroids, clusters, K = kmeans_elbow(X, 10, 20)

# Plot the clustered data. Nine colour/marker pairs are provided because
# kmeans_elbow(X, 10, 20) can return up to nine clusters and zip() would
# silently drop any cluster without a pair.
colors = ["#0099cb", "#ade601", "#fe9900", "#d8007d", "#8800cc", "#2201cc",
          "#00cc88", "#cc0000", "#666666"]
markers = ["o", "v", "s", "<", ">", ".", "^", "D", "p"]
fig = plt.figure()
# Figure.gca(projection=...) has been removed from matplotlib;
# add_subplot() is the supported way to create 3-D axes.
ax = fig.add_subplot(111, projection="3d")
for k, clr, mkr in zip(range(K), colors, markers):
    # np.asarray keeps the (n_samples, 3) shape even for a single-sample
    # cluster, which np.squeeze would collapse to 1-D.
    cluster = np.asarray(clusters[k])
    ax.plot(cluster[:, 0], cluster[:, 1], cluster[:, 2], linestyle="None", markerfacecolor=clr, marker=mkr, markersize=5)
    ax.plot([centroids[k][0]], [centroids[k][1]], [centroids[k][2]], linestyle="None", markerfacecolor=clr, marker="*", markersize=15)
ax.set_title("Datos segmentados")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_zlabel("x3")
plt.show()
Enter clusters number K (max: 6): 4 Correct input. Cluster number is 4