Notebook

Algoritmo K-means

Importamos las librerías necesarias:

In [1]:

%matplotlib inline

# importamos las librerías básicas
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

A continuación se muestran las diferentes funciones que implementan el algoritmo:

cluster_assignment(X, centroids)
move_centroids(clusters)
cluster_data(X, K)
cost_function(centroids, clusters)
kmeans(X, K, repeat)
kmeans_elbow(X, Kmax, repeat)

In [2]:

# Assign every sample to the cluster of its nearest centroid.
def cluster_assignment(X, centroids):
    """Group the samples of X by the index of their nearest centroid.

    Parameters
    ----------
    X : iterable of array-like
        Samples; each one must support ``x - centroid``.
    centroids : sequence of array-like
        Current cluster centres. The position of a centroid in this
        sequence is used as its cluster key.

    Returns
    -------
    dict
        Maps a centroid index to the list of samples whose distance
        (``np.linalg.norm`` of the difference) to that centroid is smallest.
        A centroid that attracts no sample gets no entry.

    Raises
    ------
    ValueError
        If ``centroids`` is empty while ``X`` is not (``min`` of nothing).
    """
    clusters = {}
    for x in X:
        # Comparing (distance, index) pairs breaks ties in favour of the
        # lowest centroid index.
        _, cluster_key = min(
            (np.linalg.norm(x - centroid), index)
            for index, centroid in enumerate(centroids)
        )
        # setdefault replaces the Python-2-only dict.has_key() check.
        clusters.setdefault(cluster_key, []).append(x)
    return clusters

# Recompute the centre of every cluster.
def move_centroids(clusters):
    """Return the mean of each cluster, ordered by ascending cluster key.

    Parameters
    ----------
    clusters : dict
        Maps a cluster key to the list of samples assigned to it.

    Returns
    -------
    list
        ``np.mean(samples, 0)`` for every cluster. The keys are visited in
        sorted order so that, when the keys are 0..n-1, element ``k`` of the
        result is the centre of ``clusters[k]`` regardless of the order in
        which the dict was filled. If a key is missing from that range the
        result is shorter and later centres shift down one position.
    """
    return [np.mean(clusters[k], 0) for k in sorted(clusters)]

# K-means algorithm (single run).
def cluster_data(X, K):
    """Run K-means once from K randomly chosen samples until it stabilises.

    Parameters
    ----------
    X : iterable of array-like
        Samples to cluster.
    K : int
        Number of initial centroids, drawn without replacement from X.

    Returns
    -------
    tuple
        ``(centroids, clusters)`` where ``centroids`` is the list produced by
        ``move_centroids`` and ``clusters`` the dict produced by
        ``cluster_assignment`` in the last iteration.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when K is negative or larger than the
        number of samples.
    """
    # random.sample needs a real sequence; passing an ndarray directly raises
    # TypeError on Python 3, so the rows are materialised into a list first.
    centroids = random.sample(list(X), K)
    prev_centroids = None
    # Iterate until two consecutive centroid lists are identical.
    while prev_centroids is None or not np.array_equal(centroids, prev_centroids):
        prev_centroids = centroids
        clusters = cluster_assignment(X, centroids)
        centroids = move_centroids(clusters)
    return (centroids, clusters)

# Compute the cost function.
def cost_function(centroids, clusters):
    """Average distance between each sample and the centroid of its cluster.

    Parameters
    ----------
    centroids : sequence of array-like
        Cluster centres, indexed by cluster key.
    clusters : dict
        Maps a cluster key to the list of samples assigned to it.

    Returns
    -------
    float
        Sum of ``np.linalg.norm(x - centroids[k])`` over every sample,
        divided by the total number of samples.
    """
    total_samples = sum(len(members) for members in clusters.values())
    total_distance = sum(
        np.linalg.norm(sample - centroids[key])
        for key in clusters.keys()
        for sample in clusters[key]
    )
    return total_distance / total_samples

# K-means with several random restarts.
def kmeans(X, K, repeat = 10):
    """Run ``cluster_data`` ``repeat`` times and keep the cheapest result.

    Parameters
    ----------
    X : iterable of array-like
        Samples to cluster.
    K : int
        Number of clusters requested from ``cluster_data``.
    repeat : int, optional
        Number of independent runs (default 10).

    Returns
    -------
    tuple
        ``(centroids, clusters, F)`` of the run with the lowest cost ``F``
        as reported by ``cost_function``.

    Raises
    ------
    ValueError
        If ``repeat`` is smaller than 1, because no run would be available.
    """
    best = None
    for _ in range(repeat):
        centroids, clusters = cluster_data(X, K)
        F = cost_function(centroids, clusters)
        # The first run is always accepted; a fixed initial threshold would
        # leave no result at all when every cost exceeds it.
        if best is None or F < best[2]:
            best = (centroids, clusters, F)
    if best is None:
        raise ValueError("repeat must be at least 1")
    return best

# K-means with the Elbow method.
def kmeans_elbow(X, Kmax = 6, repeat = 10):
    """Plot cost versus K, ask the user for K and return that clustering.

    ``kmeans`` is run for every K from 1 to ``Kmax`` (inclusive), the costs
    are drawn as an elbow plot and the user is prompted for the number of
    clusters to keep.

    Parameters
    ----------
    X : iterable of array-like
        Samples to cluster.
    Kmax : int, optional
        Largest number of clusters to evaluate and to accept as an answer
        (default 6). It must not exceed the number of samples, since
        ``cluster_data`` draws K distinct samples as initial centroids.
    repeat : int, optional
        Number of restarts passed to ``kmeans`` for every K (default 10).

    Returns
    -------
    tuple
        ``(centroids, clusters, K)`` for the chosen K. If the answer is not
        an integer in ``1..Kmax`` the default ``min(4, Kmax)`` is used.

    Raises
    ------
    ValueError
        If ``Kmax`` is smaller than 1.
    """
    if Kmax < 1:
        raise ValueError("Kmax must be at least 1")

    data = []
    for k in range(1, Kmax + 1):
        centroids, clusters, F = kmeans(X, k, repeat)
        data.append([k, F, centroids, clusters])

    # Plot straight from the rows; packing the ragged rows into one ndarray
    # is rejected by current NumPy versions.
    plt.figure(1)
    ax = plt.gca()
    ax.plot([row[0] for row in data], [row[1] for row in data], "b-")
    ax.set_title("Elbow method")
    ax.set_xlabel("K (num. of clusters)")
    ax.set_ylabel("F (Cost function)")
    plt.show()

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    default_K = min(4, Kmax)
    try:
        K = int(read_line("Enter clusters number K (max: " + str(Kmax) + "): "))
    except ValueError:
        K = 0  # non-numeric answer: handled as invalid input below
    if 1 <= K <= Kmax:
        print("Correct input. Cluster number is " + str(K))
    else:
        print("Invalid input. Default K = " + str(default_K))
        K = default_K
    # data[K - 1] is the row computed for K clusters (rows start at K = 1).
    return (data[K - 1][2], data[K - 1][3], K)

Ejemplo 2D

In [3]:

# Seed both generators so the data (np.random) and the initial centroids
# (random.sample inside cluster_data) are the same on every run.
random.seed(0)
np.random.seed(0)

# Create the data: four 2D Gaussian blobs.
X1 = np.random.multivariate_normal([0, 0], [[.6, 0], [0, .6]], 30)
X2 = np.random.multivariate_normal([-3, 3], [[.7, 0], [0, .7]], 30)
X3 = np.random.multivariate_normal([3, -3], [[.5, 0], [0, .5]], 30)
X4 = np.random.multivariate_normal([-4, -4], [[1.5, 0], [0, 1.5]], 40)
X = np.concatenate((X1, X2, X3, X4), axis=0)

# Cluster the data.
centroids, clusters, K = kmeans_elbow(X, 10, 20)

# Plot the clustered data.
markers = ["o", "o", "o", "o", "o", "o", ".", ".", "."]
colors = ["b", "g", "r", "c", "m", "y", "b", "g", "r"]
plt.figure()
plt.axis([-8, 5, -8, 6])
ax = plt.gca()
for k in range(K):
    # Cycle through the style lists so no cluster is dropped when K exceeds
    # the number of predefined styles.
    clr = colors[k % len(colors)]
    mkr = markers[k % len(markers)]
    # asarray keeps the (n_samples, 2) shape even for a one-sample cluster,
    # which squeeze would collapse to 1-D and break the column indexing.
    cluster = np.asarray(clusters[k])
    ax.plot(cluster[:, 0], cluster[:, 1], linestyle="None", markerfacecolor=clr, marker=mkr, markersize=4)
    ax.plot(centroids[k][0], centroids[k][1], linestyle="None", markerfacecolor=clr, marker="*", markersize=15)
ax.set_title("Kmeans algorithm")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
plt.show()

Enter clusters number K (max: 6): 4
Correct input. Cluster number is 4

Ejemplo 3D

In [4]:

# Seed both generators so the data (np.random) and the initial centroids
# (random.sample inside cluster_data) are the same on every run.
random.seed(1)
np.random.seed(1)

# Create the data: four 3D Gaussian blobs.
X1 = np.random.multivariate_normal([0, 0, 0], [[.6, 0, 0], [0, .6, 0], [0, 0, .6]], 30)
X2 = np.random.multivariate_normal([-3, 3, 3], [[.7, 0, 0], [0, .7, 0], [0, 0, .7]], 30)
X3 = np.random.multivariate_normal([3, -3, -3], [[.5, 0, 0], [0, .5, 0], [0, 0, .5]], 30)
X4 = np.random.multivariate_normal([-4, -4, -4], [[1.5, 0, 0], [0, 1.5, 0], [0, 0, 1.5]], 40)
X = np.concatenate((X1, X2, X3, X4), axis=0)

# Cluster the data. The third value returned by kmeans_elbow is the chosen
# number of clusters, so it is stored in K rather than relying on the K left
# over from the 2D example.
centroids, clusters, K = kmeans_elbow(X, 10, 20)

# Plot the clustered data.
colors = ["#0099cb", "#ade601", "#fe9900", "#d8007d", "#8800cc", "#2201cc"]
markers = ["o", "v", "s", "<", ">", "."]
fig = plt.figure()
# add_subplot is the supported way to request a 3D axes; newer matplotlib
# releases no longer accept keyword arguments in Figure.gca.
ax = fig.add_subplot(111, projection="3d")
for k in range(K):
    # Cycle through the style lists so no cluster is dropped when K exceeds
    # the number of predefined styles.
    clr = colors[k % len(colors)]
    mkr = markers[k % len(markers)]
    # asarray keeps the (n_samples, 3) shape even for a one-sample cluster,
    # which squeeze would collapse to 1-D and break the column indexing.
    cluster = np.asarray(clusters[k])
    ax.plot(cluster[:, 0], cluster[:, 1], cluster[:, 2], linestyle="None", markerfacecolor=clr, marker=mkr, markersize=5)
    ax.plot([centroids[k][0]], [centroids[k][1]], [centroids[k][2]], linestyle="None", markerfacecolor=clr, marker="*", markersize=15)
ax.set_title("Datos segmentados")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_zlabel("x3")
plt.show()

Enter clusters number K (max: 6): 4
Correct input. Cluster number is 4