Build an artificial dataset by first picking $k$ random class centers. Then, for each uniformly sampled point, assign it to a class with probability inversely proportional to the cubed distance to that class's center, so that closer centers are more likely.
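Concretely, a point $p$ is assigned to class $i$ with probability $\lVert p - c_i \rVert^{-3} / \sum_{j=1}^{k} \lVert p - c_j \rVert^{-3}$, where $c_1, \dots, c_k$ are the centers; this is exactly what the code below computes.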
rcParams['figure.figsize'] = 12, 8
def dataset(n, k):
    xy = random.uniform(low=-2, high=2, size=(n, 2))
    centers = random.uniform(low=-2, high=2, size=(k, 2))
    classes = []
    for p in xy:
        # 1/dist^3, so that lower dist => higher prob
        dists = asarray([1. / linalg.norm(p - c) ** 3 for c in centers])
        # normalize and take the cumulative sum (a CDF), then sample a class by inverse-CDF sampling
        dists = cumsum(dists / sum(dists))
        classes.append(where(random.random() < dists)[0][0])
    return column_stack([xy, classes]), centers
random.seed(1)  # alternatively, try k=4 with seed=12
k = 3
points, centers = dataset(1000, k)
colors = 'gbrcmykrcm'
markers = 'osd^v1234x'
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    marker = colors[i % len(colors)] + markers[i % len(markers)]
    plot(class_i[:,0], class_i[:,1], marker)
    plot([centers[i,0]], [centers[i,1]], marker, ms=20)
Minimize the negative log-likelihood by batch gradient descent.
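With the intercept folded into $x = (1, x_1, x_2)$, the model's class probabilities are $h_i(x) = e^{\theta_i^\top x} / \sum_{j=1}^{k} e^{\theta_j^\top x}$, the (averaged) NLL is $J(\theta) = -\frac{1}{n}\sum_{m=1}^{n} \log h_{c_m}(x_m)$, and its gradient with respect to $\theta_i$ is proportional to $\sum_{m=1}^{n} \big(h_i(x_m) - \mathbb{1}[c_m = i]\big)\, x_m$. The update in the code below uses this un-averaged sum, which only rescales the learning rate.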
def softmax(x, theta):
    n = len(x)
    k = len(theta)
    sum_exps = sum(exp(asarray([dot(x, theta[i]) for i in range(k)])), axis=0)
    assert sum_exps.shape == (n,)
    return asarray([exp(dot(x, theta[i])) / sum_exps for i in range(k)]) # k x n, where each col should sum to 1
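As a quick sanity check (a small snippet of my own, assuming the pylab-style namespace used throughout), each column of the returned matrix should indeed sum to 1:
# sanity check: columns of the k x n softmax output sum to 1
check_x = column_stack([ones(5), random.uniform(low=-2, high=2, size=(5, 2))])
check_theta = [random.random(3) for _ in range(3)]
assert allclose(softmax(check_x, check_theta).sum(axis=0), ones(5))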
def train(points, initial_theta, alpha=0.001, stop=1e-4):
    x, y, c = points[:,0], points[:,1], points[:,2]
    n = len(points)
    k = len(initial_theta)
    bxy = column_stack([ones(n), x, y]) # first component is intercept (i.e. all 1s)
    thetas = [initial_theta]
    nll_errors = []
    classif_errors = []
    while True:
        theta = [t.copy() for t in thetas[-1]] # copy each array, so previous iterates are not overwritten in place
        h = softmax(bxy, theta)
        nll_errors.append(-sum(log(h.T[arange(n), c.astype(int)])) / n)
        classif_errors.append(sum((argmax(h, axis=0) != c).astype(int)))
        if len(nll_errors) > 1 and nll_errors[-2] - nll_errors[-1] < stop: break
        # batch gradient step: for each class i, the gradient is the sum over points of (h_i - 1[c == i]) * x
        for i in range(k):
            ci = (c == i).astype(int)
            theta[i][0] -= alpha * sum(h[i] - ci) # intercept component
            theta[i][1] -= alpha * sum((h[i] - ci) * x)
            theta[i][2] -= alpha * sum((h[i] - ci) * y)
        thetas.append(theta)
    return thetas, nll_errors, classif_errors
initial_theta = [random.sample(3) for _ in range(k)]
thetas, nll_errors, classif_errors = train(points, initial_theta)
# print(nll_errors)
# print(classif_errors)
The NLL seems to be correctly minimized.
xlabel('Training iterations')
ylabel('NLL')
plot(range(len(nll_errors)), nll_errors, 'r-');
The classification error (the number of misclassified training points) decreases as well.
xlabel('Training iterations')
ylabel('Classification error')
plot(range(len(classif_errors)), classif_errors, 'r-')
ylim(0);
Here I'm trying to find a valid geometric interpretation for the trained values of $\theta$, as I did with logistic regression: assuming that each $\theta_i$ holds the $c$, $a$ and $b$ parameters of the general-form equation $ax + by + c = 0$ of the decision line for class $i$ (the intercept term comes first, matching the columns of bxy). But it doesn't seem to make sense. A likely explanation is that in softmax regression the boundary between classes $i$ and $j$ is where $(\theta_i - \theta_j)^\top x = 0$, so an individual $\theta_i$ does not define a decision line on its own; the sketch after the plot below looks at the pairwise differences instead.
def general_to_slope_intercept(theta):
    slope = -theta[1] / theta[2]
    intercept = -theta[0] / theta[2]
    return slope, intercept
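For reference, $ax + by + c = 0$ rearranges (when $b \neq 0$) to $y = -\frac{a}{b}x - \frac{c}{b}$, which is what the helper above computes, with $\theta_i = (c, a, b)$ in the intercept-first order used by bxy.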
best_theta = thetas[-1]
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    marker = colors[i % len(colors)] + markers[i % len(markers)]
    plot(class_i[:,0], class_i[:,1], marker)
    plot([centers[i,0]], [centers[i,1]], marker, ms=20)
    slope, intercept = general_to_slope_intercept(best_theta[i])
    plot(linspace(-2, 2), slope * linspace(-2, 2) + intercept,
         'r-', linewidth=2)
axis((-2, 2, -2, 2));
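As a follow-up sketch (my own addition, reusing points, k, colors, markers, best_theta and general_to_slope_intercept from above), the pairwise differences $\theta_i - \theta_j$ can be plotted instead: $(\theta_i - \theta_j)^\top x = 0$ is where the model assigns equal probability to classes $i$ and $j$, so these lines (at least on the segments where $i$ and $j$ are the two most probable classes) are the model's actual decision boundaries.
# sketch: plot the pairwise boundaries (theta_i - theta_j)^T x = 0
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    plot(class_i[:,0], class_i[:,1],
         colors[i % len(colors)] + markers[i % len(markers)])
xs = linspace(-2, 2)
for i in range(k):
    for j in range(i + 1, k):
        diff = best_theta[i] - best_theta[j]  # boundary between classes i and j
        slope, intercept = general_to_slope_intercept(diff)  # assumes the boundary is not vertical
        plot(xs, slope * xs + intercept, 'k--', linewidth=2)
axis((-2, 2, -2, 2));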