Build an artificial dataset by first picking $k$ random class centers. Then, for each uniformly sampled point, assign it to a class with probability inversely proportional to the cubed distance to that class's center, so that closer centers are more likely.
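Concretely, a point $p$ is assigned to class $i$ with probability $\lVert p - c_i \rVert^{-3} / \sum_{j=1}^{k} \lVert p - c_j \rVert^{-3}$, where $c_1, \dots, c_k$ are the centers; this is exactly what the code below computes.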
rcParams['figure.figsize'] = 12, 8
def dataset(n, k):
    xy = random.uniform(low=-2, high=2, size=(n, 2))
    centers = random.uniform(low=-2, high=2, size=(k, 2))
    classes = []
    for p in xy:
        # 1/dist^3, so that lower dist => higher prob
        dists = asarray([1. / linalg.norm(p - c) ** 3 for c in centers])
        # normalize and take the cumulative sum (a CDF), then sample a class by inverse-CDF sampling
        dists = cumsum(dists / sum(dists))
        classes.append(where(random.random() < dists)[0][0])
    return column_stack([xy, classes]), centers
random.seed(1)  # alternatively, try k=4 with seed=12
k = 3
points, centers = dataset(1000, k)
colors = 'gbrcmykrcm'
markers = 'osd^v1234x'
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    marker = colors[i % len(colors)] + markers[i % len(markers)]
    plot(class_i[:,0], class_i[:,1], marker)
    plot([centers[i,0]], [centers[i,1]], marker, ms=20)
Minimize the negative log-likelihood by batch gradient descent.
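With the intercept folded into $x = (1, x_1, x_2)$, the model's class probabilities are $h_i(x) = e^{\theta_i^\top x} / \sum_{j=1}^{k} e^{\theta_j^\top x}$, the (averaged) NLL is $J(\theta) = -\frac{1}{n}\sum_{m=1}^{n} \log h_{c_m}(x_m)$, and its gradient with respect to $\theta_i$ is proportional to $\sum_{m=1}^{n} \big(h_i(x_m) - \mathbb{1}[c_m = i]\big)\, x_m$. The update in the code below uses this un-averaged sum, which only rescales the learning rate.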
def softmax(x, theta):
    n = len(x)
    k = len(theta)
    sum_exps = sum(exp(asarray([dot(x, theta[i]) for i in range(k)])), axis=0)
    assert sum_exps.shape == (n,)
    return asarray([exp(dot(x, theta[i])) / sum_exps for i in range(k)]) # k x n, where each col should sum to 1
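As a quick sanity check (a small snippet of my own, assuming the pylab-style namespace used throughout), each column of the returned matrix should indeed sum to 1:
# sanity check: columns of the k x n softmax output sum to 1
check_x = column_stack([ones(5), random.uniform(low=-2, high=2, size=(5, 2))])
check_theta = [random.random(3) for _ in range(3)]
assert allclose(softmax(check_x, check_theta).sum(axis=0), ones(5))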
def train(points, initial_theta, alpha=0.001, stop=1e-4):
    x, y, c = points[:,0], points[:,1], points[:,2]
    n = len(points)
    k = len(initial_theta)
    bxy = column_stack([ones(n), x, y]) # first component is intercept (i.e. all 1s)
    thetas = [initial_theta]
    nll_errors = []
    classif_errors = []
    while True:
        theta = [t.copy() for t in thetas[-1]] # copy each array, so previous iterates are not overwritten in place
        h = softmax(bxy, theta)
        nll_errors.append(-sum(log(h.T[arange(n), c.astype(int)])) / n)
        classif_errors.append(sum((argmax(h, axis=0) != c).astype(int)))
        if len(nll_errors) > 1 and nll_errors[-2] - nll_errors[-1] < stop: break
        # batch gradient step: for each class i, the gradient is the sum over points of (h_i - 1[c == i]) * x
        for i in range(k):
            ci = (c == i).astype(int)
            theta[i][0] -= alpha * sum(h[i] - ci) # intercept component
            theta[i][1] -= alpha * sum((h[i] - ci) * x)
            theta[i][2] -= alpha * sum((h[i] - ci) * y)
        thetas.append(theta)
    return thetas, nll_errors, classif_errors
initial_theta = [random.sample(3) for _ in range(k)]
thetas, nll_errors, classif_errors = train(points, initial_theta)
# print(nll_errors)
# print(classif_errors)
The NLL seems to be correctly minimized.
xlabel('Training iterations')
ylabel('NLL')
plot(range(len(nll_errors)), nll_errors, 'r-');
The classification error (the number of misclassified training points) decreases as well.
xlabel('Training iterations')
ylabel('Classification error')
plot(range(len(classif_errors)), classif_errors, 'r-')
ylim(0);
Here I'm trying to find a valid geometric interpretation for the trained values of $\theta$, as I did with logistic regression: assuming that each $\theta_i$ holds the $c$, $a$ and $b$ parameters of the general-form equation $ax + by + c = 0$ of the decision line for class $i$ (the intercept term comes first, matching the columns of bxy). But it doesn't seem to make sense. A likely explanation is that in softmax regression the boundary between classes $i$ and $j$ is where $(\theta_i - \theta_j)^\top x = 0$, so an individual $\theta_i$ does not define a decision line on its own; the sketch after the plot below looks at the pairwise differences instead.
def general_to_slope_intercept(theta):
    slope = -theta[1] / theta[2]
    intercept = -theta[0] / theta[2]
    return slope, intercept
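For reference, $ax + by + c = 0$ rearranges (when $b \neq 0$) to $y = -\frac{a}{b}x - \frac{c}{b}$, which is what the helper above computes, with $\theta_i = (c, a, b)$ in the intercept-first order used by bxy.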
best_theta = thetas[-1]
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    marker = colors[i % len(colors)] + markers[i % len(markers)]
    plot(class_i[:,0], class_i[:,1], marker)
    plot([centers[i,0]], [centers[i,1]], marker, ms=20)
    slope, intercept = general_to_slope_intercept(best_theta[i])
    plot(linspace(-2, 2), slope * linspace(-2, 2) + intercept,
         'r-', linewidth=2)
axis((-2, 2, -2, 2));
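As a follow-up sketch (my own addition, reusing points, k, colors, markers, best_theta and general_to_slope_intercept from above), the pairwise differences $\theta_i - \theta_j$ can be plotted instead: $(\theta_i - \theta_j)^\top x = 0$ is where the model assigns equal probability to classes $i$ and $j$, so these lines (at least on the segments where $i$ and $j$ are the two most probable classes) are the model's actual decision boundaries.
# sketch: plot the pairwise boundaries (theta_i - theta_j)^T x = 0
for i in range(k):
    class_i = points[where(points[:,2] == i)]
    plot(class_i[:,0], class_i[:,1],
         colors[i % len(colors)] + markers[i % len(markers)])
xs = linspace(-2, 2)
for i in range(k):
    for j in range(i + 1, k):
        diff = best_theta[i] - best_theta[j]  # boundary between classes i and j
        slope, intercept = general_to_slope_intercept(diff)  # assumes the boundary is not vertical
        plot(xs, slope * xs + intercept, 'k--', linewidth=2)
axis((-2, 2, -2, 2));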