Logistic regression is an algorithm for classification, despite the name (blame the stat guys for that misnomer). Neural networks, at least shallow ones, are built from these units. I'm going to do everything in the multiclass case. Bishop's section 4.3.3 (IRLS) is not covered in this notebook.
Logistic regression is discriminative, and the loss function (cross entropy) is given by:
$-\sum_{n=1}^{N}\sum_{k=1}^{K} t_{nk} \ln(y_{nk})$
Where $N$ is the number of training examples, $K$ is the number of classes, $t_{nk}$ is 1 if training example $n$ is class $k$ and zero otherwise (a one-hot representation), and $y_{nk}$ is the prediction of our model of the probability that training example $n$ is class $k$.
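To make the notation concrete, here's a tiny worked example of that sum (the numbers are made up purely for illustration; nothing below is part of the model we'll build):
#tiny worked example of the cross-entropy loss: N=2 examples, K=3 classes, made-up numbers
import numpy as np
T = np.array([[1, 0, 0],          #example 0 is class 0
              [0, 0, 1]])         #example 1 is class 2
Y = np.array([[0.7, 0.2, 0.1],    #predicted probabilities for example 0
              [0.1, 0.3, 0.6]])   #predicted probabilities for example 1
print -np.sum(T * np.log(Y))      #only the true class's log-probability survives: about 0.867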
Our model predicts a conditional probability $p(t=k|x;W)$. As we saw with linear basis function models, we can easily work with linear combinations of nonlinear functions of our input. In other words, we first apply a nonlinear transformation to our data and then learn a linear decision boundary.
Because regularization is important, I'm going to add a penalty of the form $\lambda\,\mathrm{tr}(WW^{T})$, where $W$ is the matrix of weights and $\mathrm{tr}$ is the trace. In other words, we penalize the squared norm of the weight vector for each of the $K$ classes. The dimensions of our weight matrix $W$ will be $(K,D)$, where $D$ is the dimensionality of our input after being mapped to the new space. The model's prediction is then $p(t=k|x;W) = \mathrm{softmax}(W\phi(x))_k$, where $\phi$ is the mapping (basis functions) and the softmax normalizes the $K$ activations into probabilities.
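Before the real code, a throwaway sketch just to pin down the shapes of the model and the penalty (K, D, lam, and all the values here are made up; this is not the classifier we train below):
#shape sketch: K=3 classes, D=5 basis dimensions, made-up values
import numpy as np
K, D, lam = 3, 5, 0.1
W = np.random.uniform(-0.1, 0.1, (K, D))    #weight matrix, shape (K, D)
phi_x = np.random.randn(D)                  #phi(x): one input after the basis mapping
a = np.dot(W, phi_x)                        #one activation per class
p = np.exp(a) / np.sum(np.exp(a))           #softmax -> p(t=k|x;W), sums to 1
penalty = lam * np.trace(np.dot(W, W.T))    #lambda * tr(W W^T): lambda times the sum of squared weights
print p.sum(), penalty                      #p.sum() should be 1.0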
Enough math, let's see the full implementation.
#imports
%pylab inline
from sklearn.datasets import make_classification, load_iris
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression #for comparison
from sklearn.cross_validation import train_test_split
Populating the interactive namespace from numpy and matplotlib
class LogisticClassifier(object):
"""
Multiclass logistic regression with regularization. Trained with gradient descent + momentum (if desired).
"""
def __init__(self, basis=None):
"""
Instantiate a logistic regression model. Specify a custom basis function here,
it should accept an array and output a (preferably higher dimensional) array.
The default is the identity function plus a bias term.
"""
self.W = array([])
self.A = None #the mixing matrix for basis mapping.
self.basis=basis
if basis == 'poly':
self.basisfunc = self.poly_basis
elif basis == 'rbf':
self.basisfunc = self.rbf_basis
elif basis == 'sigmoid':
self.basisfunc = self.sigmoid_basis
elif basis == 'rectifier':
self.basisfunc = self.rectifier_basis
else:
self.basisfunc = self.identity
def identity(self, x):
#identity basis function + a bias
return hstack((x,1))
    def poly_basis(self, x):
        #polynomial basis
        degree = 2
        #first mix the components of x in a higher dimension
        xn = dot(self.A,x)
        #note: range(degree) runs over 0 and 1, so this sums xn**0 + xn**1 = 1 + xn;
        #no squared terms ever appear, which probably explains the weak polynomial results later on
        return self.identity(hstack(tuple(sum(xn**i for i in range(degree)))))
def rbf_basis(self, x):
        #use the rows of the mixing matrix as centroids; note the bandwidth is fixed at 1
return self.identity(hstack(tuple(exp(-norm(x-mu)) for mu in self.A)))
def sigmoid_basis(self, x):
#just like a neural network layer.
xn = dot(self.A, x)
return self.identity((1+exp(-xn))**-1)
def rectifier_basis(self, x):
#used in the latest neural nets
xn = dot(self.A, x)
return self.identity(maximum(xn, 0))
def basismap(self, X):
#if X is an observation matrix (examples by dimensions),
        #return each row mapped to a higher-dimensional space
new_dimensions = self.basisfunc(X[0,:]).shape[0]
Xn = zeros((X.shape[0], new_dimensions))
for i,xi in enumerate(X):
Xn[i,:] = self.basisfunc(xi)
return Xn
def fit(self, X, Y, itrs=100, learn_rate=0.1, reg=0.1,
momentum=0.5, report_cost=False, proj_layer_size=10):
"""
Fit the model.
X - observation matrix (observations by dimensions)
Y - one-hot target matrix (examples by classes)
itrs - number of iterations to run
learn_rate - size of step to use for gradient descent
reg - regularization penalty (lambda above)
momentum - weight of the previous gradient in the update step
report_cost - if true, return the loss function at each step (expensive).
proj_layer_size - number of dimensions in the projection (mixing) layer. Higher -> more variance
"""
#first map to a new basis
if self.basis != 'rbf':
self.A = uniform(-1, 1, (proj_layer_size, X.shape[1]))
else:
#use the training examples as bases
self.A = X[permutation(X.shape[0])[:proj_layer_size],:]
Xn = self.basismap(X)
#set up weights
self.W = uniform(-0.1, 0.1, (Y.shape[1], Xn.shape[1]))
#optimize
costs = []
previous_grad = zeros(self.W.shape) #used in momentum
for i in range(itrs):
grad = self.grad(Xn, Y, reg) #compute gradient
self.W = self.W - learn_rate*(grad + momentum*previous_grad) #take a step, use previous gradient as well.
previous_grad = grad
if report_cost:
costs.append(self.loss(X,Y,reg))
return costs
    def softmax(self, Z):
        #row-wise softmax (not an elementwise sigmoid)
        #subtract each row's max before exponentiating so exp() can't overflow
        Z = Z - Z.max(axis=1).reshape((-1,1))
        numerator = exp(Z)
        return numerator / sum(numerator, axis=1).reshape((-1,1))
def predict(self, X):
"""
If the model has been trained, makes predictions on an observation matrix (observations by features)
"""
Xn = self.basismap(X)
return self.softmax(dot(Xn, self.W.T))
def grad(self, Xn, Y, reg):
"""
Returns the gradient of the loss function wrt the weights.
"""
#Xn should be the design matrix
Yh = self.softmax(dot(Xn, self.W.T))
return -dot(Y.T-Yh.T,Xn)/Xn.shape[0] + reg*self.W
    def loss(self, X, Y, reg):
        #assuming X is the data matrix; the regularization penalty is added to the cross entropy
        Yh = self.predict(X)
        return -mean(mean(Y*log(Yh))) + reg*trace(dot(self.W,self.W.T))/self.W.shape[0]
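For reference, the gradient computed in grad comes from differentiating the softmax cross-entropy plus the trace penalty. With $\Phi$ the design matrix of mapped inputs (rows $\phi(x_n)^T$), $T$ the one-hot targets, and $\hat{Y}$ the softmax outputs:
$\nabla_{W} E = \frac{1}{N}(\hat{Y} - T)^{T}\Phi + 2\lambda W$
The code uses reg*self.W rather than 2*reg*self.W, which just absorbs the factor of 2 into the reg parameter. The update at each step is then $W \leftarrow W - \eta\,(\nabla_{W}E + \mu\,\nabla_{W}E_{\text{prev}})$, where $\eta$ is learn_rate and $\mu$ is momentum.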
I use this plotting code a lot. It required a little modification to work with the one-hot targets here (hence the binarizer argument).
#nice plotting code from before
def plot_contour_scatter(X, Y, model, title_text, binarizer):
#sample from a lattice (for the nice visualization)
x1, x2 = meshgrid(arange(-5,5,0.1), arange(-5,5,0.1))
Xnew = vstack((x1.ravel(), x2.ravel())).T
Z = model.predict(Xnew).reshape((Xnew.shape[0],-1))
Zc = binarizer.inverse_transform(Z)
y = binarizer.inverse_transform(Y)
#plot - contour plot and scatter superimposed
contourf(arange(-5,5,0.1), arange(-5,5,0.1), Zc.reshape(x1.shape),
cmap ='Paired',levels=arange(0,4,0.1))
c_dict = {0:'b', 1:'g', 2:'r', 3:'y', 4:'m', 5:'k', 6:'c'}
colorsToUse= [c_dict[yi] for yi in y]
scatter(X[:,0], X[:,1], c=colorsToUse)
title(title_text)
show()
Let's try some toy examples. First we need to get some data.
#make data
X,Y = make_classification(n_features=2, n_informative=2, n_redundant=0,
n_repeated=0, n_classes=3, n_clusters_per_class=1)
#make Y into a one-hot matrix
lb = LabelBinarizer()
Y = lb.fit_transform(Y)
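In case LabelBinarizer is unfamiliar, this is all it's doing (a quick illustrative check):
#quick check: LabelBinarizer turns integer labels into one-hot rows and back again
lb_demo = LabelBinarizer()
print lb_demo.fit_transform([0, 2, 1, 2])
#[[1 0 0]
# [0 0 1]
# [0 1 0]
# [0 0 1]]
print lb_demo.inverse_transform(array([[0, 0, 1]]))   #back to the integer label: [2]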
Now we can fit our model. Let's try with no basis functions first.
#fit
model = LogisticClassifier()
costs = model.fit(X,Y,report_cost=True, momentum=0.9, learn_rate=0.1,itrs=500,reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X,Y, model, 'Logistic Regression, linear basis', lb)
How often is it wrong?
def error_rate(Yh, Y):
return sum( not_equal(argmax(Yh,axis=1), argmax(Y,axis=1))) / float(Yh.shape[0])
print error_rate(model.predict(X), Y)
0.14
Ok. How about a polynomial basis?
#fit
model = LogisticClassifier(basis='poly')
costs = model.fit(X,Y,report_cost=True, momentum=0.9, learn_rate=0.1,itrs=100,reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X,Y, model, 'Logistic Regression, polynomial basis', lb)
Not bad - if we zoomed out it would be clear that the decision boundary isn't linear anymore. Let's use a sigmoid basis.
#fit
model = LogisticClassifier(basis='sigmoid')
costs = model.fit(X,Y,report_cost=True, momentum=0.9, learn_rate=0.2,itrs=400,reg=1e-3)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X,Y, model, 'Logistic Regression, sigmoid basis', lb)
Looks nice. The sigmoid basis matters because it's exactly what a single layer of a neural network computes (ignoring more recent activation functions from deep learning). Each layer in a neural net (Bishop ch. 5) performs a matrix multiply followed by an elementwise sigmoid. In a neural net, however, we use the chain rule to push the error gradient back into the first-layer weight matrix; here the first layer (our random mixing matrix $A$) is fixed, and only the softmax layer's weights are learned.
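Written out as a forward pass, the sigmoid-basis model looks like this (a rough sketch of the same computation the class does; A_fixed and W_learned are illustrative names, not attributes of LogisticClassifier):
#the sigmoid-basis model as a two-layer forward pass: frozen random first layer, trained softmax layer
A_fixed = uniform(-1, 1, (10, X.shape[1]))          #"layer 1" weights: random and never updated
W_learned = uniform(-0.1, 0.1, (Y.shape[1], 11))    #"layer 2" weights: 10 hidden units + bias
def forward(x):
    h = 1.0 / (1.0 + exp(-dot(A_fixed, x)))         #hidden layer: matrix multiply, then elementwise sigmoid
    h = hstack((h, 1))                               #bias unit
    a = dot(W_learned, h)                            #output layer: the only weights gradient descent touches
    return exp(a) / sum(exp(a))                      #softmax over classes
print forward(X[0, :])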
We can also simulate an RBF network by using the RBF basis.
#fit
model = LogisticClassifier(basis='rbf')
costs = model.fit(X,Y,report_cost=True, momentum=0.9, learn_rate=0.15,itrs=300,reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X,Y, model, 'Logistic Regression, RBF basis', lb)
Highly nonlinear decision boundaries here. This classifier is basically the same as an RBF network.
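To restate what the rbf_basis above is doing (a small sketch outside the class; centroids here is an illustrative stand-in for the model's A matrix):
#the RBF features: one bump per centroid, with centroids drawn from the training points
centroids = X[permutation(X.shape[0])[:10], :]
def rbf_features(x):
    return array([exp(-norm(x - mu)) for mu in centroids])   #near a centroid -> close to 1, far away -> near 0
print rbf_features(X[0, :])
The classifier then just learns a softmax on top of those bumps, which is exactly the structure of an RBF network.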
Modern neural nets (deep ones, that is) use an activation function called a rectifier.
#fit
model = LogisticClassifier(basis='rectifier')
costs = model.fit(X,Y,report_cost=True, momentum=0.9, learn_rate=0.2,itrs=100,reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X,Y, model, 'Logistic Regression, linear rectifier basis', lb)
Sklearn includes the iris dataset for use in validating algorithms. Let's give it a shot.
iris_data = load_iris()
X = iris_data.data
Y = iris_data.target
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.25)
#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)
Ytest_sk = lb.inverse_transform(Ytest)
Let's compare our implementation with the one already in sklearn.
sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain,Ytrain, momentum=0.5, learn_rate=0.1,itrs=1000,reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)
#performance
print 'ours:', error_rate(linear_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
ours: 0.0526315789474
sklearn: 0.105263157895
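Before reading anything into those two numbers, a rough back-of-the-envelope check (the 2-errors-versus-4-errors split is inferred from the rates above, assuming the 38-point test set that 25% of 150 gives):
#back-of-the-envelope check: is 2/38 errors distinguishable from 4/38?
n = 38.0
p1, p2 = 2/n, 4/n
se = sqrt(p1*(1-p1)/n + p2*(1-p2)/n)   #normal-approximation standard error of the difference
print (p2 - p1) / se                   #about 0.85 standard errors: nowhere near significant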
The difference isn't significant; the test set is just too small. Let's use the make_classification utility to run some more tests.
X,Y = make_classification(n_samples=1000, n_features=20, n_informative=10,
n_redundant=10, n_classes=5, n_clusters_per_class=1)
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.25)
#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)
And now we can compare them.
sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1,
itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)
print 'ours:', error_rate(linear_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
ours: 0.284
sklearn: 0.296
We can also see how our different basis functions perform.
#fit a sigmoid basis logistic regressor
sigmoid_lr = LogisticClassifier(basis='sigmoid')
sigmoid_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a polynomial basis logistic regressor
poly_lr = LogisticClassifier(basis='poly')
poly_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rectifier basis logistic regressor
rect_lr = LogisticClassifier(basis='rectifier')
rect_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rbf basis logistic regressor
rbf_lr = LogisticClassifier(basis='rbf')
rbf_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
print 'ours (linear):', error_rate(linear_lr.predict(Xtest), Ytest)
print 'ours (sigmoid):', error_rate(sigmoid_lr.predict(Xtest), Ytest)
print 'ours (polynomial):', error_rate(poly_lr.predict(Xtest), Ytest)
print 'ours (rectifier):', error_rate(rect_lr.predict(Xtest), Ytest)
print 'ours (rbf):', error_rate(rbf_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
ours (linear): 0.284
ours (sigmoid): 0.256
ours (polynomial): 0.396
ours (rectifier): 0.212
ours (rbf): 0.816
sklearn: 0.296
Very strong performance from the rectifier and sigmoid bases; very poor performance from the RBF and polynomial bases. Let's run it again on different data.
X,Y = make_classification(n_samples=1000, n_features=20, n_informative=10,
n_redundant=10, n_classes=9, n_clusters_per_class=2)
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.25)
#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)
sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1,
itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)
#fit a sigmoid basis logistic regressor
sigmoid_lr = LogisticClassifier(basis='sigmoid')
sigmoid_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a polynomial basis logistic regressor
poly_lr = LogisticClassifier(basis='poly')
poly_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rectifier basis logistic regressor
rect_lr = LogisticClassifier(basis='rectifier')
rect_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rbf basis logistic regressor
rbf_lr = LogisticClassifier(basis='rbf')
rbf_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
print 'ours (linear):', error_rate(linear_lr.predict(Xtest), Ytest)
print 'ours (sigmoid):', error_rate(sigmoid_lr.predict(Xtest), Ytest)
print 'ours (polynomial):', error_rate(poly_lr.predict(Xtest), Ytest)
print 'ours (rectifier):', error_rate(rect_lr.predict(Xtest), Ytest)
print 'ours (rbf):', error_rate(rbf_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
ours (linear): 0.636
ours (sigmoid): 0.648
ours (polynomial): 0.724
ours (rectifier): 0.532
ours (rbf): 0.908
sklearn: 0.644
Again, we see a similar distribution of scores. I'm starting to see why the rectifier is becoming more common than the sigmoid. One more time!
X,Y = make_classification(n_samples=1000, n_features=20, n_informative=10,
n_redundant=10, n_classes=9, n_clusters_per_class=2)
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.25)
#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)
sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1,
itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)
#fit a sigmoid basis logistic regressor
sigmoid_lr = LogisticClassifier(basis='sigmoid')
sigmoid_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a polynomial basis logistic regressor
poly_lr = LogisticClassifier(basis='poly')
poly_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rectifier basis logistic regressor
rect_lr = LogisticClassifier(basis='rectifier')
rect_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
#fit a rbf basis logistic regressor
rbf_lr = LogisticClassifier(basis='rbf')
rbf_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05,
itrs=1000, reg=0.001, proj_layer_size=50)
print 'ours (linear):', error_rate(linear_lr.predict(Xtest), Ytest)
print 'ours (sigmoid):', error_rate(sigmoid_lr.predict(Xtest), Ytest)
print 'ours (polynomial):', error_rate(poly_lr.predict(Xtest), Ytest)
print 'ours (rectifier):', error_rate(rect_lr.predict(Xtest), Ytest)
print 'ours (rbf):', error_rate(rbf_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
ours (linear): 0.592
ours (sigmoid): 0.612
ours (polynomial): 0.752
ours (rectifier): 0.548
ours (rbf): 0.904
sklearn: 0.628
Time to draw some conclusions. First, a linear projection followed by a rectifier activation is a very competitive basis function: it had the lowest error in every comparison above, sometimes by a sizeable margin. Both my RBF and polynomial bases perform very badly, which is probably a bug on my part, but I'm not sure; I looked into it a little and they just seem to learn very slowly. Our linear-basis logistic regression is comparable to sklearn's, which makes sense: both are regularized linear classifiers on the raw features (sklearn's default fits one-vs-rest with liblinear rather than a multinomial softmax, but the resulting decision boundaries end up very similar).
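If I had to guess at the RBF problem (purely a hypothesis, not something verified in this notebook): in 20 dimensions every point is far from every centroid, so exp(-||x - mu||) is close to zero across the board and the features carry almost no signal, which would look exactly like learning very slowly. A quick diagnostic would be something along these lines:
#hypothetical diagnostic: how large are the RBF activations on this 20-dimensional data?
dists = array([norm(Xtrain[i] - Xtrain[j]) for i in range(50) for j in range(50) if i < j])
print 'typical distance between points:', median(dists)
print 'typical RBF feature value:', median(exp(-dists))   #if this is ~0, the RBF features are nearly dead
#a scaled kernel such as exp(-||x - mu||**2 / (2*sigma**2)) with sigma near the median distance would likely help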