#imports
%pylab inline  #pulls numpy and matplotlib into the namespace (hstack, dot, uniform, plot, ...)
from sklearn.datasets import make_classification, load_iris
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression #for comparison
from sklearn.cross_validation import train_test_split  #sklearn.model_selection in newer versions

class LogisticClassifier(object):
    """
    Multiclass logistic regression with regularization.
    Trained with gradient descent + momentum (if desired).
    """
    def __init__(self, basis=None):
        """
        Instantiate a logistic regression model.
        basis chooses the feature mapping: one of 'poly', 'rbf', 'sigmoid', or
        'rectifier'. Each maps an observation to a (usually higher-dimensional)
        array. The default (None) is the identity function plus a bias term.
        """
        self.W = array([])
        self.A = None #the mixing matrix for basis mapping
        self.basis = basis
        if basis == 'poly':
            self.basisfunc = self.poly_basis
        elif basis == 'rbf':
            self.basisfunc = self.rbf_basis
        elif basis == 'sigmoid':
            self.basisfunc = self.sigmoid_basis
        elif basis == 'rectifier':
            self.basisfunc = self.rectifier_basis
        else:
            self.basisfunc = self.identity

    def identity(self, x):
        #identity basis function + a bias
        return hstack((x, 1))

    def poly_basis(self, x):
        #polynomial basis
        degree = 2
        #first mix the components of x in a higher dimension
        xn = dot(self.A, x)
        #sum of powers 0..degree of the mixed features
        return self.identity(hstack(tuple(sum(xn**i for i in range(degree + 1)))))

    def rbf_basis(self, x):
        #in this case, use the mixing matrix as centroids
        return self.identity(hstack(tuple(exp(-norm(x - mu)) for mu in self.A)))

    def sigmoid_basis(self, x):
        #just like a neural network layer
        xn = dot(self.A, x)
        return self.identity((1 + exp(-xn))**-1)

    def rectifier_basis(self, x):
        #used in the latest neural nets
        xn = dot(self.A, x)
        return self.identity(maximum(xn, 0))

    def basismap(self, X):
        #if X is an observation matrix (examples by dimensions),
        #return each row mapped to a higher-dimensional space
        new_dimensions = self.basisfunc(X[0,:]).shape[0]
        Xn = zeros((X.shape[0], new_dimensions))
        for i, xi in enumerate(X):
            Xn[i,:] = self.basisfunc(xi)
        return Xn

    def fit(self, X, Y, itrs=100, learn_rate=0.1, reg=0.1,
            momentum=0.5, report_cost=False, proj_layer_size=10):
        """
        Fit the model.
        X - observation matrix (observations by dimensions)
        Y - one-hot target matrix (examples by classes)
        itrs - number of iterations to run
        learn_rate - size of step to use for gradient descent
        reg - regularization penalty (lambda above)
        momentum - weight of the previous gradient in the update step
        report_cost - if true, return the loss function at each step (expensive)
        proj_layer_size - number of dimensions in the projection (mixing) layer.
                          Higher -> more variance
        """
        #first map to a new basis
        if self.basis != 'rbf':
            self.A = uniform(-1, 1, (proj_layer_size, X.shape[1]))
        else:
            #use (a random subset of) the training examples as the rbf centroids
            self.A = X[permutation(X.shape[0])[:proj_layer_size],:]
        Xn = self.basismap(X)

        #set up weights
        self.W = uniform(-0.1, 0.1, (Y.shape[1], Xn.shape[1]))

        #optimize
        costs = []
        previous_grad = zeros(self.W.shape) #used in momentum
        for i in range(itrs):
            grad = self.grad(Xn, Y, reg) #compute gradient
            #take a step, using the previous gradient as momentum
            self.W = self.W - learn_rate*(grad + momentum*previous_grad)
            previous_grad = grad
            if report_cost:
                costs.append(self.loss(X, Y, reg))
        return costs

    def softmax(self, Z):
        #row-wise softmax: turn each row of scores into class probabilities
        Z = maximum(Z, -500) #clip so exp() stays finite in float64
        Z = minimum(Z, 500)
        numerator = exp(Z)
        return numerator / sum(numerator, axis=1).reshape((-1,1))

    def predict(self, X):
        """
        If the model has been trained, make predictions on an
        observation matrix (observations by features).
        """
        Xn = self.basismap(X)
        return self.softmax(dot(Xn, self.W.T))

    def grad(self, Xn, Y, reg):
        """
        Returns the gradient of the loss function with respect to the weights.
        """
        #Xn should be the basis-mapped design matrix
        Yh = self.softmax(dot(Xn, self.W.T))
        return -dot(Y.T - Yh.T, Xn)/Xn.shape[0] + reg*self.W

    def loss(self, X, Y, reg):
        #assuming X is the data matrix; the regularization penalty is added to the loss
        Yh = self.predict(X)
        return -mean(mean(Y*log(Yh))) + reg*trace(dot(self.W, self.W.T))/self.W.shape[0]
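#A quick aside, not part of the original pipeline: grad() computes the usual softmax
#cross-entropy gradient, -(Y - Yh).T.dot(Xn)/n + reg*W, so it can be checked against a
#centered finite-difference estimate of the unregularized cross-entropy. This is only a
#sketch; check_gradient and nll are illustrative names, and it assumes the
#LogisticClassifier defined above with its default (identity) basis.
def check_gradient(eps=1e-5):
    Xc, Yc = make_classification(n_features=4, n_informative=3, n_redundant=1,
                                 n_classes=3, n_clusters_per_class=1)
    Yc = LabelBinarizer().fit_transform(Yc)
    clf = LogisticClassifier()
    clf.fit(Xc, Yc, itrs=1) #a single step, just to initialize A and W
    Xn = clf.basismap(Xc)

    def nll(W):
        #mean cross-entropy at weights W, with no regularization term
        Yh = clf.softmax(dot(Xn, W.T))
        return -sum(Yc*log(Yh))/Xn.shape[0]

    analytic = clf.grad(Xn, Yc, 0.0)
    numeric = zeros(clf.W.shape)
    for j in range(clf.W.shape[0]):
        for k in range(clf.W.shape[1]):
            Wp, Wm = clf.W.copy(), clf.W.copy()
            Wp[j,k] += eps
            Wm[j,k] -= eps
            numeric[j,k] = (nll(Wp) - nll(Wm))/(2*eps)
    print 'max abs difference between analytic and numeric gradients:', abs(analytic - numeric).max()

check_gradient()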
#nice plotting code from before
def plot_contour_scatter(X, Y, model, title_text, binarizer):
    #sample from a lattice (for the nice visualization)
    x1, x2 = meshgrid(arange(-5, 5, 0.1), arange(-5, 5, 0.1))
    Xnew = vstack((x1.ravel(), x2.ravel())).T
    Z = model.predict(Xnew).reshape((Xnew.shape[0], -1))
    Zc = binarizer.inverse_transform(Z)
    y = binarizer.inverse_transform(Y)
    #plot - contour plot and scatter superimposed
    contourf(arange(-5, 5, 0.1), arange(-5, 5, 0.1), Zc.reshape(x1.shape),
             cmap='Paired', levels=arange(0, 4, 0.1))
    c_dict = {0:'b', 1:'g', 2:'r', 3:'y', 4:'m', 5:'k', 6:'c'}
    colorsToUse = [c_dict[yi] for yi in y]
    scatter(X[:,0], X[:,1], c=colorsToUse)
    title(title_text)
    show()

#make data
X, Y = make_classification(n_features=2, n_informative=2, n_redundant=0, n_repeated=0,
                           n_classes=3, n_clusters_per_class=1)
#make Y into a one-hot matrix
lb = LabelBinarizer()
Y = lb.fit_transform(Y)

#fit
model = LogisticClassifier()
costs = model.fit(X, Y, report_cost=True, momentum=0.9, learn_rate=0.1, itrs=500, reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X, Y, model, 'Logistic Regression, linear basis', lb)

def error_rate(Yh, Y):
    return sum(not_equal(argmax(Yh, axis=1), argmax(Y, axis=1))) / float(Yh.shape[0])

print error_rate(model.predict(X), Y)

#fit
model = LogisticClassifier(basis='poly')
costs = model.fit(X, Y, report_cost=True, momentum=0.9, learn_rate=0.1, itrs=100, reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X, Y, model, 'Logistic Regression, polynomial basis', lb)

#fit
model = LogisticClassifier(basis='sigmoid')
costs = model.fit(X, Y, report_cost=True, momentum=0.9, learn_rate=0.2, itrs=400, reg=1e-3)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X, Y, model, 'Logistic Regression, sigmoid basis', lb)

#fit
model = LogisticClassifier(basis='rbf')
costs = model.fit(X, Y, report_cost=True, momentum=0.9, learn_rate=0.15, itrs=300, reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X, Y, model, 'Logistic Regression, RBF basis', lb)

#fit
model = LogisticClassifier(basis='rectifier')
costs = model.fit(X, Y, report_cost=True, momentum=0.9, learn_rate=0.2, itrs=100, reg=0.01)
plot(range(len(costs)), costs)
title('loss over time')
show()
plot_contour_scatter(X, Y, model, 'Logistic Regression, linear rectifier basis', lb)
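#A small illustration, not in the original: basismap() lifts each observation into the
#chosen feature space, so the number of columns after mapping depends on the basis and on
#proj_layer_size. This loop just prints the mapped shapes for the 2D data above (the
#single-iteration fit is only there to set up the mixing matrix A).
for b in [None, 'poly', 'rbf', 'sigmoid', 'rectifier']:
    m = LogisticClassifier(basis=b)
    m.fit(X, Y, itrs=1)
    print b, '->', m.basismap(X).shape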
iris_data = load_iris()
X = iris_data.data
Y = iris_data.target
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25)

#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)
Ytest_sk = lb.inverse_transform(Ytest)

sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1, itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)

#performance
print 'ours:', error_rate(linear_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)

X, Y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10,
                           n_classes=5, n_clusters_per_class=1)
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25)

#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)

sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1, itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)

print 'ours:', error_rate(linear_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)

#fit a sigmoid basis logistic regressor
sigmoid_lr = LogisticClassifier(basis='sigmoid')
sigmoid_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit a polynomial basis logistic regressor
poly_lr = LogisticClassifier(basis='poly')
poly_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit a rectifier basis logistic regressor
rect_lr = LogisticClassifier(basis='rectifier')
rect_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit an rbf basis logistic regressor
rbf_lr = LogisticClassifier(basis='rbf')
rbf_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

print 'ours (linear):', error_rate(linear_lr.predict(Xtest), Ytest)
print 'ours (sigmoid):', error_rate(sigmoid_lr.predict(Xtest), Ytest)
print 'ours (polynomial):', error_rate(poly_lr.predict(Xtest), Ytest)
print 'ours (rectifier):', error_rate(rect_lr.predict(Xtest), Ytest)
print 'ours (rbf):', error_rate(rbf_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)

X, Y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10,
                           n_classes=9, n_clusters_per_class=2)
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25)

#ugh, sklearn doesn't take one-hot representations
Ytrain_sk = lb.inverse_transform(Ytrain)

sklearn_lr = LogisticRegression()
linear_lr = LogisticClassifier()
linear_lr.fit(Xtrain, Ytrain, momentum=0.5, learn_rate=0.1, itrs=1000, reg=0.01)
sklearn_lr.fit(Xtrain, Ytrain_sk)

#fit a sigmoid basis logistic regressor
sigmoid_lr = LogisticClassifier(basis='sigmoid')
sigmoid_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit a polynomial basis logistic regressor
poly_lr = LogisticClassifier(basis='poly')
poly_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit a rectifier basis logistic regressor
rect_lr = LogisticClassifier(basis='rectifier')
rect_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

#fit an rbf basis logistic regressor
rbf_lr = LogisticClassifier(basis='rbf')
rbf_lr.fit(Xtrain, Ytrain, momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001, proj_layer_size=50)

print 'ours (linear):', error_rate(linear_lr.predict(Xtest), Ytest)
print 'ours (sigmoid):', error_rate(sigmoid_lr.predict(Xtest), Ytest)
print 'ours (polynomial):', error_rate(poly_lr.predict(Xtest), Ytest)
print 'ours (rectifier):', error_rate(rect_lr.predict(Xtest), Ytest)
print 'ours (rbf):', error_rate(rbf_lr.predict(Xtest), Ytest)
print 'sklearn:', error_rate(sklearn_lr.predict_proba(Xtest), Ytest)
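#A small optional extension, not in the original: the comparisons above use a single
#random train/test split, so the printed error rates are fairly noisy. This sketch
#(averaged_error and its arguments are illustrative names, not from the original code)
#repeats the split a few times and reports the mean test error for a given model.
def averaged_error(make_model, X, Y, n_splits=5, **fit_args):
    errs = []
    for _ in range(n_splits):
        Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.25)
        m = make_model()
        m.fit(Xtr, Ytr, **fit_args)
        errs.append(error_rate(m.predict(Xte), Yte))
    return mean(errs)

print 'linear, averaged over splits:', averaged_error(LogisticClassifier, X, Y,
                                                      momentum=0.5, learn_rate=0.1, itrs=1000, reg=0.01)
print 'sigmoid, averaged over splits:', averaged_error(lambda: LogisticClassifier(basis='sigmoid'), X, Y,
                                                       momentum=0.9, learn_rate=0.05, itrs=1000, reg=0.001,
                                                       proj_layer_size=50)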