# load the ex3data1
# it's in mat format in the course, but can be convert to txt in octave:
# save('ex3data1.txt', 'X', '-ascii')
# save('ex3data1.y.txt', 'y', '-ascii')
import numpy as np
def load_dataset():
    """Load the ex3 digit dataset from the ASCII exports (see header).

    Returns:
        (X, y): X is an (m, n) array of float features, y an (m,) array of
        integer class labels.
    """
    data = []
    y = []
    # was: map(double, ...) -- `double` is not a Python builtin; use float.
    # `with` guarantees the files are closed even on a parse error.
    with open('data/ex3data1.txt') as fx:
        for line in fx:
            data.append([float(v) for v in line.split()])
    with open('data/ex3data1.y.txt') as fy:
        for line in fy:
            # labels are stored as floats (e.g. "10.0"); round-trip to int
            y.append(int(float(line.strip())))
    # np.int was a deprecated alias for the builtin int (removed in NumPy
    # 1.24); use int directly.
    return np.array(data, np.double), np.array(y, dtype=int)
# Materialize the full training set once, script-style, for the cells below.
X, y = load_dataset()
# %%time  (IPython cell magic from the original notebook; not valid in a plain .py file)
from sklearn.base import BaseEstimator
from lbfgs import LBFGS
import lbfgs
def sigmoid(X, theta):
    """Logistic function applied element-wise to the linear scores X.theta."""
    z = np.dot(X, theta)
    return 1.0 / (1.0 + np.exp(-z))
def f(theta, g, lr, X, y):
    """Objective for lbfgs: mean logistic negative log-likelihood.

    Side effects required by the LBFGS callback protocol:
      - writes the gradient into `g` in place,
      - copies the current iterate into lr.theta, so the estimator keeps
        the latest parameters even if the solver aborts mid-run.
    Returns the scalar loss value.
    """
    lr.theta[:] = theta
    m = X.shape[0]
    predicted = sigmoid(X, theta)
    g[:] = gradient(theta, X, y)
    # was: log(...) -- unqualified `log` is undefined here; use np.log.
    # 1.0 * y promotes a boolean label vector to float before multiplying.
    return -np.sum(1.0 * y * np.log(predicted) + (1 - y) * np.log(1 - predicted)) / m
def gradient(theta, X, y):
    """Gradient of the mean logistic loss with respect to theta."""
    residual = sigmoid(X, theta) - y
    n_samples = X.shape[0]
    return np.dot(X.T, residual) / n_samples
class LogisticRegression(BaseEstimator):
    """Binary logistic regression fitted with an external L-BFGS optimizer."""

    def fit(self, X, y):
        """Minimize the logistic loss over theta, starting from zeros.

        Returns self (sklearn convention).
        """
        n = X.shape[1]
        x0 = np.zeros(n)
        # f() keeps self.theta in sync with the current iterate, so a
        # partially converged model is still usable if the solver raises.
        self.theta = np.zeros(n)
        opt = LBFGS()
        try:
            opt.minimize(f, x0, args=[self, X, y])
        except Exception as e:  # was Python-2-only `except Exception, e`
            # Line-search failures from lbfgs are common near convergence;
            # report them and keep the last iterate stored in self.theta.
            print(repr(e))
        return self

    def predict_proba(self, X):
        """Return an (m, 2) array of [P(y=0), P(y=1)] per sample."""
        positive = sigmoid(X, self.theta).reshape(X.shape[0], 1)
        return np.hstack((1 - positive, positive))
# Sanity check: a single binary classifier, label 10 vs. the rest.
# (Presumably label 10 stands in for the digit 0 in the course data -- TODO confirm.)
lr = LogisticRegression()
lr.fit(X, y == 10)
# [notebook output] LBFGSError('The line-search routine reaches the maximum number of evaluations.',) CPU times: user 1.19 s, sys: 27.6 ms, total: 1.22 s Wall time: 438 ms
# %%time  (IPython cell magic from the original notebook; not valid in a plain .py file)
# train_test_split moved from sklearn.cross_validation to
# sklearn.model_selection in scikit-learn 0.18; fall back for old versions.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper trains one binary LogisticRegression per class label.
ovr = OneVsRestClassifier(LogisticRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
ovr.fit(X_train, y_train)
print(ovr.estimators_)
# [notebook output] LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('A rounding error occurred; alternatively, no line-search step satisfies the sufficient decrease and curvature conditions.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) LBFGSError('The line-search routine reaches the maximum number of evaluations.',) [LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression()] CPU times: user 12 s, sys: 400 ms, total: 12.4 s Wall time: 4.14 s
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
predicted = ovr.predict(X_test)
print 'accuracy', accuracy_score(predicted, y_test)
print confusion_matrix(predicted, y_test)
# [notebook output] accuracy 0.8645 [[186 3 1 1 0 1 1 2 0 0] [ 2 164 13 4 3 3 4 5 1 0] [ 0 2 167 1 10 0 8 3 2 4] [ 0 1 2 162 0 1 1 4 12 0] [ 0 1 14 0 165 2 0 6 2 1] [ 0 2 0 3 2 195 1 2 0 1] [ 1 3 4 2 0 0 163 1 15 0] [ 6 11 10 8 24 3 3 157 2 1] [ 2 4 3 10 6 0 6 4 174 2] [ 0 3 0 2 0 0 1 1 1 196]]