%matplotlib inline import numpy as np import matplotlib.pyplot as plt from numba import autojit from sklearn.cross_validation import train_test_split from sklearn.datasets import make_low_rank_matrix n_samples = int(1e4) n_features = 100 rng = np.random.RandomState(42) w_true = rng.normal(size=n_features) # half of non-informative features w_true[n_features // 2:] = 0 intercept_true = np.mean(np.abs(w_true)) / 50 X = make_low_rank_matrix(n_samples, n_features, effective_rank=n_features // 3, random_state=rng) # ground truth y_true = np.dot(X, w_true) + intercept_true # add some label noise y_noise = rng.normal(scale=y_true.std() / 50, size=n_samples) y_decision = y_true + y_noise # threshold as a balanced binary classification problem y = np.where(y_decision > 0, 1, -1) np.mean(y == 1) plt.title("Distribution of the target decision value") _ = plt.hist(y_decision, bins=30) plt.plot(w_true, 'o', color='g', alpha=0.8) _ = plt.title("Ground truth parameters") print(intercept_true) def predict(w, X, intercept=0): return np.where(np.dot(X, w) + intercept > 0, 1, -1) np.mean(predict(w_true, X, intercept=intercept_true) == y) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) from sklearn.linear_model import PassiveAggressiveClassifier %time clf = PassiveAggressiveClassifier(C=1e6, n_iter=100).fit(X_train, y_train) plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true') coef_max = np.abs(clf.coef_.ravel()).max() plt.plot(clf.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate') clf.score(X_test, y_test) from sklearn.linear_model import SGDClassifier %time clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, n_iter=300).fit(X_train, y_train) plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true') coef_max = np.abs(clf.coef_.ravel()).max() plt.plot(clf.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate') clf.score(X_test, y_test) from collections import defaultdict clf = SGDClassifier(loss='log', penalty='l1', learning_rate='invscaling', power_t=0.6, eta0=1e5, alpha=1e-6) n_iter = 1000 average_history = 100 weights = [] intercepts = [] scores = defaultdict(list) for i in range(n_iter): clf.partial_fit(X_train, y_train, classes=[-1, 1]) weights.append(clf.coef_.ravel().copy()) intercepts.append(clf.intercept_[0]) mean_coef = np.mean(weights[-average_history:], axis=0) mean_intercept = np.mean(intercepts[-average_history:], axis=0) scores['sgd_test'].append(clf.score(X_test, y_test)) scores['sgd_train'].append(clf.score(X_train, y_train)) # XXX: clf might have a non-zero bias... y_test_pred = predict(mean_coef, X_test, intercept=mean_intercept) scores['asgd_test'].append(np.mean(y_test_pred == y_test)) y_train_pred = predict(mean_coef, X_train, intercept=mean_intercept) scores['asgd_train'].append(np.mean(y_train_pred == y_train)) plt.figure(figsize=(13, 4)) plt.subplot(121) plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true') coef_max = np.abs(mean_coef).max() plt.plot(mean_coef / coef_max, 'o', color='b', alpha=0.6, label='estimate') print("Intercept: {}".format(clf.intercept_[0] / coef_max)) plt.subplot(122) for name, values in sorted(scores.items(), reverse=True): plt.plot(values, label=name) print("{}: {:.4f}".format(name, values[-1])) _ = plt.legend(loc='best') plt.ylim(0.90, 1) from sklearn.linear_model import LogisticRegression %time lr = LogisticRegression(C=1e6, penalty='l1').fit(X_train, y_train) plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true') coef_max = np.abs(lr.coef_.ravel()).max() plt.plot(lr.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate') print("Log Reg test: {:.4f}".format(lr.score(X_test, y_test))) print("Intercept: {}".format(lr.intercept_[0] / coef_max)) from sklearn.svm import LinearSVC %time svc = LinearSVC(C=1e6).fit(X_train, y_train) plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true') coef_max = np.abs(svc.coef_.ravel()).max() plt.plot(svc.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate') print("Intercept: {}".format(svc.intercept_[0] / coef_max)) print("SVM test: {:.4f}".format(svc.score(X_test, y_test)))