%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from numba import autojit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_low_rank_matrix

n_samples = int(1e4)
n_features = 100


rng = np.random.RandomState(42)
w_true = rng.normal(size=n_features)
# half of non-informative features
w_true[n_features // 2:] = 0

intercept_true = np.mean(np.abs(w_true)) / 50

X = make_low_rank_matrix(n_samples, n_features,
                         effective_rank=n_features // 3,
                         random_state=rng)

# ground truth
y_true = np.dot(X, w_true) + intercept_true


# add some label noise
y_noise = rng.normal(scale=y_true.std() / 50, size=n_samples)
y_decision = y_true + y_noise

# threshold as a balanced binary classification problem
y = np.where(y_decision > 0, 1, -1)

np.mean(y == 1)

plt.title("Distribution of the target decision value")
_ = plt.hist(y_decision, bins=30)

plt.plot(w_true, 'o', color='g', alpha=0.8)
_ = plt.title("Ground truth parameters")

print(intercept_true)

def predict(w, X, intercept=0):
    return np.where(np.dot(X, w) + intercept > 0, 1, -1)

np.mean(predict(w_true, X, intercept=intercept_true) == y)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.linear_model import PassiveAggressiveClassifier

%time clf = PassiveAggressiveClassifier(C=1e6, n_iter=100).fit(X_train, y_train)

plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true')
coef_max = np.abs(clf.coef_.ravel()).max()
plt.plot(clf.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate')

clf.score(X_test, y_test)

from sklearn.linear_model import SGDClassifier

%time clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, n_iter=300).fit(X_train, y_train)

plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true')
coef_max = np.abs(clf.coef_.ravel()).max()
plt.plot(clf.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate')

clf.score(X_test, y_test)

from collections import defaultdict

clf = SGDClassifier(loss='log', penalty='l1', learning_rate='invscaling',
                    power_t=0.6, eta0=1e5, alpha=1e-6)
n_iter = 1000
average_history = 100
weights = []
intercepts = []
scores = defaultdict(list)
for i in range(n_iter):
    clf.partial_fit(X_train, y_train, classes=[-1, 1])
    weights.append(clf.coef_.ravel().copy())
    intercepts.append(clf.intercept_[0])
    mean_coef = np.mean(weights[-average_history:], axis=0)
    mean_intercept = np.mean(intercepts[-average_history:], axis=0)
    
    scores['sgd_test'].append(clf.score(X_test, y_test))
    scores['sgd_train'].append(clf.score(X_train, y_train))
    
    # XXX: clf might have a non-zero bias...
    y_test_pred = predict(mean_coef, X_test, intercept=mean_intercept)
    scores['asgd_test'].append(np.mean(y_test_pred == y_test))
    y_train_pred = predict(mean_coef, X_train, intercept=mean_intercept)
    scores['asgd_train'].append(np.mean(y_train_pred == y_train))

plt.figure(figsize=(13, 4))
plt.subplot(121)
plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true')
coef_max = np.abs(mean_coef).max()
plt.plot(mean_coef / coef_max, 'o', color='b', alpha=0.6, label='estimate')
print("Intercept: {}".format(clf.intercept_[0] / coef_max))

plt.subplot(122)
for name, values in sorted(scores.items(), reverse=True):
    plt.plot(values, label=name)
    print("{}: {:.4f}".format(name, values[-1]))
_ = plt.legend(loc='best')
plt.ylim(0.90, 1)

from sklearn.linear_model import LogisticRegression

%time lr = LogisticRegression(C=1e6, penalty='l1').fit(X_train, y_train)

plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true')
coef_max = np.abs(lr.coef_.ravel()).max()
plt.plot(lr.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate')

print("Log Reg test: {:.4f}".format(lr.score(X_test, y_test)))
print("Intercept: {}".format(lr.intercept_[0] / coef_max))

from sklearn.svm import LinearSVC

%time svc = LinearSVC(C=1e6).fit(X_train, y_train)

plt.plot(w_true / np.abs(w_true).max(), 'o', color='g', alpha=0.6, label='true')
coef_max = np.abs(svc.coef_.ravel()).max()
plt.plot(svc.coef_.ravel() / coef_max, 'o', color='b', alpha=0.6, label='estimate')

print("Intercept: {}".format(svc.intercept_[0] / coef_max))
print("SVM test: {:.4f}".format(svc.score(X_test, y_test)))