%pylab inline
import moss
import seaborn
seaborn.set()
colors = seaborn.color_palette()

# Generate two random predictors
X = randn(100, 2)

# Create y as a weighted sum of the 2 predictors and noise
noise = randn(100)
y = 0.7 * X[:, 0] + 0.5 * X[:, 1] + noise

# Threshold y so that it is a binary classification. Let's pretend this is
# medical data and 0 means the patient died and 1 means the patient lived.
# x1 and x2 are measures of symptoms
y = (y > 0).astype(int)

plot(X[y == 0, 0], X[y == 0, 1], "o", label="dead")
plot(X[y == 1, 0], X[y == 1, 1], "o", color=colors[2], label="alive")
legend();

from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X, y)

# We can call the predict() method of the model to get estimated values
y_hat = model.predict(X)

# To be closer to the MATLAB tutorial, you can also use the predict_proba() method
# to return probabilistic estimates for each class
y_hat2 = argmax(model.predict_proba(X), axis=1)
assert all(y_hat == y_hat2)

# Split the observations by true class and whether the prediction was correct
a_c = logical_and(y == 1, y_hat == 1)
a_i = logical_and(y == 1, y_hat == 0)
d_c = logical_and(y == 0, y_hat == 0)
d_i = logical_and(y == 0, y_hat == 1)

plot(X[d_c, 0], X[d_c, 1], "o", color=colors[0], label="dead +")
plot(X[d_i, 0], X[d_i, 1], "o", color=colors[0], alpha=0.7, label="dead -")
plot(X[a_c, 0], X[a_c, 1], "o", color=colors[2], label="alive +")
plot(X[a_i, 0], X[a_i, 1], "o", color=colors[2], alpha=0.7, label="alive -")
legend();

# Define a function to generate fake data
def make_dataset():
    X = randn(100, 2)
    noise = randn(100)
    y = 0.7 * X[:, 0] + 0.5 * X[:, 1] + noise
    y = (y > 0).astype(int)
    return X, y

# Make two datasets as above
X1, y1 = make_dataset()
X2, y2 = make_dataset()

# Fit the linear model
line_model = LogisticRegression().fit(X1, y1)

# Add additional columns where the predictors are squared
X1_quad = hstack((X1, X1 ** 2))
from sklearn.preprocessing import scale
X1_quad = scale(X1_quad)

# Fit the quadratic model, then predict with both models
quad_model = LogisticRegression().fit(X1_quad, y1)
yhat_line1 = line_model.predict(X1)
yhat_quad1 = quad_model.predict(X1_quad)

# Plot the models and estimates
figure(figsize=(12, 5))

# Plot the actual classes of y1
subplot(121)
plot(X1[y1 == 0, 0], X1[y1 == 0, 1], "o", color=colors[0], markersize=15)
plot(X1[y1 == 1, 0], X1[y1 == 1, 1], "o", color=colors[2], markersize=15)

# Plot the predicted classes from the linear model
plot(X1[yhat_line1 == 0, 0], X1[yhat_line1 == 0, 1], "o", color=colors[0], markersize=7)
plot(X1[yhat_line1 == 1, 0], X1[yhat_line1 == 1, 1], "o", color=colors[2], markersize=7)

# Now do the same for the quadratic model
subplot(122)
plot(X1[y1 == 0, 0], X1[y1 == 0, 1], "o", color=colors[0], markersize=15)
plot(X1[y1 == 1, 0], X1[y1 == 1, 1], "o", color=colors[2], markersize=15)

# Plot the predicted classes from the quadratic model
plot(X1[yhat_quad1 == 0, 0], X1[yhat_quad1 == 0, 1], "o", color=colors[0], markersize=7)
plot(X1[yhat_quad1 == 1, 0], X1[yhat_quad1 == 1, 1], "o", color=colors[2], markersize=7)

# Compute the in-sample accuracy for each model
acc_line = line_model.score(X1, y1)
acc_quad = quad_model.score(X1_quad, y1)
print("Linear: %.2f" % acc_line)
print("Quadratic: %.2f" % acc_quad)

# Evaluate both models on the held-out second dataset
line_cv_acc = line_model.score(X2, y2)
X2_quad = scale(hstack((X2, X2 ** 2)))
quad_cv_acc = quad_model.score(X2_quad, y2)
print("Linear model cross-validated accuracy: %.2f" % line_cv_acc)
print("Quadratic model cross-validated accuracy: %.2f" % quad_cv_acc)
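As a side note, the predict_proba() call used above is just the logistic (sigmoid) function applied to the fitted linear combination of the predictors. The following is a minimal, self-contained sketch of that relationship; the data and variable names here (X_demo, y_demo, demo_model) are illustrative and not part of the analysis above.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Illustrative data generated the same way as in the tutorial
rng = np.random.RandomState(1)
X_demo = rng.randn(100, 2)
y_demo = (0.7 * X_demo[:, 0] + 0.5 * X_demo[:, 1] + rng.randn(100) > 0).astype(int)
demo_model = LogisticRegression().fit(X_demo, y_demo)

# Linear predictor (log-odds): X @ w + b, using the fitted coefficients
logits = X_demo.dot(demo_model.coef_.ravel()) + demo_model.intercept_

# The sigmoid maps log-odds to P(y == 1 | x); predict() thresholds this at 0.5
p_one = 1.0 / (1.0 + np.exp(-logits))
assert np.allclose(p_one, demo_model.predict_proba(X_demo)[:, 1])
assert np.array_equal((p_one > 0.5).astype(int), demo_model.predict(X_demo))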
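The two-dataset comparison above is the simplest form of out-of-sample validation. For a proper k-fold estimate, scikit-learn's cross_val_score can do the splitting and scoring for us. This is a minimal sketch assuming a scikit-learn version where it lives in sklearn.model_selection (older releases used sklearn.cross_validation); the variable names are new to this example.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale

# Generate one dataset the same way as make_dataset() above
rng = np.random.RandomState(0)
X_cv = rng.randn(100, 2)
y_cv = (0.7 * X_cv[:, 0] + 0.5 * X_cv[:, 1] + rng.randn(100) > 0).astype(int)

# Linear features vs. linear + squared features
X_cv_quad = scale(np.hstack((X_cv, X_cv ** 2)))

# 5-fold cross-validated accuracy for each feature set
line_scores = cross_val_score(LogisticRegression(), X_cv, y_cv, cv=5)
quad_scores = cross_val_score(LogisticRegression(), X_cv_quad, y_cv, cv=5)
print("Linear 5-fold accuracy:    %.2f" % line_scores.mean())
print("Quadratic 5-fold accuracy: %.2f" % quad_scores.mean())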