from __future__ import division
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold
from sklearn.cross_validation import Bootstrap
from sklearn.metrics import mean_squared_error
%matplotlib inline
auto_df = pd.read_csv("../data/Auto.csv", na_values="?")
auto_df.dropna(inplace=True)
auto_df.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18 | 8 | 307 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
1 | 15 | 8 | 350 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
2 | 18 | 8 | 318 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
3 | 16 | 8 | 304 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
4 | 17 | 8 | 302 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
5 rows × 9 columns
ax = auto_df.plot(x="horsepower", y="mpg", style="o")
ax.set_ylabel("mpg")
<matplotlib.text.Text at 0x57dccd0>
Instead of R's glm, we use Scikit-Learn's LinearRegression to arrive at very similar results.
clf = LinearRegression()
loo = LeaveOneOut(len(auto_df))
X = auto_df[["horsepower"]].values
y = auto_df["mpg"].values
n = np.shape(X)[0]
mses = []
for train, test in loo:
Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
mses.append(mean_squared_error(ytest, ypred))
np.mean(mses)
24.231513517929265
def loo_shortcut(X, y):
""" implement one-pass LOOCV calculation for linear models from ISLR Page 180 (Eqn 5.2) """
clf = LinearRegression()
clf.fit(X, y)
ypred = clf.predict(X)
xbar = np.mean(X, axis=0)
xsum = np.sum(np.power(X - xbar, 2))
nrows = np.shape(X)[0]
mses = []
for row in range(0, nrows):
hi = (1 / nrows) + (np.sum(X[row] - xbar) ** 2 / xsum)
mse = (y[row] - ypred[row]) ** 2 / (1 - hi)
mses.append(mse)
return np.mean(mses)
loo_shortcut(auto_df[["horsepower"]].values, auto_df["mpg"].values)
24.086699242104974
# LOOCV against models of different degrees
auto_df["horsepower^2"] = auto_df["horsepower"] * auto_df["horsepower"]
auto_df["horsepower^3"] = auto_df["horsepower^2"] * auto_df["horsepower"]
auto_df["horsepower^4"] = auto_df["horsepower^3"] * auto_df["horsepower"]
auto_df["horsepower^5"] = auto_df["horsepower^4"] * auto_df["horsepower"]
auto_df["unit"] = 1
colnames = ["unit", "horsepower", "horsepower^2", "horsepower^3", "horsepower^4", "horsepower^5"]
cv_errors = []
for ncols in range(2, 6):
X = auto_df[colnames[0:ncols]]
y = auto_df["mpg"]
clf = LinearRegression()
clf.fit(X, y)
cv_errors.append(loo_shortcut(X.values, y.values))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")
<matplotlib.text.Text at 0x5d52b50>
cv_errors = []
for ncols in range(2, 6):
# each ncol corresponds to a polynomial model
X = auto_df[colnames[0:ncols]].values
y = auto_df["mpg"].values
kfold = KFold(len(auto_df), n_folds=10)
mses = []
for train, test in kfold:
# each model is cross validated 10 times
Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
clf = LinearRegression()
clf.fit(X, y)
ypred = clf.predict(Xtest)
mses.append(mean_squared_error(ypred, ytest))
cv_errors.append(np.mean(mses))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")
<matplotlib.text.Text at 0x5f7c810>
cv_errors = []
for ncols in range(2, 6):
# each ncol corresponds to a polynomial model
X = auto_df[colnames[0:ncols]].values
y = auto_df["mpg"].values
n = len(auto_df)
bs = Bootstrap(n, train_size=int(0.9*n), test_size=int(0.1*n), n_iter=10, random_state=0)
mses = []
for train, test in bs:
# each model is resampled 10 times
Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
clf = LinearRegression()
clf.fit(X, y)
ypred = clf.predict(Xtest)
mses.append(mean_squared_error(ypred, ytest))
cv_errors.append(np.mean(mses))
plt.plot(range(1,5), cv_errors)
plt.xlabel("degree")
plt.ylabel("cv.error")
<matplotlib.text.Text at 0x5f9f810>
def alpha(x, y):
""" allocate alpha of your assets to x and (1-alpha) to y for optimum """
vx = np.var(x)
vy = np.var(y)
cxy = np.cov(x, y)
return ((vy - cxy) / (vx + vy - 2 * cxy))[0, 1]
# From ISLR package, retrieved with write.csv(Portfolio, "portfolio.csv", row.names=FALSE)
portfolio_df = pd.read_csv("../data/Portfolio.csv")
portfolio_df.head()
X | Y | |
---|---|---|
0 | -0.895251 | -0.234924 |
1 | -1.562454 | -0.885176 |
2 | -0.417090 | 0.271888 |
3 | 1.044356 | -0.734198 |
4 | -0.315568 | 0.841983 |
5 rows × 2 columns
alpha(portfolio_df["X"].values, portfolio_df["Y"].values)
0.57665115161041181
# Find the variance of alpha - shows that bootstrapping results in a near-normal distribution
X = portfolio_df["X"].values
Y = portfolio_df["Y"].values
bs = Bootstrap(len(portfolio_df), n_iter=1000, train_size=99, random_state=0)
alphas = []
for train, test in bs:
xtrain, ytrain = X[train], Y[train]
alphas.append(alpha(xtrain, ytrain))
plt.hist(alphas)
(array([ 1., 4., 27., 101., 214., 265., 231., 111., 30., 16.]), array([ 0.23030675, 0.2930067 , 0.35570664, 0.41840658, 0.48110652, 0.54380646, 0.6065064 , 0.66920634, 0.73190628, 0.79460622, 0.85730616]), <a list of 10 Patch objects>)