import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
# Data exported from the R ISLR package: write.csv(Boston, "Boston.csv", row.names = FALSE)
boston_df = pd.read_csv("../data/Boston.csv")
boston_df.head()
|   | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |

5 rows × 14 columns
# lstat - percentage of the population with lower socioeconomic status; medv - median home value (in $1000s)
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
# The statsmodels library provides a smaller set of models than scikit-learn, but puts
# more emphasis on parameter estimation and statistical testing. Its summary output is
# similar to R's summary() function.
# X is a 2D array of predictor values; y is a 1D array of response values
X = boston_df[["lstat"]].values
X = sm.add_constant(X) # add the intercept term
y = boston_df["medv"].values
ols = sm.OLS(y, X).fit()
ols.summary()
| Dep. Variable: | y | R-squared: | 0.544 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.543 |
| Method: | Least Squares | F-statistic: | 601.6 |
| Date: | Fri, 23 May 2014 | Prob (F-statistic): | 5.08e-88 |
| Time: | 20:55:56 | Log-Likelihood: | -1641.5 |
| No. Observations: | 506 | AIC: | 3287. |
| Df Residuals: | 504 | BIC: | 3295. |
| Df Model: | 1 | | |

|   | coef | std err | t | P>\|t\| | [95.0% Conf. Int.] |
|---|---|---|---|---|---|
| const | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 35.659 |
| x1 | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 -0.874 |

| Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
| Skew: | 1.453 | Prob(JB): | 5.36e-64 |
| Kurtosis: | 5.319 | Cond. No. | 29.7 |
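# The fitted results object also exposes these statistics programmatically, much
# like R's coef() and confint():
print(ols.params)      # coefficient estimates (const, x1)
print(ols.pvalues)     # two-sided p-values for each coefficient
print(ols.conf_int())  # 95% confidence intervals
print(ols.rsquared)    # 0.544, as reported in the summary above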
# scikit-learn provides a larger number of models, but approaches them from a Machine
# Learning point of view and doesn't report the statistical tests shown above. However,
# it produces an identical linear model, as shown below:
reg = LinearRegression()
X = boston_df[["lstat"]].values
y = boston_df["medv"].values
reg.fit(X, y)
(reg.intercept_, reg.coef_)
(34.553840879383131, array([-0.95004935]))
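# scikit-learn does expose R-squared through score(); this should match the
# statsmodels value of 0.544 above:
reg.score(X, y)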
# Drawing the regression line on top of the scatterplot
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
xs = np.arange(boston_df["lstat"].min(), boston_df["lstat"].max())
ys = reg.predict(xs.reshape(-1, 1))  # predict() expects a 2D array of inputs
ax.plot(xs, ys, 'r', linewidth=2.5)
# Prediction
test_data = [[5], [10], [15]]
reg.predict(test_data)
array([ 29.80359411, 25.05334734, 20.30310057])
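# R's predict() can also return confidence and prediction intervals. A sketch of the
# same with statsmodels (get_prediction() is available in newer statsmodels releases):
pred = ols.get_prediction(sm.add_constant(np.array([[5.0], [10.0], [15.0]])))
pred.summary_frame(alpha=0.05)  # mean_ci_* and obs_ci_* columns hold the intervals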
# regression with 2 input columns
X = boston_df[["lstat", "age"]]
reg2 = LinearRegression()
reg2.fit(X, y)
(reg2.intercept_, reg2.coef_)
(33.222760531792929, array([-1.03206856, 0.03454434]))
# regression using all input columns
xcols = boston_df.columns[0:-1]
X = boston_df[xcols]
reg3 = LinearRegression()
reg3.fit(X, y)
(reg3.intercept_, reg3.coef_)
(36.459488385089394, array([ -1.08011358e-01, 4.64204584e-02, 2.05586264e-02, 2.68673382e+00, -1.77666112e+01, 3.80986521e+00, 6.92224640e-04, -1.47556685e+00, 3.06049479e-01, -1.23345939e-02, -9.52747232e-01, 9.31168327e-03, -5.24758378e-01]))
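# To see which coefficient belongs to which predictor, pair them up with the
# column names; a pandas Series makes this readable:
pd.Series(reg3.coef_, index=xcols)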
# Plotting a fitted regression with R returns 4 graphs - Residuals vs Fitted, Normal Q-Q,
# Scale-Location (Standardized Residuals vs Fitted), and Residuals vs Leverage. Only the
# Q-Q plot is available from statsmodels. The residuals-vs-fitted function implemented
# below covers plots #1 and #3; a Residuals vs Leverage sketch follows further below.
def residuals_vs_fitted(fitted, residuals, xlabel, ylabel):
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.scatter(fitted, residuals)
    # model any non-linearity in the residuals with a quadratic fit
    polyline = np.poly1d(np.polyfit(fitted, residuals, 2))
    xs = np.linspace(np.min(fitted), np.max(fitted))
    plt.plot(xs, polyline(xs), color='r', linewidth=2.5)
def qq_plot(residuals):
    sm.qqplot(residuals)

def standardize(xs):
    xmean = np.mean(xs)
    xstd = np.std(xs)
    return (xs - xmean) / xstd
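# For reference, scipy ships an equivalent helper: scipy.stats.zscore computes
# the same quantity (both default to the population standard deviation).
from scipy import stats
np.allclose(stats.zscore(y), standardize(y))  # True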
fitted = reg3.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
fig = sm.qqplot(residuals, dist="norm", line="r")
residuals_vs_fitted(fitted, std_residuals, "Fitted", "Std.Residuals")
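# The Residuals vs Leverage plot promised above can be sketched with statsmodels'
# influence diagnostics (a sketch: it refits the same model with sm.OLS in order
# to reach the influence measures):
influence = sm.OLS(y, sm.add_constant(X)).fit().get_influence()
plt.scatter(influence.hat_matrix_diag, influence.resid_studentized_internal)
plt.xlabel("Leverage")
plt.ylabel("Standardized residuals")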
# fitting medv ~ lstat * age
boston_df["lstat*age"] = boston_df["lstat"] * boston_df["age"]
reg5 = LinearRegression()
X = boston_df[["lstat", "age", "lstat*age"]]
y = boston_df["medv"]
reg5.fit(X, y)
(reg5.intercept_, reg5.coef_)
(36.088535934612942, array([ -1.39211684e+00, -7.20859509e-04, 4.15595185e-03]))
fitted = reg5.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
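# As an aside, statsmodels' formula API handles R-style interaction terms directly,
# so the product column need not be built by hand:
import statsmodels.formula.api as smf
smf.ols("medv ~ lstat * age", data=boston_df).fit().params  # expands to lstat + age + lstat:age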
# fitting medv ~ lstat + I(lstat^2)
boston_df["lstat^2"] = boston_df["lstat"] ** 2
reg6 = LinearRegression()
X = boston_df[["lstat", "lstat^2"]]
y = boston_df["medv"]
reg6.fit(X, y)
# save the predicted ys for given xs for a later plot
lstats = boston_df["lstat"].values
xs = np.arange(np.min(lstats), np.max(lstats))
ys6 = reg6.predict(np.column_stack([xs, xs ** 2]))  # predict() expects a 2D array
(reg6.intercept_, reg6.coef_)
(42.862007328169383, array([-2.3328211 , 0.04354689]))
fitted = reg6.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
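# The same quadratic fit via the formula API used above, mirroring R's I() notation
# (a sketch, using patsy's I() to compute the squared term inside the formula):
smf.ols("medv ~ lstat + I(lstat**2)", data=boston_df).fit().params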
# fitting medv ~ poly(lstat,4). We already have lstat^2 and lstat from previous cells
boston_df["lstat^4"] = np.power(boston_df["lstat"], 4)
boston_df["lstat^3"] = np.power(boston_df["lstat"], 3)
X = boston_df[["lstat^4", "lstat^3", "lstat^2", "lstat"]]
y = boston_df["medv"]
reg7 = LinearRegression()
reg7.fit(X, y)
ys7 = reg7.predict(np.column_stack([xs ** 4, xs ** 3, xs ** 2, xs]))  # 2D input again
(reg7.intercept_, reg7.coef_)
fitted = reg7.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
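# Building the power columns by hand gets tedious; scikit-learn can generate them
# (a sketch, assuming PolynomialFeatures and make_pipeline are available in the
# installed scikit-learn version):
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
poly4 = make_pipeline(PolynomialFeatures(degree=4, include_bias=False), LinearRegression())
poly4.fit(boston_df[["lstat"]], boston_df["medv"])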
# Plot the different fits. Note that the green line (reg7) follows the distribution
# better than the red line (reg6).
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
plt.plot(xs, ys6, color='r', linewidth=2.5)
plt.plot(xs, ys7, color='g', linewidth=2.5)
# Data exported from the R ISLR package: write.csv(Carseats, "Carseats.csv", row.names = FALSE)
carseats_df = pd.read_csv("../data/Carseats.csv")
carseats_df.head()
|   | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
| 1 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
| 2 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
| 3 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
| 4 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |

5 rows × 11 columns
# convert non-numeric columns to integer factor codes
carseats_df["ShelveLoc"] = pd.factorize(carseats_df["ShelveLoc"])[0]
carseats_df["Urban"] = pd.factorize(carseats_df["Urban"])[0]
carseats_df["US"] = pd.factorize(carseats_df["US"])[0]
# Sales ~ . + Income:Advertising + Age:Price
carseats_df["Income:Advertising"] = carseats_df["Income"] * carseats_df["Advertising"]
carseats_df["Age:Price"] = carseats_df["Age"] * carseats_df["Price"]
X = carseats_df[carseats_df.columns[1:]]  # all columns except Sales
y = carseats_df["Sales"]
reg = LinearRegression()
reg.fit(X, y)
(reg.intercept_, reg.coef_)
# R has a contrasts() function that shows how factors are encoded by default. We can do
# this manually using scikit-learn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
colnames = ["ShelveLoc", "Urban", "US"]
enc = OneHotEncoder()
X = carseats_df[colnames]
enc.fit(X)
X_tr = enc.transform(X).toarray()
colnos = enc.n_values_  # number of distinct values per input column
                        # (newer scikit-learn versions expose categories_ instead)
colnames_tr = []
for (idx, colname) in enumerate(colnames):
    for i in range(0, colnos[idx]):
        colnames_tr.append(colname + "_" + str(i))
for (col, colname_tr) in enumerate(colnames_tr):
    carseats_df[colname_tr] = X_tr[:, col]
del carseats_df["ShelveLoc"]
del carseats_df["Urban"]
del carseats_df["US"]
carseats_df[colnames_tr].head()
|   | ShelveLoc_0 | ShelveLoc_1 | ShelveLoc_2 | Urban_0 | Urban_1 | US_0 | US_1 |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
| 3 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
| 4 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |

5 rows × 7 columns
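# pandas can produce equivalent indicator columns in one call with get_dummies,
# working directly on string values (shown here on a small example, since the
# original string columns were overwritten by the factor codes above):
pd.get_dummies(pd.DataFrame({"ShelveLoc": ["Bad", "Good", "Medium", "Bad"]}))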
We write a convenience function that draws a scatter plot of two variables and overlays the fitted regression line.
def regplot(x, y, xlabel, ylabel, dot_style, line_color):
    x = x.values
    y = y.values
    reg = LinearRegression()
    reg.fit(x.reshape(-1, 1), y)  # fit() expects a 2D array of predictors
    plt.scatter(x, y, marker=dot_style)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    xs = np.arange(np.min(x), np.max(x))
    ys = reg.predict(xs.reshape(-1, 1))  # predict() expects 2D input as well
    plt.plot(xs, ys, color=line_color, linewidth=2.5)
regplot(carseats_df["Price"], carseats_df["Sales"], "Price", "Sales", 'o', 'r')