Take some data that includes a response variable (y) and one or more predictors (x1, x2). We assume that the first column is the response variable.
y | x1 | x2 |
---|---|---|
1.4 | 2 | 2 |
2.4 | 3.1 | 4.1 |
3.34 | 4.4 | 1.3 |
We apply linear regression several different ways.
from __future__ import print_function, division
import requests
import numpy as np
import seaborn as sns
import pandas as pd
import pymc3 as pm
import pystan
import matplotlib as mpl
import matplotlib.pyplot as plt
from StringIO import StringIO
from pprint import pprint
from IPython.display import display, HTML, SVG
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm, probplot
def uprint(astr): print(astr + "\n" + "-"*len(astr))
def show_html(astr): return display(HTML('<center>{}</center>'.format(astr)))
def show_svg(astr, w=1000, h=1000):
SVG_HEAD = '''<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">'''
SVG_START = '''<svg width="{w:}px" height="{h:}px" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink= "http://www.w3.org/1999/xlink">'''
return display(SVG(SVG_HEAD + SVG_START.format(w=w, h=h) + astr + '</svg>'))
# Plotting style
plt.rc("axes", titlesize=20, labelsize=15, linewidth=.25, edgecolor='#444444')
sns.set_context("notebook", font_scale=1.2, rc={})
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
:0: FutureWarning: IPython widgets are experimental and may change in the future.
Here we load the data into a DataFrame and look at a summary.
# Read in data from google spreadsheet
gsheet_key = "1q2xrN2YQ4KrGedaupDX2aM9dWGSmz23v7UF2GxwlO0A"
gsheet_url = "https://docs.google.com/spreadsheets/d/1q2xrN2YQ4KrGedaupDX2aM9dWGSmz23v7UF2GxwlO0A/edit?usp=sharing"
csv = requests.get("https://docs.google.com/spreadsheets/d/{}/export?gid=0&format=csv".format(gsheet_key))
df = pd.DataFrame.from_csv(StringIO(csv.text), index_col=False)
show_html("<h1>Data summary</h1>")
show_html(df.describe(percentiles=[]).to_html())
TEST_LOG = False
if TEST_LOG:
df["x3"] = np.log(df["x3"])
y | x1 | x2 | x3 | |
---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
mean | 0.016842 | -0.052540 | -0.027275 | 0.011383 |
std | 0.092049 | 0.969025 | 0.959984 | 0.375131 |
min | -0.204356 | -2.257561 | -2.447623 | -0.859929 |
50% | 0.012005 | 0.019029 | -0.072433 | 0.040495 |
max | 0.252106 | 2.304593 | 2.329134 | 0.934868 |
By standardizing the data (subtracting the mean and dividing by the standard deviation), we can avoid some common problems. Specifically, when variables are on vastly different scales it can cause issues.
See http://andrewgelman.com/2009/07/11/when_to_standar/ for a discussion of standardization.
df_fits = {}
for col in df.columns:
_df_fit = StandardScaler().fit(df[col])
df[col] = _df_fit.transform(df[col])
df_fits[col] = _df_fit.inverse_transform
show_html("<h1>Data overview, all-vs-all</h1>")
g = sns.PairGrid(df)
#g.fig.suptitle('Data overview, all-vs-all', y=1.05, fontsize=28)
g.map_upper(sns.regplot)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_diag(plt.hist)
Regression performs better if the data are approximately normal.
It's not uncommon for some data types to be closer to log-normal than normal, i.e., the data look normal after taking the log. For example, adult blood pressure is approximately log-normal. If that is the case, we will use the logged version.
fig, ax = plt.subplots(figsize=(16,12))
#fig.suptitle('Compare fits for data vs logged data ', y=1.05, fontsize=28)
show_html("<h1>Compare fits for data vs logged data</h1>")
plt.subplots_adjust(hspace=1)
eps = 1e-4
for i, var in enumerate(df.columns):
ax1 = plt.subplot(len(df.columns), 2, i*2+1)
ax2 = plt.subplot(len(df.columns), 2, i*2+2)
#
# Original data (no log applied)
#
(_osm, _osr), (_slope, _intercept, _R) = probplot(df[var])
ax1.scatter(_osm, _osr, color='steelblue')
ax1.plot([_osm.min(), _osm.max()], [_intercept + _slope*_osm.min(), _intercept + _slope*_osm.max()], color='seagreen')
ax1.set(title="{} probability plot (no log) $R^2:{:.3f}$".format(var, _R**2))
#
# Data with log applied
#
log_X_var = np.log(eps - df[var].min() + df[var])
(_osm, _osr), (_slope, _intercept, _Rlog) = probplot(log_X_var)
ax2.scatter(_osm, _osr, color='steelblue')
ax2.plot([_osm.min(), _osm.max()], [_intercept + _slope*_osm.min(), _intercept + _slope*_osm.max()], color='seagreen')
ax2.set(title="{} probability plot (log) $R^2:{:.3f}$".format(var, _Rlog**2))
#
# Decide whether to use original version or the logged version
#
if _Rlog > _R:
_tmpfn1, _tmpval = df_fits[var], df[var].min() # necessary to avoid recursion problem
df_fits[var] = lambda x: _tmpfn1(np.exp(x) + _tmpval - eps) # change inverse transform
df[var] = np.log(eps - df[var].min() + df[var])
annotate_ax = ax2
else:
annotate_ax = ax1
annotate_ax.annotate("Higher $R^2$", xy=(0.7,0.15), textcoords='axes fraction', color='indianred', fontsize=24)
plt.tight_layout()
/Users/briann/anaconda/lib/python2.7/site-packages/matplotlib/text.py:1857: UserWarning: You have used the `textcoords` kwarg, but not the `xytext` kwarg. This can lead to surprising results. warnings.warn("You have used the `textcoords` kwarg, but not "
X = df[df.columns[1:]].values
y = df[df.columns[0]].values
ols = LinearRegression(fit_intercept=True, normalize=False, n_jobs=-1)
ols.fit(X, y)
show_html('''
<h1>OLS linear regression results</h1>
<table>
<tr><td>Intercept</td><td>{intercept:.3f}</td></tr>
<tr><td>Coefficients</td><td>{coef:s}</td></tr>
<tr><td>Residual sum of squares</td><td>{rss:.3f}</td></tr>
<tr><td>$R^2$ of the prediction.</td><td>{score:.3f}</td></tr>
</table>'''.format(intercept = ols.intercept_,
coef = ', '.join("{:s}:{:.3f}".format(df.columns[1:][i], ols.coef_[i]) for i in range(len(ols.coef_))),
rss = np.mean((ols.predict(X) - y) ** 2),
score = ols.score(X, y))
)
Intercept | 0.000 |
Coefficients | x1:0.218, x2:0.163, x3:0.126 |
Residual sum of squares | 0.911 |
$R^2$ of the prediction. | 0.089 |
# http://www.stat.columbia.edu/~gelman/presentations/stantalk2014_handout.pdf
# Instead of a gamma use a half-normal, as Gelman recommends (stan manual "Truncated Priors")
stan_regress = """
data {
int N;
int K;
vector[N] y;
matrix[N,K] X;
}
parameters {
real intercept;
vector[K] betas;
real<lower=0.01> sigma;
}
model {
sigma ~ normal(0,1);
y ~ normal(intercept + X*betas, sigma);
}"""
stan_data = {'X': X, 'y': y, 'N': X.shape[0], 'K': X.shape[1]}
# Fit the model
fit = pystan.stan(model_code=stan_regress, data=stan_data, iter=1000, chains=4)
print(fit)
Inference for Stan model: anon_model_fa066f9745cd16eb2f6cf993e14d51c9. 4 chains, each with iter=1000; warmup=500; thin=1; post-warmup draws per chain=500, total post-warmup draws=2000. mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat intercept-2.4e-3 4.0e-3 0.1 -0.18 -0.07-4.7e-3 0.06 0.18 570.0 1.0 betas[0] 0.22 4.2e-3 0.1 0.02 0.15 0.22 0.29 0.42 589.0 1.0 betas[1] 0.16 4.3e-3 0.1 -0.04 0.09 0.16 0.23 0.36 583.0 1.0 betas[2] 0.13 3.9e-3 0.1 -0.06 0.06 0.13 0.19 0.31 625.0 1.0 sigma 0.98 3.3e-3 0.08 0.85 0.93 0.98 1.03 1.15 515.0 1.0 lp__ -48.47 0.08 1.62 -52.55 -49.32 -48.15 -47.28 -46.3 432.0 1.0 Samples were drawn using NUTS(diag_e) at Wed Apr 29 16:15:12 2015. For each parameter, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence, Rhat=1).
/Users/briann/anaconda/lib/python2.7/multiprocessing/queues.py:390: UserWarning: Pickling fit objects is an experimental feature! The relevant StanModel instance must be pickled along with this fit object. When unpickling the StanModel must be unpickled first. return send(obj) /Users/briann/anaconda/lib/python2.7/multiprocessing/queues.py:390: UserWarning: Pickling fit objects is an experimental feature! The relevant StanModel instance must be pickled along with this fit object. When unpickling the StanModel must be unpickled first. return send(obj) /Users/briann/anaconda/lib/python2.7/multiprocessing/queues.py:390: UserWarning: Pickling fit objects is an experimental feature! The relevant StanModel instance must be pickled along with this fit object. When unpickling the StanModel must be unpickled first. return send(obj) /Users/briann/anaconda/lib/python2.7/multiprocessing/queues.py:390: UserWarning: Pickling fit objects is an experimental feature! The relevant StanModel instance must be pickled along with this fit object. When unpickling the StanModel must be unpickled first. return send(obj)
Bayesian regression is more flexible than OLS. We can choose priors that best represent the data.
burnin, num_samples = 50, 200
np.random.seed(1)
# Add a column for the intercept (x_0)
X_1 = np.column_stack( (np.ones(X.shape[0]), X) )
beta_names = list(df.columns[1:]) #['beta_{}'.format(i) for i in range(X_1.shape[1])]
def regress(regression="ridge", error="normal", beta_names=beta_names, nu=5):
with pm.Model() as model:
#
# Priors
#
if regression == "ols":
betas = [pm.Uniform(beta, -10, 10) for beta in beta_names] # >10 is really slow...
elif regression == "ridge":
betas = [pm.Normal(beta, mu=0, tau=1.0) for beta in beta_names]
elif regression == "lasso":
betas = [pm.Laplace(beta, mu=0, b=1.0) for beta in beta_names]
else:
raise ValueError, "No regression of type {}".format(regression)
# prior on Normal/T error variance. Gamma(1,1) has mean=1, variance=1
tau = pm.Gamma('tau', 1, 1)
#
# Likelihood
#
# mu ~ beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3
mu = sum(betas[n]*X_1[:,n] for n in range(len(betas)))
if error == "normal":
y_obs = pm.Normal('y_obs', mu=mu, tau=tau, observed=y)
elif error == "T":
y_obs = pm.T('y_obs', nu=nu, mu=mu, lam=tau, observed=y)
else:
raise ValueError, "No error model of type {}".format(error)
#
# Sample
#
start = pm.find_MAP()
print("starting trace at MAP:", start)
for k in start:
if start[k] == 0: start[k] += .01 # avoids some corner cases?
step = pm.NUTS(state=start, scaling=start)
trace = pm.sample(num_samples, step, start, progressbar=True, njobs=1)[burnin:]
#pm.traceplot(trace)
return trace
trace = {}
show_html("<h1>Ordinary Least Squares</h1>")
trace['ols'] = regress(regression="ols", error="normal")
pm.traceplot(trace['ols'])
starting trace at MAP: {'x2': array(0.198376973449216), 'x3': array(0.16143339071490181), 'x1': array(3.5875903896395584e-09), 'tau': array(1.0565049625492926)} [-----------------100%-----------------] 200 of 200 complete in 0.5 sec
/Users/briann/anaconda/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility from scan_perform.scan_perform import *
show_html("<h1>Ridge Regression</h1>")
trace['ridge/T'] = regress(regression="ridge", error="T")
pm.traceplot(trace['ridge/T'])
starting trace at MAP: {'x2': array(0.21341491989839798), 'x3': array(0.18130062375987596), 'x1': array(-0.010650522835671281), 'tau': array(1.3725100112803654)} [-----------------100%-----------------] 200 of 200 complete in 1.1 sec
show_html("<h1>LASSO</h1>")
trace['lasso/T'] = regress(regression="lasso", error="T")
pm.traceplot(trace['lasso/T'])
starting trace at MAP: {'x2': array(0.20724400703292245), 'x3': array(0.173618006223192), 'x1': array(-0.0010335942764926889), 'tau': array(1.37173292764498)} [-----------------100%-----------------] 200 of 200 complete in 0.5 sec
#--------------------------------------------
# Compare models
#
stats, quartiles = {}, {}
for regression in trace:
stats[regression], quartiles[regression] = {}, {}
for var in trace[regression].varnames:
post_quar = pm.stats._calculate_posterior_quantiles(trace[regression][var], qlist=[5,25,50,75,95])
_key, quartiles[regression][var] = post_quar.next(), post_quar.next()
post_stats = pm.stats._calculate_stats(trace[regression][var], batches=100, alpha=0.05)
_key, stats[regression][var] = post_stats.next(), post_stats.next()
# Graph all models
min_beta = min(min(quartiles[regression][var].values()) for regression in trace for var in beta_names)
max_beta = max(max(quartiles[regression][var].values()) for regression in trace for var in beta_names)
rh = 20
rh2 = 20*1.5*len(beta_names)
svgw = 1000
svgh = rh2 * len(beta_names)
def _norm(val): return svgw * (val - min_beta) / (max_beta - min_beta)
def _normw(val): return svgw * val / (max_beta - min_beta)
ssvg = ''
colors = ["#DD5C5C ", "#54CD7F", "#54BFFF", "#44dd99"]
for i, beta in enumerate(beta_names):
for j, regression in enumerate(trace):
color = colors[j%len(colors)]
s, q = stats[regression][beta], quartiles[regression][beta]
x = q['q25']
w = q['q75'] - q['q25']
m, lo, hi = s['mean'], q['lo'], q['hi']
y = rh2*i + rh*j
ssvg += '<line x1="{x1:f}" y1="{y1:f}" x2="{x2:f}" y2="{y2:f}" stroke="#777777" stroke-width=".5"/>'.format(
x1=_norm(lo), y1=y+rh/2, x2=_norm(hi), y2=y+rh/2)
ssvg += '<rect x="{x:f}" y="{y:f}" width="{w:f}" height="{h:f}" fill="{color:s}" />'.format(
x=_norm(x), w=_normw(w), y=y, h=rh, color=color)
ssvg += '<rect x="{x:f}" y="{y:f}" width="2px" height="{h:f}" fill="#ffffff" />'.format(
x=_norm(m), w=2, y=y, h=rh)
ssvg += '<text x="{x:f}" y="{y:f}" text-anchor="middle" dy="0.3em">{text}</text>'.format(
x=_norm(x + w/2), y=y+rh/2, text=beta)
ssvg += '<text x="{x:f}" y="{y:f}" text-anchor="end" dy="0.3em" fill="#ffffff" font-size="10">{text}</text>'.format(
x=_norm(x + w) - 3, y=y+rh/2, text=regression)
show_html("<h1>Analysis of the difference between different regressions</h1>")
show_svg(ssvg, w=svgw, h=svgh)