This is currently mainly a converted notebook based on my try-out and test Python script.
The plan is to extend the prediction facilities in statsmodels 0.8. The current version in the PR focuses on the confidence intervals and on setting up the structure. `predict` as a method stays simple, so it can be used in a loop, for example for cross-validation, without extra overhead. All additional prediction features will be provided by a results instance returned by a new `get_prediction` method.
The discussion of the design for these enhancements is spread out over several issues on GitHub. I am not yet happy with some of the names of the attributes or of the columns in the data frame.
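As a minimal sketch of the intended call pattern (the demo variables here are made up for illustration; `predict` and `get_prediction` are the names used in the PR):

import numpy as np
import statsmodels.api as sm

x_demo = np.arange(20.)
y_demo = 1 + 0.5 * x_demo + np.random.normal(size=20)
res_demo = sm.OLS(y_demo, sm.add_constant(x_demo)).fit()

y_hat = res_demo.predict()              # plain ndarray, cheap inside loops
pred_demo = res_demo.get_prediction()   # results instance with inference
print(pred_demo.conf_int(alpha=0.05)[:3])
print(pred_demo.summary_frame().head())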
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 20 12:01:13 2014
Author: Josef Perktold
License for Code: BSD-3
"""
import numpy as np
from statsmodels.regression.linear_model import OLS, WLS
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.regression._prediction import get_prediction
from statsmodels.genmod._prediction import params_transform_univariate
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)
# from example wls.py
np.random.seed(123456)
nsample = 50
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, (x - 5)**2))
from statsmodels.tools.tools import add_constant
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[nsample * 6 // 10:] = 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]  # drop the quadratic term; estimate with constant and linear term only
# minimal version
mod_wls = WLS(y, X, weights=1./w)
res_wls = mod_wls.fit()
pred_res = get_prediction(res_wls)
pf = pred_res.summary_frame()
pf.head()
|   | mean | mean_se | mean_ci_lower | mean_ci_upper | obs_ci_lower | obs_ci_upper |
|---|---|---|---|---|---|---|
| 0 | 5.138253 | 0.198164 | 4.739818 | 5.536688 | 3.744384 | 6.532122 |
| 1 | 5.306936 | 0.191008 | 4.922888 | 5.690984 | 3.917112 | 6.696761 |
| 2 | 5.475619 | 0.183983 | 5.105697 | 5.845542 | 4.089632 | 6.861607 |
| 3 | 5.644303 | 0.177102 | 5.288214 | 6.000391 | 4.261943 | 7.026662 |
| 4 | 5.812986 | 0.170385 | 5.470404 | 6.155568 | 4.434044 | 7.191928 |
# ### WLS knowing the true variance ratio of heteroscedasticity
mod_wls = WLS(y, X, weights=1./w)
res_wls = mod_wls.fit()
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)
from numpy.testing import assert_allclose
assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)
print(pred_res.summary_frame().head())
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  5.138253  0.198164       4.739818       5.536688      3.744384      6.532122
1  5.306936  0.191008       4.922888       5.690984      3.917112      6.696761
2  5.475619  0.183983       5.105697       5.845542      4.089632      6.861607
3  5.644303  0.177102       5.288214       6.000391      4.261943      7.026662
4  5.812986  0.170385       5.470404       6.155568      4.434044      7.191928
# the same results using the `get_prediction` method instead of the function
pred_res2 = res_wls.get_prediction()
ci2 = pred_res2.conf_int(obs=True)
assert_allclose(pred_res2.se_obs, prstd, rtol=1e-13)
assert_allclose(ci2, np.column_stack((iv_l, iv_u)), rtol=1e-13)
print(pred_res2.summary_frame().head())
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  5.138253  0.198164       4.739818       5.536688      3.744384      6.532122
1  5.306936  0.191008       4.922888       5.690984      3.917112      6.696761
2  5.475619  0.183983       5.105697       5.845542      4.089632      6.861607
3  5.644303  0.177102       5.288214       6.000391      4.261943      7.026662
4  5.812986  0.170385       5.470404       6.155568      4.434044      7.191928
pf = pred_res2.summary_frame()
cols = pf.columns.tolist()
cols.remove('mean_se')
ax = pf[cols].plot()
ax.plot(y, 'o')
ax.fill_between(np.asarray(pf.index), pf['obs_ci_upper'].values, pf['obs_ci_lower'].values, alpha=0.1)
ax.fill_between(np.asarray(pf.index), pf['mean_ci_upper'], pf['mean_ci_lower'].values, alpha=0.1)
# there should be a pretty plotting method in `get_prediction` results
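A rough sketch of what such a helper could look like (the function `plot_prediction` is hypothetical and not part of the PR):

def plot_prediction(pred_res, y_obs, ax=None):
    # hypothetical helper: plot the mean prediction together with the
    # confidence interval for the mean and the prediction interval
    if ax is None:
        fig, ax = plt.subplots()
    frame = pred_res.summary_frame()
    idx = np.asarray(frame.index)
    ax.plot(idx, frame['mean'].values, label='predicted mean')
    ax.plot(idx, y_obs, 'o', label='observed')
    ax.fill_between(idx, frame['obs_ci_lower'].values,
                    frame['obs_ci_upper'].values, alpha=0.1)
    ax.fill_between(idx, frame['mean_ci_lower'].values,
                    frame['mean_ci_upper'].values, alpha=0.2)
    ax.legend(loc='best')
    return ax

plot_prediction(pred_res2, y)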
We can also calculate the confidence interval for a significance level different from the default 0.05. For example, we can get the confidence intervals with a coverage of 1 - alpha = 0.9 by specifying `alpha=0.1` as keyword argument.
pf01 = pred_res2.summary_frame(alpha=0.1)
print(pred_res2.conf_int(alpha=0.1)[:5])
print(pred_res2.conf_int(obs=True, alpha=0.1)[:5])
pf01.head()
[[ 4.80588804  5.47061803]
 [ 4.98657249  5.62730001]
 [ 5.1670392   5.78419975]
 [ 5.34726223  5.94134314]
 [ 5.52721209  6.09875972]]
[[ 3.9755207   6.30098538]
 [ 4.1475774   6.4662951 ]
 [ 4.31946127  6.63177767]
 [ 4.49117085  6.79743453]
 [ 4.66270478  6.96326703]]
|   | mean | mean_se | mean_ci_lower | mean_ci_upper | obs_ci_lower | obs_ci_upper |
|---|---|---|---|---|---|---|
| 0 | 5.138253 | 0.198164 | 4.805888 | 5.470618 | 3.975521 | 6.300985 |
| 1 | 5.306936 | 0.191008 | 4.986572 | 5.627300 | 4.147577 | 6.466295 |
| 2 | 5.475619 | 0.183983 | 5.167039 | 5.784200 | 4.319461 | 6.631778 |
| 3 | 5.644303 | 0.177102 | 5.347262 | 5.941343 | 4.491171 | 6.797435 |
| 4 | 5.812986 | 0.170385 | 5.527212 | 6.098760 | 4.662705 | 6.963267 |
The confidence interval depends on the covariance matrix of the parameter estimates and on whether we use the t or the normal distribution. The distribution can be selected with `use_t`, where `use_t=True` requests the t distribution and `use_t=False` requests the normal distribution. We can also choose a covariance matrix of the parameter estimates that is robust to heteroscedasticity or to correlation across observations with the `cov_type` argument.
As it turns out, using the heteroscedasticity-robust covariance implies, in this case, slightly narrower confidence intervals than the nonrobust covariance.
res_wls_n = mod_wls.fit(use_t=False)
pred_wls_n = res_wls_n.get_prediction()
print(pred_wls_n.summary_frame().head())
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  5.138253  0.198164       4.749859       5.526647      3.779512      6.496994
1  5.306936  0.191008       4.932567       5.681306      3.952137      6.661735
2  5.475619  0.183983       5.115020       5.836219      4.124561      6.826678
3  5.644303  0.177102       5.297188       5.991417      4.296780      6.991825
4  5.812986  0.170385       5.479037       6.146934      4.468795      7.157177
mod_ols = OLS(y, X)
res_ols = mod_ols.fit(cov_type='HC3')
pf = res_ols.get_prediction().summary_frame()
ax = pf[cols].plot() # cols defined before
ax.plot(y, 'o')
print(pf.head())
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  5.239191  0.186230       4.874186       5.604195      3.253283      7.225098
1  5.400867  0.179019       5.049996       5.751739      3.417509      7.384225
2  5.562544  0.172015       5.225400       5.899688      3.581568      7.543520
3  5.724221  0.165245       5.400347       6.048094      3.745460      7.702981
4  5.885897  0.158737       5.574779       6.197015      3.909184      7.862610
res_ols = mod_ols.fit()
pf = res_ols.get_prediction().summary_frame()
ax = pf[cols].plot() # cols defined before
ax.plot(y, 'o')
print(pf.head())
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  5.239191  0.277531       4.681178       5.797203      3.160356      7.318025
1  5.400867  0.269166       4.859672       5.942062      3.326483      7.475251
2  5.562544  0.260899       5.037972       6.087116      3.492435      7.632653
3  5.724221  0.252738       5.216057       6.232384      3.658208      7.790233
4  5.885897  0.244694       5.393907       6.377888      3.823803      7.947991
We can also estimate the same linear models using Generalized Linear Models (GLM), which default to the normal distribution. GLM corresponds in this case to OLS; there is currently no WLS equivalent in GLM since it does not yet allow for user-specified weights. In the following we rescale the variables to take account of the weights. Note that this implies that we get the same parameter estimates and the same covariance matrix, but the predictions and other statistics are for the transformed model and differ from those of WLS.
We only have linear prediction in this example since it is a linear model. The prediction for nonlinear link functions is a nonlinear transformation of the linear prediction. The confidence intervals in this case are by default obtained by endpoint transformation, which means that we calculate the confidence interval for the linear prediction and nonlinearly transform the lower and upper bounds. This is supposed to have, in general, better coverage than the delta method. The delta method uses a local linear approximation and is not yet implemented for the prediction interval.
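As a minimal sketch of endpoint transformation (the function is hypothetical; it assumes a normal-based interval on the linear predictor and a monotonic inverse link such as exp):

from scipy import stats

def ci_endpoint_transform(lin_pred, se_lin, inverse_link=np.exp, alpha=0.05):
    # hypothetical sketch: confidence interval on the linear predictor scale
    q = stats.norm.ppf(1 - alpha / 2.)
    lower = lin_pred - q * se_lin
    upper = lin_pred + q * se_lin
    # transform the endpoints; a monotonic inverse link preserves the bounds
    return inverse_link(lower), inverse_link(upper)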
There is another difference between the linear model with the assumption of normally distributed errors and general nonlinear and nonnormal models like those in GLM or in `discrete`. In the linear normal model the sampling distribution of our parameter estimates and the distribution of the error term, or equivalently of an observation, are both either t or normally distributed under our assumptions. We can therefore combine them and calculate a confidence interval for a new observation based on either the normal or the t distribution. This is not the case in nonnormal models. Take the Poisson distribution as an example: an observation for given parameters is Poisson distributed, but our parameter estimates are (asymptotically) normally distributed. We would need to combine those distributions in order to obtain a confidence interval for a new observation. This is not possible in closed form, but we could simulate it based on our estimated model or by bootstrapping.
Currently, only the confidence intervals for the "mean", that is, for the conditional expectation, are available for nonnormal models.
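As a rough sketch of the simulation approach mentioned above (the helper is hypothetical; it assumes a fitted Poisson GLM with log link, and `exog_new` is a 1-d array of explanatory variables for the new observation):

def simulate_poisson_obs_interval(res, exog_new, alpha=0.05, n_rep=10000):
    # hypothetical sketch: combine the asymptotic normal distribution of the
    # parameter estimates with the Poisson distribution of a new observation
    rng = np.random.RandomState(12345)
    params_sim = rng.multivariate_normal(res.params, res.cov_params(),
                                         size=n_rep)
    mu_sim = np.exp(params_sim.dot(exog_new))  # conditional mean per draw
    y_sim = rng.poisson(mu_sim)                # Poisson observation noise
    return np.percentile(y_sim, [100 * alpha / 2., 100 * (1 - alpha / 2.)])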
from statsmodels.genmod.generalized_linear_model import GLM
w_sqrt = np.sqrt(w)
mod_glm = GLM(y/w_sqrt, X/w_sqrt[:,None])
res_glm = mod_glm.fit()
pred_glm = res_glm.get_prediction()
print(pred_glm.summary_frame().head())
       mean   mean_se  mean_ci_lower  mean_ci_upper
0  5.110110  0.213037       4.692565       5.527655
1  5.275544  0.205344       4.873077       5.678012
2  5.440978  0.197792       5.053314       5.828643
3  5.606413  0.190395       5.233246       5.979580
4  5.771847  0.183173       5.412834       6.130860
GLM uses the normal distribution for parameter inference by default. We can use the t distribution by setting the keyword argument `use_t=True`.
res_glm_t = mod_glm.fit(use_t=True)
pred_glm_t = res_glm_t.get_prediction()
print(pred_glm_t.summary_frame().head())
       mean   mean_se  mean_ci_lower  mean_ci_upper
0  5.110110  0.213037       4.681771       5.538449
1  5.275544  0.205344       4.862672       5.688417
2  5.440978  0.197792       5.043292       5.838665
3  5.606413  0.190395       5.223598       5.989227
4  5.771847  0.183173       5.403553       6.140141
This is not really prediction, but it uses parts of the same code. It is, or will be, equivalent to Stata's `eform` for exponentiated parameters.
rates = params_transform_univariate(res_glm.params, res_glm.cov_params())
print('\nRates exp(params)')
print(rates.summary_frame())
rates2 = np.column_stack((np.exp(res_glm.params),
res_glm.bse * np.exp(res_glm.params),
np.exp(res_glm.conf_int())))
assert_allclose(rates.summary_frame().values, rates2, rtol=1e-13)
Rates exp(params)
         mean    mean_se  mean_ci_lower  mean_ci_upper
0  165.688573  35.297780     109.132780     251.553230
1    1.499773   0.034228       1.434167       1.568381
from statsmodels.genmod.families import links
# with identity transform
pt = params_transform_univariate(res_glm.params, res_glm.cov_params(), link=links.identity())
print(pt.tvalues)
assert_allclose(pt.tvalues, res_glm.tvalues, rtol=1e-13)
assert_allclose(pt.se_mean, res_glm.bse, rtol=1e-13)
ptt = pt.t_test()
assert_allclose(ptt[0], res_glm.tvalues, rtol=1e-13)
assert_allclose(ptt[1], res_glm.pvalues, rtol=1e-13)
[ 23.98697085 17.75989187]