#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import seaborn as sns import statsmodels.api as sm pd.options.display.max_rows = 10 # In[2]: df = (sns.load_dataset('titanic') .dropna(subset=['survived', 'age'])) df.head() # In[3]: mod = sm.Logit.from_formula('survived ~ age', df) res = mod.fit() me = res.get_margeff() # In[4]: res.summary() # In[5]: me.summary() # Using Greene's notation here (~page 736). We've got a few things # # - Exogenous: $X$, a single row is $x.T$ # - estimated coef $\hat{\beta}$ # - predicted prob $\hat{F} = \Lambda(\bf(x)^T\hat{\bf{\beta}})$ # where $\Lambda$ is the logistic CDF # - Variance of $\hat{\beta} = V$, from statsmodels # # We want a prediction interval for the predicted probability around the CEF. We have # # $Asy. Var[\hat{F}] = [\partial \hat{F} / \partial \hat{\beta}]\bf{V} [\partial \hat{F} / \partial \hat{\beta}]$ # # And the vector of derivatives is # # $[\partial \hat{F} / \partial \hat{\beta}] = [d\hat{F}/dz][\partial z / \partial \hat{\beta}] = \hat{f} x$ # # where $\hat{f} = \hat{\Lambda}(1 - \hat{\Lambda})$ (the logistic pdf I beleive). # # And so # # $Asy.Var[\hat{F}] = \hat{f}^2 x^T\bf{V}x$ # # Note that the $x$ here is a single *row* from the matrix. But vectors are always column vectors so $x$ is $Kx1$, where $K$ is the number of predictors (2 for us). # In[6]: from scipy import stats Λ = lambda x: stats.logistic().cdf(x) λ = lambda x: stats.logistic().pdf(x) β_ = res.params.values.reshape(-1, 1) V_ = res.cov_params().values # In[7]: # this is the equation from above for Var(F) # only works for a single observation though def var_π(x, β, V_): # λ(z)**s * x.T @ V_ @ x prob = λ(x.T.dot(β))**2 * x.T.dot(V_).dot(x) # maybe this is it? return prob # In[8]: # for multiple observations, but not quite vectorized fully def var_πs(xx, β, V_): α = λ(xx.dot(β))**2 out = np.empty((500, 1)) for i, x in enumerate(xx): out[i] = x.T.dot(V_).dot(x) return α * out # In[9]: # Making some fake data. xx = sm.add_constant(np.linspace(df.age.min(), df.age.max(), 500).reshape(-1, 1)) πs = Λ(xx.dot(β_)) vv = np.sqrt(var_πs(xx, β_, V_)) # In[10]: plt.plot(xx[:, 1], πs) plt.fill_between(xx[:, 1], (πs - 1.96*vv).ravel(), (πs + 1.96*vv).ravel(), alpha=.25, color='g') # In[ ]: