Statsmodels Summary class refactor

In [1]:
from statsmodels.iolib.summary import (Summary, summary_params, summary_model)
import statsmodels.api as sm
import numpy as np

Estimate OLS regression

In [2]:
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)
lpm_mod = sm.OLS(spector_data.endog, spector_data.exog)
res = lpm_mod.fit()

Compute diagnostics and store them in a Dict

In [3]:
from statsmodels.stats.stattools import (jarque_bera, omni_normtest, durbin_watson)
jb, jbpv, skew, kurtosis = jarque_bera(res.wresid)
omni, omnipv = omni_normtest(res.wresid)
diagnostic = {'Omnibus:': "%.3f" % omni,
              'Prob(Omnibus):': "%.3f" % omnipv,
              'Skew:': "%.3f" % skew,
              'Kurtosis:': "%.3f" % kurtosis,
              'Durbin-Watson:': "%.3f" % durbin_watson(res.wresid),
              'Jarque-Bera (JB):': "%.3f" % jb,
              'Prob(JB):': "%.3f" % jbpv
              }
print diagnostic
{'Prob(Omnibus):': '0.916', 'Durbin-Watson:': '2.346', 'Prob(JB):': '0.920', 'Jarque-Bera (JB):': '0.167', 'Skew:': '0.141', 'Kurtosis:': '2.786', 'Omnibus:': '0.176'}

Use helper function to produce a Dict with information about the model (should produce something for most results instances)

In [4]:
model_info = summary_model(res)
print model_info
OrderedDict([('Model:', 'OLS'), ('Dependent Variable:', 'y'), ('Date:', '2013-02-02 19:27'), ('No. Observations:', '    32'), ('Df Model:', '     3'), ('Df Residuals:', '    28'), ('Scale:', '0.1506'), ('R-squared:', '0.4159'), ('Adj. R-squared:', '0.3533'), ('AIC:', '33.9565'), ('BIC:', '39.8194'), ('Log-Likelihood:', '-12.9782')])

Use helper function to produce a DataFrame with information about the estimated parameters (for typical regression use-cases)

In [5]:
params_info = summary_params(res)
params_info
Out[5]:
Coef. Std.Err. t P>|t| [0.025 0.975]
x1 0.4639 0.1620 2.8641 0.0078 0.1321 0.7956
x2 0.0105 0.0195 0.5387 0.5944 -0.0294 0.0504
x3 0.3786 0.1392 2.7200 0.0111 0.0935 0.6636
const -1.4980 0.5239 -2.8594 0.0079 -2.5712 -0.4249

Create Summary Class object. Add elements in the order you want to see them printed.

In [6]:
smry = Summary()
smry.add_dict(model_info)
smry.add_df(params_info)
smry.add_dict(diagnostic)
print smry
=============================================================
Model:              OLS              Scale:          0.1506  
Dependent Variable: y                R-squared:      0.4159  
Date:               2013-02-02 19:27 Adj. R-squared: 0.3533  
No. Observations:   32               AIC:            33.9565 
Df Model:           3                BIC:            39.8194 
Df Residuals:       28               Log-Likelihood: -12.9782
-------------------------------------------------------------
          Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------
x1        0.4639    0.1620   2.8641  0.0078   0.1321   0.7956
x2        0.0105    0.0195   0.5387  0.5944  -0.0294   0.0504
x3        0.3786    0.1392   2.7200  0.0111   0.0935   0.6636
const    -1.4980    0.5239  -2.8594  0.0079  -2.5712  -0.4249
-------------------------------------------------------------
Prob(Omnibus):            0.916        Skew:            0.141
Durbin-Watson:            2.346        Kurtosis:        2.786
Prob(JB):                 0.920        Omnibus:         0.176
Jarque-Bera (JB):         0.167                              
=============================================================

model_info displayed in 3 columns instead of 2 (notice the automatic padding)

In [7]:
smry = Summary()
smry.add_dict(model_info, ncols=3)
smry.add_df(params_info)
print smry
==================================================================================
Model:              OLS              Df Model:     3      Adj. R-squared: 0.3533  
Dependent Variable: y                Df Residuals: 28     AIC:            33.9565 
Date:               2013-02-02 19:27 Scale:        0.1506 BIC:            39.8194 
No. Observations:   32               R-squared:    0.4159 Log-Likelihood: -12.9782
----------------------------------------------------------------------------------
                Coef.      Std.Err.        t        P>|t|       [0.025      0.975]
----------------------------------------------------------------------------------
x1              0.4639       0.1620      2.8641     0.0078      0.1321      0.7956
x2              0.0105       0.0195      0.5387     0.5944     -0.0294      0.0504
x3              0.3786       0.1392      2.7200     0.0111      0.0935      0.6636
const          -1.4980       0.5239     -2.8594     0.0079     -2.5712     -0.4249
==================================================================================

Float formatting for summary_params (Note: I also changed the print order)

In [8]:
params_info = summary_params(res)
smry = Summary()
smry.add_df(params_info, float_format='%.1f')
smry.add_dict(diagnostic, ncols=2)
smry.add_dict(model_info, ncols=2)
print smry
=============================================================
          Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------
x1        0.4639    0.1620   2.8641  0.0078   0.1321   0.7956
x2        0.0105    0.0195   0.5387  0.5944  -0.0294   0.0504
x3        0.3786    0.1392   2.7200  0.0111   0.0935   0.6636
const    -1.4980    0.5239  -2.8594  0.0079  -2.5712  -0.4249
-------------------------------------------------------------
Prob(Omnibus):            0.916        Skew:            0.141
Durbin-Watson:            2.346        Kurtosis:        2.786
Prob(JB):                 0.920        Omnibus:         0.176
Jarque-Bera (JB):         0.167                              
-------------------------------------------------------------
Model:              OLS              Scale:          0.1506  
Dependent Variable: y                R-squared:      0.4159  
Date:               2013-02-02 19:27 Adj. R-squared: 0.3533  
No. Observations:   32               AIC:            33.9565 
Df Model:           3                BIC:            39.8194 
Df Residuals:       28               Log-Likelihood: -12.9782
=============================================================

Display arbirary Numpy arrays

In [9]:
array2d = np.array([
    [123456, 'Other text here'],
    ['Some text over here', 654321]
    ])
array3d = np.array([
    ['Row 1', 123456, 'Other text here'],
    ['Row 2', 'Some text over here', 654321],
    ['Row 3', 'Some text over here', 654321]
    ])
print array2d
print array3d
[['123456' 'Other text here']
 ['Some text over here' '654321']]
[['Row 1' '123456' 'Other text here']
 ['Row 2' 'Some text over here' '654321']
 ['Row 3' 'Some text over here' '654321']]
In [10]:
smry = Summary()
smry.add_array(array2d)
smry.add_array(array3d)
print smry
=========================================
             123456       Other text here
Some text over here                654321
-----------------------------------------
Row 1              123456 Other text here
Row 2 Some text over here          654321
Row 3 Some text over here          654321
=========================================

Add custom field to the model info

In [11]:
model_info = summary_model(res)
model_info['Custom entry:'] = 'blah'
smry = Summary()
smry.add_dict(model_info, ncols=2)
smry.add_df(params_info)
print smry
=============================================================
Model:              OLS              R-squared:      0.4159  
Dependent Variable: y                Adj. R-squared: 0.3533  
Date:               2013-02-02 19:27 AIC:            33.9565 
No. Observations:   32               BIC:            39.8194 
Df Model:           3                Log-Likelihood: -12.9782
Df Residuals:       28               Custom entry:   blah    
Scale:              0.1506                                   
-------------------------------------------------------------
          Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------
x1        0.4639    0.1620   2.8641  0.0078   0.1321   0.7956
x2        0.0105    0.0195   0.5387  0.5944  -0.0294   0.0504
x3        0.3786    0.1392   2.7200  0.0111   0.0935   0.6636
const    -1.4980    0.5239  -2.8594  0.0079  -2.5712  -0.4249
=============================================================

In [12]:
smry
Out[12]:
Model: OLS R-squared: 0.4159
Dependent Variable: y Adj. R-squared: 0.3533
Date: 2013-02-02 19:27 AIC: 33.9565
No. Observations: 32 BIC: 39.8194
Df Model: 3 Log-Likelihood: -12.9782
Df Residuals: 28 Custom entry: blah
Scale: 0.1506
Coef. Std.Err. t P>|t| [0.025 0.975]
x1 0.4639 0.1620 2.8641 0.0078 0.1321 0.7956
x2 0.0105 0.0195 0.5387 0.5944 -0.0294 0.0504
x3 0.3786 0.1392 2.7200 0.0111 0.0935 0.6636
const -1.4980 0.5239 -2.8594 0.0079 -2.5712 -0.4249
In [13]:
print smry.as_latex()
\begin{table}
%\caption{}
%\label{}
\begin{tabular}{llll}
\hline
Model:              & OLS              & R-squared:      & 0.4159    \\
Dependent Variable: & y                & Adj. R-squared: & 0.3533    \\
Date:               & 2013-02-02 19:27 & AIC:            & 33.9565   \\
No. Observations:   & 32               & BIC:            & 39.8194   \\
Df Model:           & 3                & Log-Likelihood: & -12.9782  \\
Df Residuals:       & 28               & Custom entry:   & blah      \\
Scale:              & 0.1506           &                 &           \\
\hline
\end{tabular}
\hline
\begin{tabular}{lrrrrrr}
\hline
      &  Coef.  & Std.Err. &    t    & P>|t|  &  [0.025 &  0.975]  \\
\hline
x1    &  0.4639 &   0.1620 &  2.8641 & 0.0078 &  0.1321 &  0.7956  \\
x2    &  0.0105 &   0.0195 &  0.5387 & 0.5944 & -0.0294 &  0.0504  \\
x3    &  0.3786 &   0.1392 &  2.7200 & 0.0111 &  0.0935 &  0.6636  \\
const & -1.4980 &   0.5239 & -2.8594 & 0.0079 & -2.5712 & -0.4249  \\
\hline
\end{tabular}
\end{table}

Add text (auto-wrapped Bacon Ipsum)

In [14]:
smry.add_text('Boudin ribeye ham hock rump turducken, pig cow pork loin leberkas t-bone sausage strip steak. Ground round venison ham hock sausage bresaola capicola prosciutto shoulder swine. Spare ribs beef kielbasa salami fatback. Andouille short ribs doner corned beef ground round pig pork chop. Tail fatback biltong turkey jowl tri-tip venison spare ribs pancetta cow ham rump drumstick brisket corned beef. Kielbasa salami pork chop swine, corned beef hamburger bresaola turducken. Tail strip steak filet mignon doner brisket shank pastrami prosciutto kielbasa ham drumstick chuck.')
print smry
=============================================================
Model:              OLS              R-squared:      0.4159  
Dependent Variable: y                Adj. R-squared: 0.3533  
Date:               2013-02-02 19:27 AIC:            33.9565 
No. Observations:   32               BIC:            39.8194 
Df Model:           3                Log-Likelihood: -12.9782
Df Residuals:       28               Custom entry:   blah    
Scale:              0.1506                                   
-------------------------------------------------------------
          Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------
x1        0.4639    0.1620   2.8641  0.0078   0.1321   0.7956
x2        0.0105    0.0195   0.5387  0.5944  -0.0294   0.0504
x3        0.3786    0.1392   2.7200  0.0111   0.0935   0.6636
const    -1.4980    0.5239  -2.8594  0.0079  -2.5712  -0.4249
=============================================================
Boudin ribeye ham hock rump turducken, pig cow pork loin
leberkas t-bone sausage strip steak. Ground round venison ham
hock sausage bresaola capicola prosciutto shoulder swine.
Spare ribs beef kielbasa salami fatback. Andouille short ribs
doner corned beef ground round pig pork chop. Tail fatback
biltong turkey jowl tri-tip venison spare ribs pancetta cow
ham rump drumstick brisket corned beef. Kielbasa salami pork
chop swine, corned beef hamburger bresaola turducken. Tail
strip steak filet mignon doner brisket shank pastrami
prosciutto kielbasa ham drumstick chuck.

Add custom title

In [15]:
smry = Summary()
smry.add_dict(model_info, ncols=2)
smry.add_dict(diagnostic)
smry.add_title('A nice custom title')
print smry
                     A nice custom title
=============================================================
Model:              OLS              R-squared:      0.4159  
Dependent Variable: y                Adj. R-squared: 0.3533  
Date:               2013-02-02 19:27 AIC:            33.9565 
No. Observations:   32               BIC:            39.8194 
Df Model:           3                Log-Likelihood: -12.9782
Df Residuals:       28               Custom entry:   blah    
Scale:              0.1506                                   
-------------------------------------------------------------
Prob(Omnibus):            0.916        Skew:            0.141
Durbin-Watson:            2.346        Kurtosis:        2.786
Prob(JB):                 0.920        Omnibus:         0.176
Jarque-Bera (JB):         0.167                              
=============================================================

Add automatic title

In [16]:
smry = Summary()
smry.add_dict(model_info, ncols=2)
smry.add_dict(diagnostic)
smry.add_title(results=res)
print smry
               Results: Ordinary least squares
=============================================================
Model:              OLS              R-squared:      0.4159  
Dependent Variable: y                Adj. R-squared: 0.3533  
Date:               2013-02-02 19:27 AIC:            33.9565 
No. Observations:   32               BIC:            39.8194 
Df Model:           3                Log-Likelihood: -12.9782
Df Residuals:       28               Custom entry:   blah    
Scale:              0.1506                                   
-------------------------------------------------------------
Prob(Omnibus):            0.916        Skew:            0.141
Durbin-Watson:            2.346        Kurtosis:        2.786
Prob(JB):                 0.920        Omnibus:         0.176
Jarque-Bera (JB):         0.167                              
=============================================================

Vertical summaries

Fit 3 OLS models

In [17]:
import pandas as pd
import patsy
import statsmodels.api as sm
url = "http://vincentarelbundock.github.com/Rdatasets/csv/HistData/Guerry.csv"
df = pd.read_csv(url)
y, X = patsy.dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')
mod1 = sm.OLS(y,X).fit()
y, X = patsy.dmatrices('Lottery ~ Literacy + Region', data=df, return_type='dataframe')
mod2 = sm.OLS(y,X).fit()
y, X = patsy.dmatrices('Literacy ~ Wealth + Region', data=df, return_type='dataframe')
mod3 = sm.OLS(y,X).fit()
results = [mod1, mod2, mod3]
In [18]:
from statsmodels.iolib.summary import summary_col
print summary_col(results)
=============================================
             Model 0     Model 1    Model 2  
---------------------------------------------
Intercept   75.4006*** 102.5068*** 53.1807***
             (23.9714)   (25.7017)  (11.3779)
Literacy       -0.1858    -0.3981*           
              (0.2098)    (0.2265)           
Region[T.C]  -36.7489*    -37.4852  -22.1851*
             (22.0144)   (24.4271)  (11.5389)
Region[T.E] -52.1767**   -43.9906*     9.0667
             (21.6304)   (23.9123)  (11.5549)
Region[T.N] -46.7659**  -51.9861**     3.4836
             (21.5543)   (23.8808)  (11.5524)
Region[T.S]  -41.2972*   -40.8145*   -16.0874
             (21.7791)   (24.1664)  (11.5385)
Region[T.W] -46.8402**   -45.1710*  -19.8565*
             (21.9399)   (24.3415)  (11.5518)
Wealth       0.4515***              -0.1130**
              (0.1028)               (0.0537)
---------------------------------------------
N                   86          86         86
R2               0.358       0.199      0.613
=============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

HTML print

In [19]:
summary_col(results)
Out[19]:
Model 0 Model 1 Model 2
Intercept 75.4006*** 102.5068*** 53.1807***
(23.9714) (25.7017) (11.3779)
Literacy -0.1858 -0.3981*
(0.2098) (0.2265)
Region[T.C] -36.7489* -37.4852 -22.1851*
(22.0144) (24.4271) (11.5389)
Region[T.E] -52.1767** -43.9906* 9.0667
(21.6304) (23.9123) (11.5549)
Region[T.N] -46.7659** -51.9861** 3.4836
(21.5543) (23.8808) (11.5524)
Region[T.S] -41.2972* -40.8145* -16.0874
(21.7791) (24.1664) (11.5385)
Region[T.W] -46.8402** -45.1710* -19.8565*
(21.9399) (24.3415) (11.5518)
Wealth 0.4515*** -0.1130**
(0.1028) (0.0537)
N 86 86 86
R2 0.358 0.199 0.613

Starless

In [20]:
print summary_col(results, stars=False)
=========================================
             Model 0   Model 1   Model 2 
-----------------------------------------
Intercept     75.4006  102.5068   53.1807
            (23.9714) (25.7017) (11.3779)
Literacy      -0.1858   -0.3981          
             (0.2098)  (0.2265)          
Region[T.C]  -36.7489  -37.4852  -22.1851
            (22.0144) (24.4271) (11.5389)
Region[T.E]  -52.1767  -43.9906    9.0667
            (21.6304) (23.9123) (11.5549)
Region[T.N]  -46.7659  -51.9861    3.4836
            (21.5543) (23.8808) (11.5524)
Region[T.S]  -41.2972  -40.8145  -16.0874
            (21.7791) (24.1664) (11.5385)
Region[T.W]  -46.8402  -45.1710  -19.8565
            (21.9399) (24.3415) (11.5518)
Wealth         0.4515             -0.1130
             (0.1028)            (0.0537)
-----------------------------------------
N                  86        86        86
R2              0.358     0.199     0.613
=========================================
Standard errors in parentheses.

Custom model names

In [21]:
print summary_col(results, model_names=['andouillette frite','b','c'])
=====================================================
            andouillette frite      b          c     
-----------------------------------------------------
Intercept           75.4006*** 102.5068*** 53.1807***
                     (23.9714)   (25.7017)  (11.3779)
Literacy               -0.1858    -0.3981*           
                      (0.2098)    (0.2265)           
Region[T.C]          -36.7489*    -37.4852  -22.1851*
                     (22.0144)   (24.4271)  (11.5389)
Region[T.E]         -52.1767**   -43.9906*     9.0667
                     (21.6304)   (23.9123)  (11.5549)
Region[T.N]         -46.7659**  -51.9861**     3.4836
                     (21.5543)   (23.8808)  (11.5524)
Region[T.S]          -41.2972*   -40.8145*   -16.0874
                     (21.7791)   (24.1664)  (11.5385)
Region[T.W]         -46.8402**   -45.1710*  -19.8565*
                     (21.9399)   (24.3415)  (11.5518)
Wealth               0.4515***              -0.1130**
                      (0.1028)               (0.0537)
-----------------------------------------------------
N                           86          86         86
R2                       0.358       0.199      0.613
=====================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Bottom panel info

In [22]:
custom_info = {'N': lambda x: str(int(x.nobs)), 
               'BIC': lambda x: '%.3f' % x.aic, 
               'R2-adj': lambda x: '%.3f' % x.rsquared_adj, 
               'F': lambda x: '%.3f' % x.fvalue}
print summary_col([mod2, mod3, mod1], info_dict=custom_info)
=============================================
              Model 0    Model 1    Model 2  
---------------------------------------------
Intercept   102.5068*** 53.1807*** 75.4006***
              (25.7017)  (11.3779)  (23.9714)
Literacy       -0.3981*               -0.1858
               (0.2265)              (0.2098)
Region[T.C]    -37.4852  -22.1851*  -36.7489*
              (24.4271)  (11.5389)  (22.0144)
Region[T.E]   -43.9906*     9.0667 -52.1767**
              (23.9123)  (11.5549)  (21.6304)
Region[T.N]  -51.9861**     3.4836 -46.7659**
              (23.8808)  (11.5524)  (21.5543)
Region[T.S]   -40.8145*   -16.0874  -41.2972*
              (24.1664)  (11.5385)  (21.7791)
Region[T.W]   -45.1710*  -19.8565* -46.8402**
              (24.3415)  (11.5518)  (21.9399)
Wealth                   -0.1130**  0.4515***
                          (0.0537)   (0.1028)
---------------------------------------------
BIC             791.411    666.344    774.423
F                 3.271     20.864      6.205
N                    86         86         86
R2-adj            0.138      0.584      0.300
=============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
In [22]: