# -*- coding: utf-8 -*- """ Created on Sun Aug 17 21:29:39 2014 Author: Josef Perktold, based on GEE example by Kerby Shedden """ import numpy as np import pandas as pd from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod.generalized_estimating_equations import GEE from statsmodels.genmod.dependence_structures import (Exchangeable, Independence, Autoregressive) from statsmodels.genmod import families data_url = "http://vincentarelbundock.github.io/Rdatasets/csv/MASS/epil.csv" data = pd.read_csv(data_url) print data.head() fam = families.Poisson() ind = Independence() mod1 = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam) rslt1 = mod1.fit() print '\n' print rslt1.summary() rslt1.params rslt1 mod2 = GLM.from_formula("y ~ age + trt + base", data, family=fam) groups = data["subject"] rslt2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=groups)) print '\n' print(rslt2.summary()) rslt_glm = mod2.fit() print '\n' print(rslt_glm.summary()) import statsmodels.discrete.discrete_model as dm mod3 = dm.Poisson.from_formula("y ~ age + trt + base", data) rslt3 = mod3.fit(method='bfgs', cov_type='cluster', cov_kwds=dict(groups=groups)) print '\n' print(rslt3.summary()) rslt2t = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=groups), use_t=True) print(rslt2t.summary()) rslt3t = mod3.fit(method='bfgs', cov_type='cluster', cov_kwds=dict(groups=groups), use_t=True) print(rslt3t.summary()) res_all = [rslt_glm, rslt1, rslt2, rslt3, rslt2t, rslt3t] res_names = ['GLM non-robust', 'GEE', 'GLM robust', 'Poisson robust', 'GLM robust t', 'Poisson robust t'] import pandas # pandas.DataFrame([getattr(res, 'params') for res in res_all], columns=res_names) # broken with my versions # pandas.DataFrame([pandas.Series(getattr(res, 'params'), index=rslt2.params.index) for res in res_all], columns=res_names) # the second has only nans # detour through numpy print('Parameters') pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'params')) for res in res_all]), 4), index=rslt2.params.index, columns=res_names) print('Standard Errors') pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'bse')) for res in res_all]), 4), index=rslt2.params.index, columns=res_names) print('P-values') pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'pvalues')) for res in res_all]), 4), index=rslt2.params.index, columns=res_names) #covariance_type='robust' # BUG see https://github.com/statsmodels/statsmodels/issues/1906 mod1b = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam) rslt1b = mod1b.fit(covariance_type='naive') rslt1b_ = mod1b.fit(covariance_type='naive') print '\n' print rslt1b.summary() #print(rslt1b_.summary()) rslt1b.covariance_type, rslt1b_.covariance_type rslt1b.bse, rslt1b_.bse rslt1b_.bse - rslt1.bse bse_naive = np.sqrt(np.diag(rslt1b.naive_covariance)) print(bse_naive) print(rslt_glm.bse.values) #np.sqrt(np.diag(rslt1c.robust_covariance_bc)) print(bse_naive / rslt_glm.bse.values) np.sqrt(rslt_glm.pearson_chi2 / rslt_glm.model.df_resid) mod1c = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam) rslt1c = mod1c.fit(covariance_type='bias-reduced') print '\n' print rslt1c.summary() rslt1c.bse - rslt1.bse np.sqrt(np.diag(rslt1c.robust_covariance_bc)) rslt1c.bse rslt1b.bse rslt1b.covariance_type rslt1b.covariance_type = 'naive' del rslt1b._cache['bse'] rslt1b.bse rslt1b_ = mod1b.fit(covariance_type='naive') rslt1b_.covariance_type rslt1b_.bse