# -*- coding: utf-8 -*-
"""
Created on Sun Aug 17 21:29:39 2014

Author: Josef Perktold, based on GEE example by Kerby Shedden

"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.dependence_structures import (Exchangeable,
    Independence, Autoregressive)
from statsmodels.genmod import families

data_url = "http://vincentarelbundock.github.io/Rdatasets/csv/MASS/epil.csv"
data = pd.read_csv(data_url)
print data.head()

fam = families.Poisson()
ind = Independence()
mod1 = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam)
rslt1 = mod1.fit()
print '\n'
print rslt1.summary()

rslt1.params

rslt1

mod2 = GLM.from_formula("y ~ age + trt + base", data, family=fam)
groups = data["subject"]


rslt2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=groups))
print '\n'
print(rslt2.summary())

rslt_glm = mod2.fit()
print '\n'
print(rslt_glm.summary())

import statsmodels.discrete.discrete_model as dm

mod3 = dm.Poisson.from_formula("y ~ age + trt + base", data)

rslt3 = mod3.fit(method='bfgs', cov_type='cluster', cov_kwds=dict(groups=groups))
print '\n'
print(rslt3.summary())

rslt2t = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=groups), use_t=True)
print(rslt2t.summary())

rslt3t = mod3.fit(method='bfgs', cov_type='cluster', cov_kwds=dict(groups=groups), use_t=True)
print(rslt3t.summary())

res_all = [rslt_glm, rslt1, rslt2, rslt3, rslt2t, rslt3t]
res_names = ['GLM non-robust', 'GEE', 'GLM robust', 'Poisson robust', 'GLM robust t', 'Poisson robust t']
import pandas
# pandas.DataFrame([getattr(res, 'params') for res in res_all], columns=res_names)  # broken with my versions
# pandas.DataFrame([pandas.Series(getattr(res, 'params'), index=rslt2.params.index) for res in res_all], columns=res_names)
# the second has only nans

# detour through numpy
print('Parameters')
pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'params')) for res in res_all]), 4), 
                 index=rslt2.params.index, columns=res_names)

print('Standard Errors')
pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'bse')) for res in res_all]), 4), 
                 index=rslt2.params.index, columns=res_names)

print('P-values')
pandas.DataFrame(np.round(np.column_stack([np.asarray(getattr(res, 'pvalues')) for res in res_all]), 4), 
                 index=rslt2.params.index, columns=res_names)

#covariance_type='robust'

# BUG see https://github.com/statsmodels/statsmodels/issues/1906
mod1b = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam)
rslt1b = mod1b.fit(covariance_type='naive')
rslt1b_ = mod1b.fit(covariance_type='naive')
print '\n'
print rslt1b.summary()

#print(rslt1b_.summary())

rslt1b.covariance_type, rslt1b_.covariance_type

rslt1b.bse, rslt1b_.bse

rslt1b_.bse - rslt1.bse

bse_naive = np.sqrt(np.diag(rslt1b.naive_covariance))
print(bse_naive)
print(rslt_glm.bse.values)
#np.sqrt(np.diag(rslt1c.robust_covariance_bc))
print(bse_naive / rslt_glm.bse.values)

np.sqrt(rslt_glm.pearson_chi2 / rslt_glm.model.df_resid)

mod1c = GEE.from_formula("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam)
rslt1c = mod1c.fit(covariance_type='bias-reduced')
print '\n'
print rslt1c.summary()

rslt1c.bse - rslt1.bse

np.sqrt(np.diag(rslt1c.robust_covariance_bc))

rslt1c.bse


rslt1b.bse

rslt1b.covariance_type

rslt1b.covariance_type = 'naive'
del rslt1b._cache['bse']

rslt1b.bse

rslt1b_ = mod1b.fit(covariance_type='naive')

rslt1b_.covariance_type

rslt1b_.bse