import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas
%load_ext rpy2.ipython
%R library(lme4)
Loading required package: lattice Loading required package: Matrix
%R data <- read.csv('http://www.stat.wisc.edu/~ane/st572/data/floweringTime.txt', sep='\t')
%R data <- subset(data, !is.na(dtf))
%R data$logdtf <- log(data$dtf)
%R fit <- lmer(logdtf ~ (1|subspecies)+(1|inventoryID), data=data, REML=F)
%R print(summary(fit))
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: logdtf ~ (1 | subspecies) + (1 | inventoryID) Data: data AIC BIC logLik deviance -118.0267 -106.4653 63.0134 -126.0267 Random effects: Groups Name Variance Std.Dev. inventoryID (Intercept) 0.033431 0.1828 subspecies (Intercept) 0.094382 0.3072 Residual 0.009644 0.0982 Number of obs: 133, groups: inventoryID, 36; subspecies, 9 Fixed effects: Estimate Std. Error t value (Intercept) 3.9756 0.1092 36.42
data = pandas.read_csv('http://www.stat.wisc.edu/~ane/st572/data/floweringTime.txt', delimiter='\t', index_col=0)
data = data.dropna()
data['logdtf'] = np.log(data['dtf'])
vcf = {"subspecies" : "0 + C(subspecies)", "inventoryID" : "0 + C(inventoryID)"}
model = sm.MixedLM.from_formula('logdtf ~ 1', groups="subspecies", vc_formula=vcf, data=data)
result = model.fit(reml=False)
result.summary()
Model: | MixedLM | Dependent Variable: | logdtf |
No. Observations: | 133 | Method: | ML |
No. Groups: | 9 | Scale: | 0.0096 |
Min. group size: | 4 | Likelihood: | 63.0134 |
Max. group size: | 32 | Converged: | Yes |
Mean group size: | 14.8 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.976 | 0.109 | 36.415 | 0.000 | 3.762 | 4.190 |
inventoryID RE | 0.033 | 0.114 | ||||
subspecies RE | 0.094 | 0.554 |
%R fit.noinventory <- update(fit, .~.- (1|inventoryID))
%R print(summary(fit.noinventory))
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: logdtf ~ (1 | subspecies) Data: data AIC BIC logLik deviance -23.3853 -14.7142 14.6926 -29.3853 Random effects: Groups Name Variance Std.Dev. subspecies (Intercept) 0.10092 0.3177 Residual 0.03709 0.1926 Number of obs: 133, groups: subspecies, 9 Fixed effects: Estimate Std. Error t value (Intercept) 3.959 0.108 36.66
vcf = {"subspecies" : "1 + C(subspecies)"}
model_noinventory = sm.MixedLM.from_formula('logdtf ~ 1', groups="subspecies", vc_formula=vcf, data=data)
result_noinventory = model_noinventory.fit(reml=False)
result_noinventory.summary()
Model: | MixedLM | Dependent Variable: | logdtf |
No. Observations: | 133 | Method: | ML |
No. Groups: | 9 | Scale: | 0.0371 |
Min. group size: | 4 | Likelihood: | 14.6926 |
Max. group size: | 32 | Converged: | Yes |
Mean group size: | 14.8 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.959 | 0.108 | 36.656 | 0.000 | 3.747 | 4.171 |
subspecies RE | 0.101 | 0.270 |
%R print(anova(fit, fit.noinventory))
Data: data Models: fit.noinventory: logdtf ~ (1 | subspecies) fit: logdtf ~ (1 | subspecies) + (1 | inventoryID) Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq) fit.noinventory 3 -23.385 -14.714 14.693 -29.385 fit 4 -118.027 -106.465 63.013 -126.027 96.641 1 < 2.2e-16 fit.noinventory fit *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
result.compare_lr_test(result_noinventory)
(96.641440367875362, 8.3090825023326631e-23, 1)
%R data <- read.table('http://www.stat.wisc.edu/~ane/st572/data/bilingual.txt', header=T)
%R fit = lmer(score ~ phase * time * language + (1|pair) + (1|child), data=data)
%R print(summary(fit))
Linear mixed model fit by REML ['lmerMod'] Formula: score ~ phase * time * language + (1 | pair) + (1 | child) Data: data REML criterion at convergence: 1172.664 Random effects: Groups Name Variance Std.Dev. child (Intercept) 26.58 5.155 pair (Intercept) 0.00 0.000 Residual 92.59 9.622 Number of obs: 160, groups: child, 40; pair, 20 Fixed effects: Estimate Std. Error t value (Intercept) 24.000 2.441 9.832 phasepre-switch 6.500 3.043 2.136 timeend 34.250 3.043 11.256 languagemonolingual -1.000 3.452 -0.290 phasepre-switch:timeend -5.500 4.303 -1.278 phasepre-switch:languagemonolingual -5.750 4.303 -1.336 timeend:languagemonolingual -17.500 4.303 -4.067 phasepre-switch:timeend:languagemonolingual 20.250 6.086 3.327 Correlation of Fixed Effects: (Intr) phspr- timend lnggmn phspr-swtch:t phspr-swtch:l tmnd:l phspr-swtch -0.623 timeend -0.623 0.500 langgmnlngl -0.707 0.441 0.441 phspr-swtch:t 0.441 -0.707 -0.707 -0.312 phspr-swtch:l 0.441 -0.707 -0.354 -0.623 0.500 tmnd:lnggmn 0.441 -0.354 -0.707 -0.623 0.500 0.500 phspr-swt:: -0.312 0.500 0.500 0.441 -0.707 -0.707 -0.707
data = pandas.read_table('bilingual.txt', skipinitialspace=True)
vcf = {"pair" : "0 + C(pair)", "child" : "0 + C(child)"}
model = sm.MixedLM.from_formula('score ~ phase*time*language', groups='pair', vc_formula=vcf, data=data)
result = model.fit(reml=False)
print (result.summary())
Mixed Linear Model Regression Results ==================================================================================================== Model: MixedLM Dependent Variable: score No. Observations: 160 Method: ML No. Groups: 20 Scale: 87.9611 Min. group size: 8 Likelihood: -600.4745 Max. group size: 8 Converged: Yes Mean group size: 8.0 ---------------------------------------------------------------------------------------------------- Coef. Std.Err. z P>|z| [0.025 0.975] ---------------------------------------------------------------------------------------------------- Intercept 24.000 2.379 10.087 0.000 19.337 28.663 phase[T.pre-switch] 6.500 2.966 2.192 0.028 0.687 12.313 time[T.end] 34.250 2.966 11.548 0.000 28.437 40.063 language[T.monolingual] -1.000 3.365 -0.297 0.766 -7.595 5.595 phase[T.pre-switch]:time[T.end] -5.500 4.194 -1.311 0.190 -13.721 2.721 phase[T.pre-switch]:language[T.monolingual] -5.750 4.194 -1.371 0.170 -13.971 2.471 time[T.end]:language[T.monolingual] -17.500 4.194 -4.172 0.000 -25.721 -9.279 phase[T.pre-switch]:time[T.end]:language[T.monolingual] 20.250 5.932 3.414 0.001 8.624 31.876 child RE 25.250 1.639 pair RE 0.000 1.133 ====================================================================================================
/usr/local/lib/python2.7/dist-packages/statsmodels-0.7.0-py2.7-linux-x86_64.egg/statsmodels/regression/mixed_linear_model.py:1906: ConvergenceWarning: The MLE may be on the boundary of the parameter space. warnings.warn(msg, ConvergenceWarning)
#%R fitno3way = lmer(score ~ (1|pair) + (1|child), data=data)
%R fitno3way <- update(fit, .~. - phase:time:language)
%R print(summary(fitno3way))
Linear mixed model fit by REML ['lmerMod'] Formula: score ~ phase + time + language + (1 | pair) + (1 | child) + phase:time + phase:language + time:language Data: data REML criterion at convergence: 1188.769 Random effects: Groups Name Variance Std.Dev. child (Intercept) 24.55 4.955 pair (Intercept) 0.00 0.000 Residual 100.70 10.035 Number of obs: 160, groups: child, 40; pair, 20 Fixed effects: Estimate Std. Error t value (Intercept) 26.531 2.373 11.178 phasepre-switch 1.437 2.748 0.523 timeend 29.187 2.748 10.621 languagemonolingual -6.063 3.163 -1.916 phasepre-switch:timeend 4.625 3.173 1.457 phasepre-switch:languagemonolingual 4.375 3.173 1.379 timeend:languagemonolingual -7.375 3.173 -2.324 Correlation of Fixed Effects: (Intr) phspr- timend lnggmn phspr-swtch:t phspr-swtch:l phspr-swtch -0.579 timeend -0.579 0.333 langgmnlngl -0.666 0.290 0.290 phspr-swtch:t 0.334 -0.577 -0.577 0.000 phspr-swtch:l 0.334 -0.577 0.000 -0.502 0.000 tmnd:lnggmn 0.334 0.000 -0.577 -0.502 0.000 0.000
modelno3way = sm.MixedLM.from_formula('score ~ phase*time*language - phase:language:time', groups='pair', vc_formula=vcf, data=data)
resultno3way = modelno3way.fit(reml=False)
print (resultno3way.summary())
Mixed Linear Model Regression Results ======================================================================================= Model: MixedLM Dependent Variable: score No. Observations: 160 Method: ML No. Groups: 20 Scale: 96.5032 Min. group size: 8 Likelihood: -606.0360 Max. group size: 8 Converged: Yes Mean group size: 8.0 --------------------------------------------------------------------------------------- Coef. Std.Err. z P>|z| [0.025 0.975] --------------------------------------------------------------------------------------- Intercept 26.531 2.319 11.441 0.000 21.986 31.076 phase[T.pre-switch] 1.438 2.690 0.534 0.593 -3.835 6.710 time[T.end] 29.188 2.690 10.849 0.000 23.915 34.460 language[T.monolingual] -6.062 3.090 -1.962 0.050 -12.119 -0.006 phase[T.pre-switch]:time[T.end] 4.625 3.106 1.489 0.137 -1.464 10.714 phase[T.pre-switch]:language[T.monolingual] 4.375 3.106 1.408 0.159 -1.714 10.464 time[T.end]:language[T.monolingual] -7.375 3.106 -2.374 0.018 -13.464 -1.286 child RE 23.115 1.565 pair RE 0.000 1.081 =======================================================================================
/usr/local/lib/python2.7/dist-packages/statsmodels-0.7.0-py2.7-linux-x86_64.egg/statsmodels/regression/mixed_linear_model.py:1906: ConvergenceWarning: The MLE may be on the boundary of the parameter space. warnings.warn(msg, ConvergenceWarning)
result.compare_lr_test(resultno3way)
(11.122879559027069, 0.00085269302140603945, 1)
%R print(anova(fit, fitno3way))
Data: data Models: fitno3way: score ~ phase + time + language + (1 | pair) + (1 | child) + fitno3way: phase:time + phase:language + time:language fit: score ~ phase * time * language + (1 | pair) + (1 | child) Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq) fitno3way 10 1232.1 1262.8 -606.04 1212.1 fit 11 1223.0 1256.8 -600.47 1201.0 11.123 1 0.0008527 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
%R data <- read.csv('http://www-personal.umich.edu/~bwest/classroom.csv')
<DataFrame - Python:0x7fd0302e2d40 / R:0x859dce0> [IntVe..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...] sex: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd0302e1e18 / R:0x9dada60> [ 1, 0, 1, ..., 0, 0, 1] minority: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd0302d9290 / R:0x9daed30> [ 1, 1, 1, ..., 0, 0, 0] mathkind: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd0302e1dd0 / R:0x9db0000> [ 448, 460, 511, ..., 485, 473, 453] ... sex: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd0302e1320 / R:0x9dbe080> [ 160, 160, 160, ..., 96, 239, 239] minority: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd03030f248 / R:0x9dbf350> [ 1, 1, 1, ..., 107, 107, 107] mathkind: <class 'rpy2.robjects.vectors.IntVector'> <IntVector - Python:0x7fd0302e2c20 / R:0x9dc0620> [ 1, 2, 3, ..., 1188, 1189, 1190]
%R model <- lmer(mathgain ~ 1 + (1|schoolid) + (1|classid), REML=F, data=data)
%R print(summary(model))
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: mathgain ~ 1 + (1 | schoolid) + (1 | classid) Data: data AIC BIC logLik deviance 11779.331 11799.658 -5885.666 11771.331 Random effects: Groups Name Variance Std.Dev. classid (Intercept) 99.14 9.957 schoolid (Intercept) 75.38 8.682 Residual 1028.30 32.067 Number of obs: 1190, groups: classid, 312; schoolid, 107 Fixed effects: Estimate Std. Error t value (Intercept) 57.429 1.436 40
data = pandas.read_csv('http://www-personal.umich.edu/~bwest/classroom.csv')
vcf = {"schoolid" : "1 + C(schoolid)", "classid" : "1 + C(classid)"}
model = sm.MixedLM.from_formula('mathgain ~ 1', groups="schoolid", vc_formula=vcf, data=data)
result = model.fit(reml=False)
result.summary()
Model: | MixedLM | Dependent Variable: | mathgain |
No. Observations: | 1190 | Method: | ML |
No. Groups: | 107 | Scale: | 1050.5873 |
Min. group size: | 2 | Likelihood: | -5887.0811 |
Max. group size: | 31 | Converged: | Yes |
Mean group size: | 11.1 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 57.296 | 1.459 | 39.263 | 0.000 | 54.436 | 60.156 |
classid RE | 87.878 | 1.451 | ||||
schoolid RE | 11.238 | 1.763 |
%R model2 <- lmer(mathgain ~ mathkind + sex + minority + ses +(1|schoolid) + (1|classid), data=data, REML = F)
%R print(summary(model2))
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: mathgain ~ mathkind + sex + minority + ses + (1 | schoolid) + (1 | classid) Data: data AIC BIC logLik deviance 11406.963 11447.617 -5695.481 11390.963 Random effects: Groups Name Variance Std.Dev. classid (Intercept) 83.20 9.122 schoolid (Intercept) 72.67 8.524 Residual 732.19 27.059 Number of obs: 1190, groups: classid, 312; schoolid, 107 Fixed effects: Estimate Std. Error t value (Intercept) 282.71600 10.82832 26.109 mathkind -0.46965 0.02222 -21.139 sex -1.24898 1.65469 -0.755 minority -8.25670 2.33081 -3.542 ses 5.34235 1.23840 4.314 Correlation of Fixed Effects: (Intr) mthknd sex minrty mathkind -0.978 sex -0.044 -0.032 minority -0.307 0.164 -0.018 ses 0.140 -0.169 0.019 0.164
data = pandas.read_csv('http://www-personal.umich.edu/~bwest/classroom.csv')
vcf = {"schoolid" : "1 + C(schoolid)", "classid" : "1 + C(classid)"}
model2 = sm.MixedLM.from_formula('mathgain ~ mathkind + sex + minority + ses', groups="schoolid", vc_formula=vcf, data=data)
result2 = model2.fit(reml=False)
result2.summary()
Model: | MixedLM | Dependent Variable: | mathgain |
No. Observations: | 1190 | Method: | ML |
No. Groups: | 107 | Scale: | 755.8745 |
Min. group size: | 2 | Likelihood: | -5698.6892 |
Max. group size: | 31 | Converged: | Yes |
Mean group size: | 11.1 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 281.779 | 10.911 | 25.826 | 0.000 | 260.394 | 303.163 |
mathkind | -0.469 | 0.022 | -20.951 | 0.000 | -0.513 | -0.425 |
sex | -1.259 | 1.665 | -0.756 | 0.450 | -4.523 | 2.005 |
minority | -7.625 | 2.358 | -3.234 | 0.001 | -12.246 | -3.003 |
ses | 5.449 | 1.246 | 4.373 | 0.000 | 3.007 | 7.891 |
classid RE | 66.430 | 1.234 | ||||
schoolid RE | 26.071 | 1.540 |
%R print(anova(model,model2))
Data: data Models: model: mathgain ~ 1 + (1 | schoolid) + (1 | classid) model2: mathgain ~ mathkind + sex + minority + ses + (1 | schoolid) + model2: (1 | classid) Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq) model 4 11779 11800 -5885.7 11771 model2 8 11407 11448 -5695.5 11391 380.37 4 < 2.2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
result2.compare_lr_test(result)
(376.78373760784962, 2.8827851248469664e-80, 4)
%R model3 <- lmer(mathgain ~ minority +(mathkind|schoolid) + (sex|schoolid) + (ses|schoolid) , data=data, REML = F)
%R print(summary(model3))
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: mathgain ~ minority + (mathkind | schoolid) + (sex | schoolid) + (ses | schoolid) Data: data AIC BIC logLik deviance 11539.117 11600.097 -5757.558 11515.117 Random effects: Groups Name Variance Std.Dev. Corr schoolid (Intercept) 4.328e+04 2.080e+02 mathkind 1.898e-01 4.356e-01 -1.00 schoolid.1 (Intercept) 1.109e+02 1.053e+01 sex 1.561e+01 3.951e+00 -0.64 schoolid.2 (Intercept) 1.980e-20 1.407e-10 ses 4.670e+00 2.161e+00 -1.00 Residual 7.565e+02 2.750e+01 Number of obs: 1190, groups: schoolid, 107 Fixed effects: Estimate Std. Error t value (Intercept) 57.830 2.124 27.222 minority -6.463 2.342 -2.759 Correlation of Fixed Effects: (Intr) minority -0.775
%R print(anova(model2, model3))
Data: data Models: model2: mathgain ~ mathkind + sex + minority + ses + (1 | schoolid) + model2: (1 | classid) model3: mathgain ~ minority + (mathkind | schoolid) + (sex | schoolid) + model3: (ses | schoolid) Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq) model2 8 11407 11448 -5695.5 11391 model3 12 11539 11600 -5757.6 11515 0 4 1
%R model3 <- lmer(mathgain ~ mathkind + minority + ses +(1|schoolid) + (1|classid), data=data, REML = F, verbose=T)
%R print(summary(model3))
(NM) 20: f = 11481.7 at 0.4575 1.08188 (NM) 40: f = 11394.1 at 0.327462 0.403996 (NM) 60: f = 11392.2 at 0.286625 0.311154 (NM) 80: f = 11391.5 at 0.335163 0.313464 (NM) 100: f = 11391.5 at 0.33588 0.314426 (NM) 120: f = 11391.5 at 0.335987 0.31454
Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: mathgain ~ mathkind + minority + ses + (1 | schoolid) + (1 | classid) Data: data AIC BIC logLik deviance 11405.532 11441.104 -5695.766 11391.532 Random effects: Groups Name Variance Std.Dev. classid (Intercept) 82.73 9.096 schoolid (Intercept) 72.51 8.515 Residual 732.93 27.073 Number of obs: 1190, groups: classid, 312; schoolid, 107 Fixed effects: Estimate Std. Error t value (Intercept) 282.34364 10.82030 26.094 mathkind -0.47015 0.02221 -21.168 minority -8.28499 2.33040 -3.555 ses 5.36063 1.23850 4.328 Correlation of Fixed Effects: (Intr) mthknd minrty mathkind -0.981 minority -0.308 0.164 ses 0.141 -0.168 0.164
model3 = sm.MixedLM.from_formula('mathgain ~ mathkind + minority + ses*ses', groups="schoolid", vc_formula=vcf, data=data)
result3 = model3.fit(reml=False)
result3.summary()
Model: | MixedLM | Dependent Variable: | mathgain |
No. Observations: | 1190 | Method: | ML |
No. Groups: | 107 | Scale: | 756.7407 |
Min. group size: | 2 | Likelihood: | -5698.9749 |
Max. group size: | 31 | Converged: | Yes |
Mean group size: | 11.1 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 281.406 | 10.902 | 25.812 | 0.000 | 260.039 | 302.774 |
mathkind | -0.469 | 0.022 | -20.979 | 0.000 | -0.513 | -0.426 |
minority | -7.654 | 2.357 | -3.247 | 0.001 | -12.274 | -3.034 |
ses | 5.470 | 1.246 | 4.389 | 0.000 | 3.027 | 7.912 |
classid RE | 65.463 | 1.228 | ||||
schoolid RE | 26.867 | 1.536 |
%R print(anova(model2,model3))
Data: data Models: model3: mathgain ~ mathkind + minority + ses + (1 | schoolid) + (1 | model3: classid) model2: mathgain ~ mathkind + sex + minority + ses + (1 | schoolid) + model2: (1 | classid) Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq) model3 7 11406 11441 -5695.8 11392 model2 8 11407 11448 -5695.5 11391 0.5691 1 0.4506
result2.compare_lr_test(result3)
(0.57129857088511926, 0.44974335987194591, 1)
Raise warnings for REML:
model4 = sm.MixedLM.from_formula('mathgain ~ mathkind + minority + ses*ses', groups="schoolid", vc_formula=vcf, data=data)
#REML[Should raise warning]
result4 = model4.fit()
result2.compare_lr_test(result4)
/usr/local/lib/python2.7/dist-packages/statsmodels-0.7.0-py2.7-linux-x86_64.egg/statsmodels/base/model.py:1414: InvalidTestWarning: Likelihood Ratio test is likely invalid with .fit(REML=True), proceeding anyway InvalidTestWarning)
(-1.7970366599784029, 1.0, 1)