In this example, we will use Generalized Estimating Equations to do some longitudinal modeling of data from the ECLS-K 2011. Specifically, we will model changes in a student’s standardized math score as a continuous outcome and self rated health as a binomial outcome, from fall kindergarten to spring, 1st grade.
A presentation discussing GEE’s can be found here.
First we load our data
load("~/Google Drive/dem7903_App_Hier/data/eclsk_11.Rdata")
names(eclsk11)<-tolower(names(eclsk11))
library (car)
library(geepack)
library(MuMIn)
#get out only the variables I'm going to use for this example
myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk1", "x2mscalk1", "x3mscalk1", "x4mscalk1", "p1hscale", "p2hscale", "p4hscale","x2fsstat2","x4fsstat2", "x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" , "w1c0", "w1p0", "w2p0", "w1c0str", "w1p0str",
"w4c4p_40","w4c4p_4str", "w4c4p_4psu", "w1c0psu", "w1p0psu", "x1height","x2height","x3height","x4height", "x1kage_r","x2kage_r","x3age","x4age")
#subset the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1564235 83.6 2637877 140.9 1819863 97.2
## Vcells 2899441 22.2 219477930 1674.5 182812640 1394.8
Next, I do some recoding of variables. I have to make the repeated measures of each of my longitudinal variables.
#Longitudinal variables
#recode our outcomes, the first is the child's math standardized test score in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk1<0, NA, eclsk.sub$x1mscalk1)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk1<0, NA, eclsk.sub$x2mscalk1)
#eclsk.sub$math3<-ifelse(eclsk.sub$x3mscalk1<0, NA, eclsk.sub$x3mscalk1)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk1<0, NA, eclsk.sub$x4mscalk1)
#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
#eclsk.sub$height3<-ifelse(eclsk.sub$x3height<=-7, NA, eclsk.sub$x3height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]
#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
#Household food insecurity, dichotomous outcome
#This outcome is only present at two waves
eclsk.sub$foodinsec1<-recode(eclsk.sub$x2fsstat2, recodes="2:3=1; 1=0; else=NA")
eclsk.sub$foodinsec2<-recode(eclsk.sub$x4fsstat2, recodes="2:3=1; 1=0; else=NA")
#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)
#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))
That does it for our time varying variables. We now code some variables from the kindergarten wave to use as predictors:
#Non time varying variables
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")
#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")
#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA")
#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")
#Then we do some household level variables
#Urban school location = 1
eclsk.sub$urban<-recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")
#poverty level in poverty = 1
eclsk.sub$pov<-recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")
#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal
#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)
#Unsafe neighborhood
eclsk.sub$unsafe<-recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor.result = T)
#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))
To analyze data longitudinally, we must reshape the data fron its current “wide” format, where each repeated measure is a column, into the “long” format, where there is a single column for a particular variable, and we account for the repeated measurements of each person. In this case, I’m going to use three waves of data, so each child can contribute up to three lines to the data.
e.long<-reshape(eclsk.sub, idvar="childid", varying=list(math = c("math1", "math2", "math4"),
age = c("age_yrs1", "age_yrs2", "age_yrs4"),
height= c("height_z1", "height_z2", "height_z4"),
health=c("chhealth1", "chhealth2", "chhealth4"),
ses = c("hhses1", "hhses1", "hhses4")),
v.names = c("math", "age", "height", "health", "ses"),
timevar="wave",times=c(1,2,4),direction="long",
drop = names(eclsk.sub)[c(2:20, 22:25)])
e.long<-e.long[order(e.long$childid, e.long$wave),]
head(e.long, n=10)
## childid p2parct2 x_distpov w1c0 w1p0 w2p0 w1c0str
## 10000001.1 10000001 1 45 188.09105 211.4320 215.1901 70
## 10000001.2 10000001 1 45 188.09105 211.4320 215.1901 70
## 10000001.4 10000001 1 45 188.09105 211.4320 215.1901 70
## 10000002.1 10000002 1 7 237.88083 274.5976 327.8694 45
## 10000002.2 10000002 1 7 237.88083 274.5976 327.8694 45
## 10000002.4 10000002 1 7 237.88083 274.5976 327.8694 45
## 10000004.1 10000004 1 17 96.36698 103.5068 111.8929 51
## 10000004.2 10000004 1 17 96.36698 103.5068 111.8929 51
## 10000004.4 10000004 1 17 96.36698 103.5068 111.8929 51
## 10000005.1 10000005 3 -1 178.54929 237.3439 267.5328 30
## w1p0str w4c4p_40 w4c4p_4str w4c4p_4psu w1c0psu w1p0psu x1height
## 10000001.1 70 322.0055 69 3 3 3 44.50
## 10000001.2 70 322.0055 69 3 3 3 44.50
## 10000001.4 70 322.0055 69 3 3 3 44.50
## 10000002.1 45 474.8417 43 2 2 2 45.63
## 10000002.2 45 474.8417 43 2 2 2 45.63
## 10000002.4 45 474.8417 43 2 2 2 45.63
## 10000004.1 51 171.4766 49 2 2 2 46.00
## 10000004.2 51 171.4766 49 2 2 2 46.00
## 10000004.4 51 171.4766 49 2 2 2 46.00
## 10000005.1 30 0.0000 NA NA 3 3 40.80
## x2height x3height x4height x1kage_r x2kage_r x3age x4age
## 10000001.1 45.75 NA 48.0 72.39 79.76 NA 91.66
## 10000001.2 45.75 NA 48.0 72.39 79.76 NA 91.66
## 10000001.4 45.75 NA 48.0 72.39 79.76 NA 91.66
## 10000002.1 47.00 NA 49.5 71.41 77.52 NA 89.56
## 10000002.2 47.00 NA 49.5 71.41 77.52 NA 89.56
## 10000002.4 47.00 NA 49.5 71.41 77.52 NA 89.56
## 10000004.1 48.00 NA 50.0 72.39 78.84 NA 89.23
## 10000004.2 48.00 NA 50.0 72.39 78.84 NA 89.23
## 10000004.4 48.00 NA 50.0 72.39 78.84 NA 89.23
## 10000005.1 42.00 NA NA 59.51 65.06 NA NA
## height1 height2 height4 foodinsec1 foodinsec2 male hisp black
## 10000001.1 44.50 45.75 48.0 0 0 1 0 0
## 10000001.2 44.50 45.75 48.0 0 0 1 0 0
## 10000001.4 44.50 45.75 48.0 0 0 1 0 0
## 10000002.1 45.63 47.00 49.5 0 0 0 0 0
## 10000002.2 45.63 47.00 49.5 0 0 0 0 0
## 10000002.4 45.63 47.00 49.5 0 0 0 0 0
## 10000004.1 46.00 48.00 50.0 0 NA 1 0 0
## 10000004.2 46.00 48.00 50.0 0 NA 1 0 0
## 10000004.4 46.00 48.00 50.0 0 NA 1 0 0
## 10000005.1 40.80 42.00 NA 0 NA 0 1 0
## asian nahn other lths gths single notmar urban pov hhsize
## 10000001.1 0 0 0 0 1 0 0 0 1 6
## 10000001.2 0 0 0 0 1 0 0 0 1 6
## 10000001.4 0 0 0 0 1 0 0 0 1 6
## 10000002.1 0 0 0 0 1 0 0 1 0 4
## 10000002.2 0 0 0 0 1 0 0 1 0 4
## 10000002.4 0 0 0 0 1 0 0 1 0 4
## 10000004.1 0 0 0 0 1 0 0 0 0 4
## 10000004.2 0 0 0 0 1 0 0 0 0 4
## 10000004.4 0 0 0 0 1 0 0 0 0 4
## 10000005.1 0 0 0 0 1 0 0 1 1 5
## minorsch unsafe dist_pov wave math age height
## 10000001.1 0.800 unsafe 2.22748174 1 44.9695 6.032500 -0.74750120
## 10000001.2 0.800 unsafe 2.22748174 2 61.5056 6.646667 -0.71445383
## 10000001.4 0.800 unsafe 2.22748174 4 79.5010 7.638333 -0.69549851
## 10000002.1 3.000 safe -0.67176163 1 30.1857 5.950833 0.01182955
## 10000002.2 3.000 safe -0.67176163 2 50.0694 6.460000 0.16591695
## 10000002.4 3.000 safe -0.67176163 4 71.0421 7.463333 0.01627224
## 10000004.1 1.800 unsafe 0.09119715 1 13.8410 6.032500 0.03840209
## 10000004.2 1.800 unsafe 0.09119715 2 31.0148 6.570000 0.42106884
## 10000004.4 1.800 unsafe 0.09119715 4 53.9906 7.435833 0.41691644
## 10000005.1 9.048 safe -1.28212866 1 27.6587 4.959167 -1.45613917
## health ses
## 10000001.1 1 -0.05343635
## 10000001.2 2 -0.05343635
## 10000001.4 3 -0.04820708
## 10000002.1 1 0.36092641
## 10000002.2 1 0.36092641
## 10000002.4 1 0.32296225
## 10000004.1 1 -0.18810425
## 10000004.2 1 -0.18810425
## 10000004.4 NA -0.32039792
## 10000005.1 1 0.37128548
Again, the GEE is used here instead of the (G)LMM.
#get non missing cases
e.long.comp<-e.long[complete.cases(e.long[, c("math", "black", "age", "health", "w1p0str")]),]
#basic linear model
fit.1<-glm(scale(math)~scale(age)+factor(wave)+male+black+hisp+asian+nahn+other+ses, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.1)
## Warning in summary.glm(fit.1): observations with zero weight not used for
## calculating dispersion
##
## Call:
## glm(formula = scale(math) ~ scale(age) + factor(wave) + male +
## black + hisp + asian + nahn + other + ses, data = e.long.comp,
## weights = w4c4p_40/mean(w4c4p_40))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7697 -0.2775 0.0000 0.3059 3.2424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.512455 0.009481 -54.049 < 2e-16 ***
## scale(age) 0.229757 0.006811 33.735 < 2e-16 ***
## factor(wave)2 0.561986 0.009926 56.619 < 2e-16 ***
## factor(wave)4 1.342453 0.016552 81.106 < 2e-16 ***
## male 0.016802 0.007100 2.366 0.01796 *
## black -0.273801 0.011200 -24.447 < 2e-16 ***
## hisp -0.198407 0.009379 -21.154 < 2e-16 ***
## asian 0.078785 0.019471 4.046 5.22e-05 ***
## nahn -0.084099 0.026261 -3.202 0.00136 **
## other -0.037226 0.017937 -2.075 0.03796 *
## ses 0.273429 0.004523 60.447 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.4027532)
##
## Null deviance: 32333 on 26821 degrees of freedom
## Residual deviance: 10798 on 26811 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 2
#Get residuals and put them in a data frame
e.long.comp$resid<- residuals(fit.1)
test<-reshape(e.long.comp, idvar = "childid", v.names = "resid", timevar = "wave", direction = "wide", drop = names(e.long.comp)[2:42])
## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (math,age,height,health,ses) are really
## varying
head(test[order(test$childid),c(1,7:9)], n=12)
## childid resid.1 resid.2 resid.4
## 10000001.1 10000001 0.5801180 0.738576193 0.6370686
## 10000002.1 10000002 -0.3765893 0.082218058 0.1692755
## 10000004.1 10000004 -0.8097529 -0.650357745 NA
## 10000005.1 10000005 0.0000000 0.000000000 NA
## 10000006.1 10000006 -1.0226900 -1.774759628 -2.1690247
## 10000007.1 10000007 -0.1233800 -0.293986936 NA
## 10000008.1 10000008 -0.2025946 0.233898881 0.4492969
## 10000009.1 10000009 0.0000000 NA NA
## 10000010.1 10000010 0.0000000 0.000000000 NA
## 10000011.1 10000011 -0.2272625 -0.149493290 0.4408888
## 10000012.1 10000012 0.2332129 0.477174969 0.2300501
## 10000014.1 10000014 0.7617973 -0.001425345 0.3500414
Here is our acutal correlation matrix in the residuals between waves:
cor(test[, 7:9], use="pairwise.complete")
## resid.1 resid.2 resid.4
## resid.1 1.0000000 0.7721607 0.6697974
## resid.2 0.7721607 1.0000000 0.7707695
## resid.4 0.6697974 0.7707695 1.0000000
This is certainly not independence, and looks more like an AR(1), becuase the correlation decreases as the difference between wave number increases.
Now we fit a GEE:
#Model with independent correlation, meaning ZERO correlation
fit.1b<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="independence", data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.1b)
## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40),
## id = childid, waves = wave, corstr = "independence")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.053547 0.012169 19.362 1.08e-05 ***
## scale(age) 0.699604 0.005636 15410.897 < 2e-16 ***
## male -0.018433 0.014473 1.622 0.203
## black -0.243626 0.023604 106.534 < 2e-16 ***
## hisp -0.129127 0.020005 41.664 1.08e-10 ***
## asian 0.170758 0.034379 24.671 6.80e-07 ***
## nahn -0.097499 0.072862 1.791 0.181
## other -0.003804 0.036275 0.011 0.916
## ses 0.282735 0.009173 949.927 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.4184 0.007092
##
## Correlation: Structure = independenceNumber of clusters: 12942 Maximum cluster size: 3
#Model with Exchangeable correlation:
fit.2<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="exchangeable", data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.2)
## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40),
## id = childid, waves = wave, corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.04339 0.01303 11.08 0.00087 ***
## scale(age) 0.84428 0.00284 88102.14 < 2e-16 ***
## male -0.03158 0.01562 4.09 0.04322 *
## black -0.26806 0.02504 114.64 < 2e-16 ***
## hisp -0.14998 0.02097 51.16 8.5e-13 ***
## asian 0.21416 0.03857 30.83 2.8e-08 ***
## nahn -0.15576 0.08142 3.66 0.05573 .
## other 0.00108 0.03881 0.00 0.97776
## ses 0.21601 0.00973 493.29 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.444 0.00825
##
## Correlation: Structure = exchangeable Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.831 0.0196
## Number of clusters: 12942 Maximum cluster size: 3
The second model shows the exchangeable correlation to be 0.831, which is certainly different from our measured correlations above
| resid.1 | resid.2 | resid.4 | |
|---|---|---|---|
| resid.1 | 1.000 | 0.772 | 0.670 |
| resid.2 | 0.772 | 1.000 | 0.771 |
| resid.4 | 0.670 | 0.771 | 1.000 |
Now we examine the other correlation types:
fit.3<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="ar1", data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.3)
## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40),
## id = childid, waves = wave, corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.02011 0.01298 2.40 0.121
## scale(age) 0.82475 0.00293 79365.99 < 2e-16 ***
## male -0.02316 0.01557 2.21 0.137
## black -0.26549 0.02479 114.68 < 2e-16 ***
## hisp -0.15515 0.02095 54.85 1.3e-13 ***
## asian 0.21865 0.03877 31.81 1.7e-08 ***
## nahn -0.15118 0.07959 3.61 0.057 .
## other -0.00150 0.03917 0.00 0.969
## ses 0.22625 0.00945 573.00 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.438 0.00803
##
## Correlation: Structure = ar1 Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.868 0.0139
## Number of clusters: 12942 Maximum cluster size: 3
This is how to fit the model, but it crashes on this data:
#fit.4<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="unstructured", data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
#summary(fit.4)
Since GEE’s aren’t fit via maximum likelihood, they aren’t comparable in terms of AIC or likelihood ratio tests. However, Pan, 2001 describe an information criterion using a Quasi-likelihood formulation. This can be used to compare models with alternative correlation structures, with the lowese QIC representing the best fitting model.
QIC(fit.1b)
## QIC
## -29017
QIC(fit.2)
## QIC
## -29009
QIC(fit.3)
## QIC
## -29010
So, it looks like the exchangeable correlation structure is certainly better than independence, but the AR(1) structure also fits as well.
Here we use the GEE for a binomial outcome.
btest<-glm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses+factor(wave) , family=binomial, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
## Warning: non-integer #successes in a binomial glm!
e.long.comp$binomresid<- residuals(btest)
test2<-reshape(e.long.comp, idvar = "childid", v.names = "binomresid", timevar = "wave", direction = "wide", drop = names(e.long.comp)[2:42])
## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (math,age,height,health,ses,resid) are
## really varying
head(test2[order(test$childid),c(1, (8:10))], n=12)
## childid binomresid.1 binomresid.2 binomresid.4
## 10000001.1 10000001 -0.502 -0.497 2.109
## 10000002.1 10000002 -0.504 -0.495 -0.482
## 10000004.1 10000004 -0.380 -0.374 NA
## 10000005.1 10000005 0.000 0.000 NA
## 10000006.1 10000006 -0.969 -0.956 -0.976
## 10000007.1 10000007 -0.652 -0.640 NA
## 10000008.1 10000008 -0.266 -0.264 -0.248
## 10000009.1 10000009 0.000 NA NA
## 10000010.1 10000010 0.000 0.000 NA
## 10000011.1 10000011 -0.324 2.214 2.335
## 10000012.1 10000012 -1.281 -1.256 -1.281
## 10000014.1 10000014 -0.509 -0.493 -0.494
#empirical correlations between waves in the residuals
cor(test2[, 8:10], use="pairwise.complete")
## binomresid.1 binomresid.2 binomresid.4
## binomresid.1 1.000 0.401 0.304
## binomresid.2 0.401 1.000 0.339
## binomresid.4 0.304 0.339 1.000
#basic linear model
fitb.1<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid ,corstr ="independence", family=binomial, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
## Warning: non-integer #successes in a binomial glm!
summary(fitb.1)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave,
## corstr = "independence")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.1912 0.2083 110.69 < 2e-16 ***
## age -0.0165 0.0311 0.28 0.5958
## male 0.1529 0.0573 7.12 0.0076 **
## black 0.3917 0.0899 18.97 1.3e-05 ***
## hisp 0.5402 0.0729 54.96 1.2e-13 ***
## asian 0.7538 0.1159 42.30 7.8e-11 ***
## nahn 0.1891 0.2058 0.84 0.3582
## other 0.1132 0.1404 0.65 0.4199
## ses -0.5855 0.0383 233.24 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 1.01 0.0731
##
## Correlation: Structure = independenceNumber of clusters: 12942 Maximum cluster size: 3
fitb.2<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="exchangeable", weights=w4c4p_40/mean(w4c4p_40))
## Warning: non-integer #successes in a binomial glm!
summary(fitb.2)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave,
## corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.0117 0.1930 108.70 < 2e-16 ***
## age -0.0456 0.0287 2.53 0.1118
## male 0.1570 0.0570 7.58 0.0059 **
## black 0.4045 0.0896 20.35 6.4e-06 ***
## hisp 0.5520 0.0722 58.54 2.0e-14 ***
## asian 0.7513 0.1146 42.98 5.5e-11 ***
## nahn 0.1750 0.2048 0.73 0.3929
## other 0.1134 0.1393 0.66 0.4157
## ses -0.5523 0.0375 217.31 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 1 0.0695
##
## Correlation: Structure = exchangeable Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.338 0.0338
## Number of clusters: 12942 Maximum cluster size: 3
fitb.3<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="ar1", weights=w4c4p_40/mean(w4c4p_40))
## Warning: non-integer #successes in a binomial glm!
summary(fitb.3)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave,
## corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.0451 0.1956 109.28 < 2e-16 ***
## age -0.0442 0.0291 2.32 0.1281
## male 0.1716 0.0572 8.99 0.0027 **
## black 0.4285 0.0898 22.76 1.8e-06 ***
## hisp 0.5642 0.0723 60.98 5.8e-15 ***
## asian 0.8031 0.1148 48.96 2.6e-12 ***
## nahn 0.2259 0.2101 1.16 0.2821
## other 0.1486 0.1419 1.10 0.2949
## ses -0.5699 0.0380 224.94 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 1.01 0.0744
##
## Correlation: Structure = ar1 Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.422 0.0372
## Number of clusters: 12942 Maximum cluster size: 3
QIC(fitb.1)
## QIC
## 20816
QIC(fitb.2)
## Warning: non-integer #successes in a binomial glm!
## QIC
## 20815
QIC(fitb.3)
## Warning: non-integer #successes in a binomial glm!
## QIC
## 20816
In the binomial case, it looks like the independence correlation structure is good.