DEM 7903 Week 6: Longitudinal Models for Change using Generalized Estimating Equations

In this example, we will use Generalized Estimating Equations to do some longitudinal modeling of data from the ECLS-K 2011. Specifically, we will model changes in a student’s standardized math score as a continuous outcome and self rated health as a binomial outcome, from fall kindergarten to spring, 1st grade.

A presentation discussing GEE’s can be found here.

Data and recodes

First we load our data

load("~/Google Drive/dem7903_App_Hier/data/eclsk_11.Rdata")
names(eclsk11)<-tolower(names(eclsk11))
library (car)
library(geepack)
library(MuMIn)
#get out only the variables I'm going to use for this example

myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk1", "x2mscalk1", "x3mscalk1", "x4mscalk1", "p1hscale", "p2hscale", "p4hscale","x2fsstat2","x4fsstat2", "x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" , "w1c0", "w1p0", "w2p0", "w1c0str", "w1p0str",
        "w4c4p_40","w4c4p_4str", "w4c4p_4psu",  "w1c0psu", "w1p0psu", "x1height","x2height","x3height","x4height", "x1kage_r","x2kage_r","x3age","x4age")

#subset the data
eclsk.sub<-eclsk11[,myvars]

rm(eclsk11); gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1564235 83.6    2637877  140.9   1819863   97.2
## Vcells 2899441 22.2  219477930 1674.5 182812640 1394.8

Next, I do some recoding of variables. I have to make the repeated measures of each of my longitudinal variables.

#Longitudinal variables
#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk1<0, NA, eclsk.sub$x1mscalk1)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk1<0, NA, eclsk.sub$x2mscalk1)
#eclsk.sub$math3<-ifelse(eclsk.sub$x3mscalk1<0, NA, eclsk.sub$x3mscalk1)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk1<0, NA, eclsk.sub$x4mscalk1)

#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
#eclsk.sub$height3<-ifelse(eclsk.sub$x3height<=-7, NA, eclsk.sub$x3height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)

#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)

eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]

#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)


#Household food insecurity, dichotomous outcome
#This outcome is only present at two waves
eclsk.sub$foodinsec1<-recode(eclsk.sub$x2fsstat2, recodes="2:3=1; 1=0; else=NA")
eclsk.sub$foodinsec2<-recode(eclsk.sub$x4fsstat2, recodes="2:3=1; 1=0; else=NA")


#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)

#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))

That does it for our time varying variables. We now code some variables from the kindergarten wave to use as predictors:

#Non time varying variables
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")


#Then we do some household level variables

#Urban school location = 1
eclsk.sub$urban<-recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")

#poverty level in poverty = 1
eclsk.sub$pov<-recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")

#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal

#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)

#Unsafe neighborhood
eclsk.sub$unsafe<-recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor.result = T)

#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))

Reshaping data into longitudinal format

To analyze data longitudinally, we must reshape the data fron its current “wide” format, where each repeated measure is a column, into the “long” format, where there is a single column for a particular variable, and we account for the repeated measurements of each person. In this case, I’m going to use three waves of data, so each child can contribute up to three lines to the data.

e.long<-reshape(eclsk.sub, idvar="childid", varying=list(math = c("math1", "math2",  "math4"),
                                         age = c("age_yrs1", "age_yrs2",  "age_yrs4"),
                                         height= c("height_z1", "height_z2", "height_z4"),
                                         health=c("chhealth1", "chhealth2", "chhealth4"), 
                                         ses = c("hhses1", "hhses1", "hhses4")),
                                        v.names = c("math", "age", "height", "health", "ses"),
                                         timevar="wave",times=c(1,2,4),direction="long",  
                                          drop = names(eclsk.sub)[c(2:20, 22:25)])

e.long<-e.long[order(e.long$childid, e.long$wave),]
head(e.long, n=10)

##             childid p2parct2 x_distpov      w1c0     w1p0     w2p0 w1c0str
## 10000001.1 10000001        1        45 188.09105 211.4320 215.1901      70
## 10000001.2 10000001        1        45 188.09105 211.4320 215.1901      70
## 10000001.4 10000001        1        45 188.09105 211.4320 215.1901      70
## 10000002.1 10000002        1         7 237.88083 274.5976 327.8694      45
## 10000002.2 10000002        1         7 237.88083 274.5976 327.8694      45
## 10000002.4 10000002        1         7 237.88083 274.5976 327.8694      45
## 10000004.1 10000004        1        17  96.36698 103.5068 111.8929      51
## 10000004.2 10000004        1        17  96.36698 103.5068 111.8929      51
## 10000004.4 10000004        1        17  96.36698 103.5068 111.8929      51
## 10000005.1 10000005        3        -1 178.54929 237.3439 267.5328      30
##            w1p0str w4c4p_40 w4c4p_4str w4c4p_4psu w1c0psu w1p0psu x1height
## 10000001.1      70 322.0055         69          3       3       3    44.50
## 10000001.2      70 322.0055         69          3       3       3    44.50
## 10000001.4      70 322.0055         69          3       3       3    44.50
## 10000002.1      45 474.8417         43          2       2       2    45.63
## 10000002.2      45 474.8417         43          2       2       2    45.63
## 10000002.4      45 474.8417         43          2       2       2    45.63
## 10000004.1      51 171.4766         49          2       2       2    46.00
## 10000004.2      51 171.4766         49          2       2       2    46.00
## 10000004.4      51 171.4766         49          2       2       2    46.00
## 10000005.1      30   0.0000         NA         NA       3       3    40.80
##            x2height x3height x4height x1kage_r x2kage_r x3age x4age
## 10000001.1    45.75       NA     48.0    72.39    79.76    NA 91.66
## 10000001.2    45.75       NA     48.0    72.39    79.76    NA 91.66
## 10000001.4    45.75       NA     48.0    72.39    79.76    NA 91.66
## 10000002.1    47.00       NA     49.5    71.41    77.52    NA 89.56
## 10000002.2    47.00       NA     49.5    71.41    77.52    NA 89.56
## 10000002.4    47.00       NA     49.5    71.41    77.52    NA 89.56
## 10000004.1    48.00       NA     50.0    72.39    78.84    NA 89.23
## 10000004.2    48.00       NA     50.0    72.39    78.84    NA 89.23
## 10000004.4    48.00       NA     50.0    72.39    78.84    NA 89.23
## 10000005.1    42.00       NA       NA    59.51    65.06    NA    NA
##            height1 height2 height4 foodinsec1 foodinsec2 male hisp black
## 10000001.1   44.50   45.75    48.0          0          0    1    0     0
## 10000001.2   44.50   45.75    48.0          0          0    1    0     0
## 10000001.4   44.50   45.75    48.0          0          0    1    0     0
## 10000002.1   45.63   47.00    49.5          0          0    0    0     0
## 10000002.2   45.63   47.00    49.5          0          0    0    0     0
## 10000002.4   45.63   47.00    49.5          0          0    0    0     0
## 10000004.1   46.00   48.00    50.0          0         NA    1    0     0
## 10000004.2   46.00   48.00    50.0          0         NA    1    0     0
## 10000004.4   46.00   48.00    50.0          0         NA    1    0     0
## 10000005.1   40.80   42.00      NA          0         NA    0    1     0
##            asian nahn other lths gths single notmar urban pov hhsize
## 10000001.1     0    0     0    0    1      0      0     0   1      6
## 10000001.2     0    0     0    0    1      0      0     0   1      6
## 10000001.4     0    0     0    0    1      0      0     0   1      6
## 10000002.1     0    0     0    0    1      0      0     1   0      4
## 10000002.2     0    0     0    0    1      0      0     1   0      4
## 10000002.4     0    0     0    0    1      0      0     1   0      4
## 10000004.1     0    0     0    0    1      0      0     0   0      4
## 10000004.2     0    0     0    0    1      0      0     0   0      4
## 10000004.4     0    0     0    0    1      0      0     0   0      4
## 10000005.1     0    0     0    0    1      0      0     1   1      5
##            minorsch unsafe    dist_pov wave    math      age      height
## 10000001.1    0.800 unsafe  2.22748174    1 44.9695 6.032500 -0.74750120
## 10000001.2    0.800 unsafe  2.22748174    2 61.5056 6.646667 -0.71445383
## 10000001.4    0.800 unsafe  2.22748174    4 79.5010 7.638333 -0.69549851
## 10000002.1    3.000   safe -0.67176163    1 30.1857 5.950833  0.01182955
## 10000002.2    3.000   safe -0.67176163    2 50.0694 6.460000  0.16591695
## 10000002.4    3.000   safe -0.67176163    4 71.0421 7.463333  0.01627224
## 10000004.1    1.800 unsafe  0.09119715    1 13.8410 6.032500  0.03840209
## 10000004.2    1.800 unsafe  0.09119715    2 31.0148 6.570000  0.42106884
## 10000004.4    1.800 unsafe  0.09119715    4 53.9906 7.435833  0.41691644
## 10000005.1    9.048   safe -1.28212866    1 27.6587 4.959167 -1.45613917
##            health         ses
## 10000001.1      1 -0.05343635
## 10000001.2      2 -0.05343635
## 10000001.4      3 -0.04820708
## 10000002.1      1  0.36092641
## 10000002.2      1  0.36092641
## 10000002.4      1  0.32296225
## 10000004.1      1 -0.18810425
## 10000004.2      1 -0.18810425
## 10000004.4     NA -0.32039792
## 10000005.1      1  0.37128548

Modeling

Longitudinal Models using GEE’s

Again, the GEE is used here instead of the (G)LMM.

#get non missing cases
e.long.comp<-e.long[complete.cases(e.long[, c("math", "black", "age", "health", "w1p0str")]),]
#basic linear model
fit.1<-glm(scale(math)~scale(age)+factor(wave)+male+black+hisp+asian+nahn+other+ses, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.1)

## Warning in summary.glm(fit.1): observations with zero weight not used for
## calculating dispersion

## 
## Call:
## glm(formula = scale(math) ~ scale(age) + factor(wave) + male + 
##     black + hisp + asian + nahn + other + ses, data = e.long.comp, 
##     weights = w4c4p_40/mean(w4c4p_40))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7697  -0.2775   0.0000   0.3059   3.2424  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.512455   0.009481 -54.049  < 2e-16 ***
## scale(age)     0.229757   0.006811  33.735  < 2e-16 ***
## factor(wave)2  0.561986   0.009926  56.619  < 2e-16 ***
## factor(wave)4  1.342453   0.016552  81.106  < 2e-16 ***
## male           0.016802   0.007100   2.366  0.01796 *  
## black         -0.273801   0.011200 -24.447  < 2e-16 ***
## hisp          -0.198407   0.009379 -21.154  < 2e-16 ***
## asian          0.078785   0.019471   4.046 5.22e-05 ***
## nahn          -0.084099   0.026261  -3.202  0.00136 ** 
## other         -0.037226   0.017937  -2.075  0.03796 *  
## ses            0.273429   0.004523  60.447  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4027532)
## 
##     Null deviance: 32333  on 26821  degrees of freedom
## Residual deviance: 10798  on 26811  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 2

#Get residuals and put them in a data frame
e.long.comp$resid<- residuals(fit.1)

test<-reshape(e.long.comp, idvar = "childid", v.names = "resid", timevar = "wave", direction = "wide", drop = names(e.long.comp)[2:42])

## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (math,age,height,health,ses) are really
## varying

head(test[order(test$childid),c(1,7:9)], n=12)

##             childid    resid.1      resid.2    resid.4
## 10000001.1 10000001  0.5801180  0.738576193  0.6370686
## 10000002.1 10000002 -0.3765893  0.082218058  0.1692755
## 10000004.1 10000004 -0.8097529 -0.650357745         NA
## 10000005.1 10000005  0.0000000  0.000000000         NA
## 10000006.1 10000006 -1.0226900 -1.774759628 -2.1690247
## 10000007.1 10000007 -0.1233800 -0.293986936         NA
## 10000008.1 10000008 -0.2025946  0.233898881  0.4492969
## 10000009.1 10000009  0.0000000           NA         NA
## 10000010.1 10000010  0.0000000  0.000000000         NA
## 10000011.1 10000011 -0.2272625 -0.149493290  0.4408888
## 10000012.1 10000012  0.2332129  0.477174969  0.2300501
## 10000014.1 10000014  0.7617973 -0.001425345  0.3500414

Here is our acutal correlation matrix in the residuals between waves:

cor(test[, 7:9], use="pairwise.complete")

##           resid.1   resid.2   resid.4
## resid.1 1.0000000 0.7721607 0.6697974
## resid.2 0.7721607 1.0000000 0.7707695
## resid.4 0.6697974 0.7707695 1.0000000

This is certainly not independence, and looks more like an AR(1), becuase the correlation decreases as the difference between wave number increases.

Now we fit a GEE:

#Model with independent correlation, meaning ZERO correlation
fit.1b<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="independence",   data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
summary(fit.1b)

## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40), 
##     id = childid, waves = wave, corstr = "independence")
## 
##  Coefficients:
##              Estimate   Std.err      Wald Pr(>|W|)    
## (Intercept)  0.053547  0.012169    19.362 1.08e-05 ***
## scale(age)   0.699604  0.005636 15410.897  < 2e-16 ***
## male        -0.018433  0.014473     1.622    0.203    
## black       -0.243626  0.023604   106.534  < 2e-16 ***
## hisp        -0.129127  0.020005    41.664 1.08e-10 ***
## asian        0.170758  0.034379    24.671 6.80e-07 ***
## nahn        -0.097499  0.072862     1.791    0.181    
## other       -0.003804  0.036275     0.011    0.916    
## ses          0.282735  0.009173   949.927  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate  Std.err
## (Intercept)   0.4184 0.007092
## 
## Correlation: Structure = independenceNumber of clusters:   12942   Maximum cluster size: 3

#Model with Exchangeable correlation:
fit.2<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="exchangeable",  data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
  
summary(fit.2)

## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40), 
##     id = childid, waves = wave, corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate  Std.err     Wald Pr(>|W|)    
## (Intercept)  0.04339  0.01303    11.08  0.00087 ***
## scale(age)   0.84428  0.00284 88102.14  < 2e-16 ***
## male        -0.03158  0.01562     4.09  0.04322 *  
## black       -0.26806  0.02504   114.64  < 2e-16 ***
## hisp        -0.14998  0.02097    51.16  8.5e-13 ***
## asian        0.21416  0.03857    30.83  2.8e-08 ***
## nahn        -0.15576  0.08142     3.66  0.05573 .  
## other        0.00108  0.03881     0.00  0.97776    
## ses          0.21601  0.00973   493.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)    0.444 0.00825
## 
## Correlation: Structure = exchangeable  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.831  0.0196
## Number of clusters:   12942   Maximum cluster size: 3

The second model shows the exchangeable correlation to be 0.831, which is certainly different from our measured correlations above

	resid.1	resid.2	resid.4
resid.1	1.000	0.772	0.670
resid.2	0.772	1.000	0.771
resid.4	0.670	0.771	1.000

Now we examine the other correlation types:

fit.3<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="ar1",  data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
  
summary(fit.3)

## Warning in summary.glm(object): observations with zero weight not used for
## calculating dispersion

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w4c4p_40/mean(w4c4p_40), 
##     id = childid, waves = wave, corstr = "ar1")
## 
##  Coefficients:
##             Estimate  Std.err     Wald Pr(>|W|)    
## (Intercept)  0.02011  0.01298     2.40    0.121    
## scale(age)   0.82475  0.00293 79365.99  < 2e-16 ***
## male        -0.02316  0.01557     2.21    0.137    
## black       -0.26549  0.02479   114.68  < 2e-16 ***
## hisp        -0.15515  0.02095    54.85  1.3e-13 ***
## asian        0.21865  0.03877    31.81  1.7e-08 ***
## nahn        -0.15118  0.07959     3.61    0.057 .  
## other       -0.00150  0.03917     0.00    0.969    
## ses          0.22625  0.00945   573.00  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)    0.438 0.00803
## 
## Correlation: Structure = ar1  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.868  0.0139
## Number of clusters:   12942   Maximum cluster size: 3

This is how to fit the model, but it crashes on this data:

#fit.4<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="unstructured",  data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))
#summary(fit.4)

Since GEE’s aren’t fit via maximum likelihood, they aren’t comparable in terms of AIC or likelihood ratio tests. However, Pan, 2001 describe an information criterion using a Quasi-likelihood formulation. This can be used to compare models with alternative correlation structures, with the lowese QIC representing the best fitting model.

QIC(fit.1b)

##    QIC 
## -29017

QIC(fit.2)

##    QIC 
## -29009

QIC(fit.3)

##    QIC 
## -29010

So, it looks like the exchangeable correlation structure is certainly better than independence, but the AR(1) structure also fits as well.

Binary response longitudinal model

Here we use the GEE for a binomial outcome.

btest<-glm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses+factor(wave) , family=binomial, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))

## Warning: non-integer #successes in a binomial glm!

e.long.comp$binomresid<- residuals(btest)
test2<-reshape(e.long.comp, idvar = "childid", v.names = "binomresid", timevar = "wave", direction = "wide", drop = names(e.long.comp)[2:42])

## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (math,age,height,health,ses,resid) are
## really varying

head(test2[order(test$childid),c(1, (8:10))], n=12)

##             childid binomresid.1 binomresid.2 binomresid.4
## 10000001.1 10000001       -0.502       -0.497        2.109
## 10000002.1 10000002       -0.504       -0.495       -0.482
## 10000004.1 10000004       -0.380       -0.374           NA
## 10000005.1 10000005        0.000        0.000           NA
## 10000006.1 10000006       -0.969       -0.956       -0.976
## 10000007.1 10000007       -0.652       -0.640           NA
## 10000008.1 10000008       -0.266       -0.264       -0.248
## 10000009.1 10000009        0.000           NA           NA
## 10000010.1 10000010        0.000        0.000           NA
## 10000011.1 10000011       -0.324        2.214        2.335
## 10000012.1 10000012       -1.281       -1.256       -1.281
## 10000014.1 10000014       -0.509       -0.493       -0.494

#empirical correlations between waves in the residuals
cor(test2[, 8:10], use="pairwise.complete")

##              binomresid.1 binomresid.2 binomresid.4
## binomresid.1        1.000        0.401        0.304
## binomresid.2        0.401        1.000        0.339
## binomresid.4        0.304        0.339        1.000

#basic linear model
fitb.1<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid ,corstr ="independence", family=binomial, data=e.long.comp, weights=w4c4p_40/mean(w4c4p_40))

## Warning: non-integer #successes in a binomial glm!

summary(fitb.1)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave, 
##     corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.1912  0.2083 110.69  < 2e-16 ***
## age          -0.0165  0.0311   0.28   0.5958    
## male          0.1529  0.0573   7.12   0.0076 ** 
## black         0.3917  0.0899  18.97  1.3e-05 ***
## hisp          0.5402  0.0729  54.96  1.2e-13 ***
## asian         0.7538  0.1159  42.30  7.8e-11 ***
## nahn          0.1891  0.2058   0.84   0.3582    
## other         0.1132  0.1404   0.65   0.4199    
## ses          -0.5855  0.0383 233.24  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)     1.01  0.0731
## 
## Correlation: Structure = independenceNumber of clusters:   12942   Maximum cluster size: 3

fitb.2<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="exchangeable", weights=w4c4p_40/mean(w4c4p_40))

## Warning: non-integer #successes in a binomial glm!

summary(fitb.2)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave, 
##     corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.0117  0.1930 108.70  < 2e-16 ***
## age          -0.0456  0.0287   2.53   0.1118    
## male          0.1570  0.0570   7.58   0.0059 ** 
## black         0.4045  0.0896  20.35  6.4e-06 ***
## hisp          0.5520  0.0722  58.54  2.0e-14 ***
## asian         0.7513  0.1146  42.98  5.5e-11 ***
## nahn          0.1750  0.2048   0.73   0.3929    
## other         0.1134  0.1393   0.66   0.4157    
## ses          -0.5523  0.0375 217.31  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)        1  0.0695
## 
## Correlation: Structure = exchangeable  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.338  0.0338
## Number of clusters:   12942   Maximum cluster size: 3

fitb.3<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="ar1", weights=w4c4p_40/mean(w4c4p_40))

## Warning: non-integer #successes in a binomial glm!

summary(fitb.3)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w4c4p_40/mean(w4c4p_40), id = childid, waves = wave, 
##     corstr = "ar1")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.0451  0.1956 109.28  < 2e-16 ***
## age          -0.0442  0.0291   2.32   0.1281    
## male          0.1716  0.0572   8.99   0.0027 ** 
## black         0.4285  0.0898  22.76  1.8e-06 ***
## hisp          0.5642  0.0723  60.98  5.8e-15 ***
## asian         0.8031  0.1148  48.96  2.6e-12 ***
## nahn          0.2259  0.2101   1.16   0.2821    
## other         0.1486  0.1419   1.10   0.2949    
## ses          -0.5699  0.0380 224.94  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)     1.01  0.0744
## 
## Correlation: Structure = ar1  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.422  0.0372
## Number of clusters:   12942   Maximum cluster size: 3

QIC(fitb.1)

##   QIC 
## 20816

QIC(fitb.2)

## Warning: non-integer #successes in a binomial glm!

##   QIC 
## 20815

QIC(fitb.3)

## Warning: non-integer #successes in a binomial glm!

##   QIC 
## 20816

In the binomial case, it looks like the independence correlation structure is good.