hw 8

Use a binary or count outcome, identify what this is and what you are measuring.

I’m going to use Health as a dichotomous outcome good or poor, with the 2017 PSID data set, using state as my higher level variable. My question is: Can we predict health using income, race and education and does this outcome vary by state?

h8 <- read_excel("~/Google Drive/Stat 2/hw 7/J257751.xlsx")
#table(h7$sex)
myscale <- function(x){as.numeric(scale(x))}
h8 <- h8 %>% 
  transmute(
    state = ER66004,
    age = ER66017,
    sex = ER66018,
    health = ER68420,
    race = as.factor(ifelse(ER70881 >= 1 & ER70881 <= 7, "hisp",
                  ifelse(ER70882 == 1, "nhw",
                         ifelse(ER70882 == 2, "nhb",
                                ifelse(ER70882 >= 4 & ER70882 <= 7, "other", NA))))),
    hhinc = ER71426,
    educ = ER71538,
    wt = ER71570
  ) %>% 
  filter(
    state <= 52,
    hhinc  <= 999999,
    age <=120,
    educ <= 17,
    health <= 5
  )
#table(h8$health)
#str(h8$health)
h8 <- h8[,1:7] %>% 
  transmute(
    state = state,
    hhinc = hhinc,
    age2 = age^2,
    sex = as.factor(ifelse(sex == 1, "male", "female")),
    health = as.numeric(ifelse(health <= 3, "0",
                    ifelse(health == 4 | health == 5, "1", NA))),#1 = poor, 0 = good
    educ = educ,
    race = race
  ) %>% 
  mutate_at(c("hhinc", "age2"), myscale)
#str(h8)
#table(h8$health)
h8 <- h8 %>% 
  mutate(
    nhw = ifelse(race == "nhw",1,0),
    nhb = ifelse( race == "nhb",1,0),
    hisp = ifelse(race == "hisp", 1,0),
    othr = ifelse(race == "other",1,0)
  )
h8 <- h8[complete.cases(h8), ]

pnhw <- h8 %>% 
  group_by(state) %>% 
  summarise(pnhw = mean(nhw), pnhb = mean(nhb), phisp = mean(hisp), pothr = mean(othr))

h8 <- left_join(h8,pnhw)

## Joining, by = "state"

h8 <- h8 %>% 
  mutate_at(c("pnhw","pnhb","phisp","pothr"), myscale)
h8$race <- relevel(h8$race, ref = "nhw")
h8$sex <- relevel(h8$sex, ref = "male")
str(h8)

## Classes 'tbl_df', 'tbl' and 'data.frame':    9058 obs. of  15 variables:
##  $ state : num  47 47 47 20 47 20 47 47 47 47 ...
##  $ hhinc : num  -0.858 -0.216 -0.959 0.257 -0.678 ...
##  $ age2  : num  0.656 0.518 -0.311 -0.361 -0.974 ...
##  $ sex   : Factor w/ 2 levels "male","female": 2 2 2 1 2 1 2 1 1 1 ...
##  $ health: num  1 0 1 0 0 0 1 0 0 0 ...
##  $ educ  : num  11 12 12 10 12 12 12 12 14 12 ...
##  $ race  : Factor w/ 4 levels "nhw","hisp","nhb",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ nhw   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ nhb   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hisp  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ othr  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pnhw  : num  1.049 1.049 1.049 0.961 1.049 ...
##  $ pnhb  : num  -0.613 -0.613 -0.613 -1.031 -0.613 ...
##  $ phisp : num  -0.5146 -0.5146 -0.5146 0.0848 -0.5146 ...
##  $ pothr : num  -1.12 -1.12 -1.12 1.27 -1.12 ...

Conduct a multi-level analysis where you:

1) Test if a generalized linear mixed model (GLMM) is justified based on the GLM ANOVA appropriate for your outcome.

###glm anova
options(survey.lonely.psu = "adjust")
#des1 <- svydesign(ids = ~1, weights = ~wt, data = h8 )
fit.an <- anova(glm(health ~ as.factor(state), h8, family = "binomial"), test = "Chisq")
print(fit.an)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: health
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                              9057     8776.4              
## as.factor(state) 47   91.191      9010     8685.2 0.0001187 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA we can see that there is indeed between the states, so we may proceed to the multi-level model

2) Estimate a multi-level model where you control for individual level predictors and at least 1 group level predictor

#looking at how many people in each state
head(data.frame(name = table(h8$state), id = unique(h8$state)))

##   name.Var1 name.Freq id
## 1         0        50 47
## 2         1       138 20
## 3         2        14 29
## 4         4       136 22
## 5         5       180 31
## 6         6       866 17

From the

fit.null <- glmer(health~(1|state), data = h8, na.action = na.omit, family = "binomial")
arm::display(fit.null, detail = T)

## glmer(formula = health ~ (1 | state), data = h8, family = "binomial", 
##     na.action = na.omit)
##             coef.est coef.se z value Pr(>|z|)
## (Intercept)  -1.50     0.05  -32.94    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  state    (Intercept) 0.19    
##  Residual             1.00    
## ---
## number of obs: 9058, groups: state, 48
## AIC = 8768.6, DIC = 8668.9
## deviance = 8716.8

summary(fit.null)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: health ~ (1 | state)
##    Data: h8
## 
##      AIC      BIC   logLik deviance df.resid 
##   8768.6   8782.8  -4382.3   8764.6     9056 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.5623 -0.5022 -0.4724 -0.4208  2.3845 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  state  (Intercept) 0.03582  0.1893  
## Number of obs: 9058, groups:  state, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.50456    0.04567  -32.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sjstats::icc(fit.null)

## 
## Intraclass Correlation Coefficient for Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: health ~ (1 | state)
## 
##   ICC (state): 0.0108

fit1 RE=states, FE=hhinc, age2, race, educ

fit1 <- glmer(health ~ hhinc + age2 + race + educ + sex + (1|state), family = "binomial",
              data = h8,
              control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
                                     optCtrl = list(maxfun = 2e9)))
summary(fit1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: health ~ hhinc + age2 + race + educ + sex + (1 | state)
##    Data: h8
## Control: 
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
## 
##      AIC      BIC   logLik deviance df.resid 
##   7918.4   7982.4  -3950.2   7900.4     9049 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3183 -0.5012 -0.3715 -0.2137 19.0948 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  state  (Intercept) 0.03255  0.1804  
## Number of obs: 9058, groups:  state, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.34968    0.16739  -2.089 0.036705 *  
## hhinc       -0.63006    0.05439 -11.584  < 2e-16 ***
## age2         0.43630    0.02669  16.348  < 2e-16 ***
## racehisp     0.11839    0.10570   1.120 0.262693    
## racenhb      0.25779    0.06995   3.685 0.000228 ***
## raceother    0.28783    0.20074   1.434 0.151612    
## educ        -0.11316    0.01162  -9.740  < 2e-16 ***
## sexfemale    0.11276    0.06306   1.788 0.073757 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) hhinc  age2   rachsp racnhb racthr educ  
## hhinc      0.270                                          
## age2      -0.164 -0.008                                   
## racehisp  -0.398  0.042  0.173                            
## racenhb   -0.278  0.151  0.221  0.374                     
## raceother -0.045  0.004  0.020  0.128  0.146              
## educ      -0.934 -0.278  0.096  0.293  0.113 -0.021       
## sexfemale  0.035  0.313 -0.079 -0.018 -0.147  0.023 -0.135

sjstats::icc(fit1)

## 
## Intraclass Correlation Coefficient for Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: health ~ hhinc + age2 + race + educ + sex + (1 | state)
## 
##   ICC (state): 0.0098

a) Report the results for the fixed effects

It appears as if hhinc and years of education are both significant and protective while being nhb is significant and in less health compared to nhwand age2 shows that people become less healthy as they age
#####b)report the variance components of the model
within states is 0.03299
ICC is 0.0098
not much variance between states

Without group level predictor
#####fit4 no group level predictor

fit4 <- glm(health ~ hhinc + age2 + race + educ + sex, family = "binomial",
              data = h8)
summary(fit4)

## 
## Call:
## glm(formula = health ~ hhinc + age2 + race + educ + sex, family = "binomial", 
##     data = h8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8540  -0.6710  -0.5135  -0.3075   3.4216  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.29435    0.16249  -1.812  0.07006 .  
## hhinc       -0.62638    0.05401 -11.598  < 2e-16 ***
## age2         0.42843    0.02642  16.214  < 2e-16 ***
## racehisp     0.09608    0.09989   0.962  0.33611    
## racenhb      0.22697    0.06560   3.460  0.00054 ***
## raceother    0.27873    0.19843   1.405  0.16010    
## educ        -0.11429    0.01153  -9.916  < 2e-16 ***
## sexfemale    0.11756    0.06270   1.875  0.06080 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8776.4  on 9057  degrees of freedom
## Residual deviance: 7913.4  on 9050  degrees of freedom
## AIC: 7929.4
## 
## Number of Fisher Scoring iterations: 5

anova( fit1,fit4, test = "Chisq")

## Data: h8
## Models:
## fit4: health ~ hhinc + age2 + race + educ + sex
## fit1: health ~ hhinc + age2 + race + educ + sex + (1 | state)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit4  8 7929.4 7986.3 -3956.7   7913.4                             
## fit1  9 7918.4 7982.4 -3950.2   7900.4 12.951      1  0.0003198 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

c) Compare your model to a model without the group level variable using a likelihood ratio test

It appears as if the the model with the group level predictor fits the data better as its AIC is 10 lower, however, it does not appear to be statistically significantly so as the Chi Square test shows no significance.

I want to check if controlling for the percentage of race by state fits a better model
#####fit2 RE=states FE=hhinc, age2, race, educ, pnhw, pnhb, pjisp, sex

fit2 <- glmer(health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp + pothr + (1|state), family = "binomial",
              data = h8,
              control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
                                     optCtrl = list(maxfun = 2e9)))

## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

summary(fit2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp +  
##     pothr + (1 | state)
##    Data: h8
## Control: 
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
## 
##      AIC      BIC   logLik deviance df.resid 
##   7923.0   8008.4  -3949.5   7899.0     9046 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3472 -0.5005 -0.3708 -0.2139 18.2610 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  state  (Intercept) 0.02953  0.1718  
## Number of obs: 9058, groups:  state, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.36581    0.16831  -2.173 0.029749 *  
## hhinc       -0.62808    0.05441 -11.543  < 2e-16 ***
## age2         0.43645    0.02669  16.353  < 2e-16 ***
## racehisp     0.14665    0.10891   1.347 0.178124    
## racenhb      0.27259    0.07465   3.652 0.000261 ***
## raceother    0.30463    0.20180   1.510 0.131156    
## educ        -0.11312    0.01162  -9.734  < 2e-16 ***
## sexfemale    0.11371    0.06306   1.803 0.071377 .  
## pnhw        -0.32868    0.75343  -0.436 0.662655    
## pnhb        -0.39377    0.82775  -0.476 0.634278    
## phisp       -0.25403    0.45404  -0.559 0.575834    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) hhinc  age2   rachsp racnhb racthr educ   sexfml pnhw  
## hhinc      0.265                                                        
## age2      -0.165 -0.008                                                 
## racehisp  -0.404  0.048  0.170                                          
## racenhb   -0.290  0.146  0.206  0.363                                   
## raceother -0.054  0.007  0.021  0.146  0.151                            
## educ      -0.929 -0.278  0.096  0.285  0.103 -0.020                     
## sexfemale  0.033  0.313 -0.079 -0.015 -0.132  0.024 -0.135              
## pnhw      -0.034  0.004  0.009  0.014  0.015  0.049  0.014 -0.003       
## pnhb      -0.028  0.003  0.009  0.008 -0.005  0.046  0.014 -0.004  0.998
## phisp     -0.022  0.000  0.007 -0.017  0.004  0.037  0.013 -0.004  0.993
##           pnhb  
## hhinc           
## age2            
## racehisp        
## racenhb         
## raceother       
## educ            
## sexfemale       
## pnhw            
## pnhb            
## phisp      0.994
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

fit3 RE = state & race, FE=hhinc, age2, educ, sex, pnhw, pnhb, phisp and pothr

fit3 <- glmer(health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr + (1|state) + (1|race), family = "binomial",
              data = h8,
              control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
                                     optCtrl = list(maxfun = 2e9)))

## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

summary(fit3)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr +  
##     (1 | state) + (1 | race)
##    Data: h8
## Control: 
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
## 
##      AIC      BIC   logLik deviance df.resid 
##   7926.5   7997.7  -3953.3   7906.5     9048 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2131 -0.5011 -0.3719 -0.2148 18.6204 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  state  (Intercept) 0.03102  0.1761  
##  race   (Intercept) 0.01080  0.1039  
## Number of obs: 9058, groups:  state, 48; race, 4
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.21010    0.16390  -1.282   0.1999    
## hhinc       -0.63370    0.05449 -11.629   <2e-16 ***
## age2         0.43311    0.02666  16.246   <2e-16 ***
## educ        -0.11364    0.01143  -9.939   <2e-16 ***
## sexfemale    0.11815    0.06301   1.875   0.0608 .  
## pnhw        -0.35503    0.76168  -0.466   0.6411    
## pnhb        -0.40980    0.83673  -0.490   0.6243    
## phisp       -0.26416    0.45887  -0.576   0.5648    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) hhinc  age2   educ   sexfml pnhw   pnhb  
## hhinc      0.306                                          
## age2      -0.093 -0.004                                   
## educ      -0.878 -0.281  0.087                            
## sexfemale  0.013  0.312 -0.081 -0.140                     
## pnhw      -0.026  0.005  0.011  0.016 -0.006              
## pnhb      -0.025  0.004  0.010  0.016 -0.006  0.998       
## phisp     -0.025  0.001  0.009  0.017 -0.006  0.993  0.994
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

3) Discuss the results of your model within the context of your question from above.

testing all models

anova( fit.null,fit1, fit2, fit3, fit4, test = "Chisq")

## Data: h8
## Models:
## fit.null: health ~ (1 | state)
## fit4: health ~ hhinc + age2 + race + educ + sex
## fit1: health ~ hhinc + age2 + race + educ + sex + (1 | state)
## fit3: health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr + 
## fit3:     (1 | state) + (1 | race)
## fit2: health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp + 
## fit2:     pothr + (1 | state)
##          Df    AIC    BIC  logLik deviance    Chisq Chi Df Pr(>Chisq)    
## fit.null  2 8768.6 8782.8 -4382.3   8764.6                               
## fit4      8 7929.4 7986.3 -3956.7   7913.4 851.2359      6  < 2.2e-16 ***
## fit1      9 7918.4 7982.4 -3950.2   7900.4  12.9508      1  0.0003198 ***
## fit3     10 7926.5 7997.7 -3953.3   7906.5   0.0000      1  1.0000000    
## fit2     12 7923.0 8008.4 -3949.5   7899.0   7.5202      2  0.0232810 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

stargazer::stargazer(fit.null, fit1, fit2, fit3, fit4, type = "html", style = "demography")


	health
	generalized linear				logistic
	mixed-effects
	Model 1	Model 2	Model 3	Model 4	Model 5

hhinc		-0.630^***	-0.628^***	-0.634^***	-0.626^***
		(0.054)	(0.054)	(0.054)	(0.054)
age2		0.436^***	0.436^***	0.433^***	0.428^***
		(0.027)	(0.027)	(0.027)	(0.026)
racehisp		0.118	0.147		0.096
		(0.106)	(0.109)		(0.100)
racenhb		0.258^***	0.273^***		0.227^***
		(0.070)	(0.075)		(0.066)
raceother		0.288	0.305		0.279
		(0.201)	(0.202)		(0.198)
educ		-0.113^***	-0.113^***	-0.114^***	-0.114^***
		(0.012)	(0.012)	(0.011)	(0.012)
sexfemale		0.113	0.114	0.118	0.118
		(0.063)	(0.063)	(0.063)	(0.063)
pnhw			-0.329	-0.355
			(0.753)	(0.762)
pnhb			-0.394	-0.410
			(0.828)	(0.837)
phisp			-0.254	-0.264
			(0.454)	(0.459)
Constant	-1.505^***	-0.350^*	-0.366^*	-0.210	-0.294
	(0.046)	(0.167)	(0.168)	(0.164)	(0.162)
N	9,058	9,058	9,058	9,058	9,058
Log Likelihood	-4,382.312	-3,950.218	-3,949.509	-3,953.269	-3,956.694
AIC	8,768.624	7,918.437	7,923.017	7,926.538	7,929.388
BIC	8,782.846	7,982.439	8,008.354	7,997.652

p < .05; p < .01; p < .001

From the models, we can see that first, health outcomes do vary by state. As for the models, it when looking at the ANOVA, only model 2 has problems, while all the other models appear to fit the data better than the GLM model with no Random Effects.

When looking at the comparison of all the models we can see that all models do better than the NULL and all models with RE do better than the purely GLM model. Of the models that included RE, Model 1 appears to have the best fit, with the lowest AIC (and lowest number of predictors, and thereby most parsimonious). The results of the outcomes do not change model by model, so it would appear that fit1 (aka model 1 fits best)