Iām going to use Health as a dichotomous outcome good or poor, with the 2017 PSID data set, using state as my higher level variable. My question is: Can we predict health using income, race and education and does this outcome vary by state?
h8 <- read_excel("~/Google Drive/Stat 2/hw 7/J257751.xlsx")
#table(h7$sex)
myscale <- function(x){as.numeric(scale(x))}
h8 <- h8 %>%
transmute(
state = ER66004,
age = ER66017,
sex = ER66018,
health = ER68420,
race = as.factor(ifelse(ER70881 >= 1 & ER70881 <= 7, "hisp",
ifelse(ER70882 == 1, "nhw",
ifelse(ER70882 == 2, "nhb",
ifelse(ER70882 >= 4 & ER70882 <= 7, "other", NA))))),
hhinc = ER71426,
educ = ER71538,
wt = ER71570
) %>%
filter(
state <= 52,
hhinc <= 999999,
age <=120,
educ <= 17,
health <= 5
)
#table(h8$health)
#str(h8$health)
h8 <- h8[,1:7] %>%
transmute(
state = state,
hhinc = hhinc,
age2 = age^2,
sex = as.factor(ifelse(sex == 1, "male", "female")),
health = as.numeric(ifelse(health <= 3, "0",
ifelse(health == 4 | health == 5, "1", NA))),#1 = poor, 0 = good
educ = educ,
race = race
) %>%
mutate_at(c("hhinc", "age2"), myscale)
#str(h8)
#table(h8$health)
h8 <- h8 %>%
mutate(
nhw = ifelse(race == "nhw",1,0),
nhb = ifelse( race == "nhb",1,0),
hisp = ifelse(race == "hisp", 1,0),
othr = ifelse(race == "other",1,0)
)
h8 <- h8[complete.cases(h8), ]
pnhw <- h8 %>%
group_by(state) %>%
summarise(pnhw = mean(nhw), pnhb = mean(nhb), phisp = mean(hisp), pothr = mean(othr))
h8 <- left_join(h8,pnhw)
## Joining, by = "state"
h8 <- h8 %>%
mutate_at(c("pnhw","pnhb","phisp","pothr"), myscale)
h8$race <- relevel(h8$race, ref = "nhw")
h8$sex <- relevel(h8$sex, ref = "male")
str(h8)
## Classes 'tbl_df', 'tbl' and 'data.frame': 9058 obs. of 15 variables:
## $ state : num 47 47 47 20 47 20 47 47 47 47 ...
## $ hhinc : num -0.858 -0.216 -0.959 0.257 -0.678 ...
## $ age2 : num 0.656 0.518 -0.311 -0.361 -0.974 ...
## $ sex : Factor w/ 2 levels "male","female": 2 2 2 1 2 1 2 1 1 1 ...
## $ health: num 1 0 1 0 0 0 1 0 0 0 ...
## $ educ : num 11 12 12 10 12 12 12 12 14 12 ...
## $ race : Factor w/ 4 levels "nhw","hisp","nhb",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ nhw : num 1 1 1 1 1 1 1 1 1 1 ...
## $ nhb : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hisp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ othr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pnhw : num 1.049 1.049 1.049 0.961 1.049 ...
## $ pnhb : num -0.613 -0.613 -0.613 -1.031 -0.613 ...
## $ phisp : num -0.5146 -0.5146 -0.5146 0.0848 -0.5146 ...
## $ pothr : num -1.12 -1.12 -1.12 1.27 -1.12 ...
###glm anova
options(survey.lonely.psu = "adjust")
#des1 <- svydesign(ids = ~1, weights = ~wt, data = h8 )
fit.an <- anova(glm(health ~ as.factor(state), h8, family = "binomial"), test = "Chisq")
print(fit.an)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: health
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 9057 8776.4
## as.factor(state) 47 91.191 9010 8685.2 0.0001187 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA we can see that there is indeed between the states, so we may proceed to the multi-level model
#looking at how many people in each state
head(data.frame(name = table(h8$state), id = unique(h8$state)))
## name.Var1 name.Freq id
## 1 0 50 47
## 2 1 138 20
## 3 2 14 29
## 4 4 136 22
## 5 5 180 31
## 6 6 866 17
From the
fit.null <- glmer(health~(1|state), data = h8, na.action = na.omit, family = "binomial")
arm::display(fit.null, detail = T)
## glmer(formula = health ~ (1 | state), data = h8, family = "binomial",
## na.action = na.omit)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -1.50 0.05 -32.94 0.00
##
## Error terms:
## Groups Name Std.Dev.
## state (Intercept) 0.19
## Residual 1.00
## ---
## number of obs: 9058, groups: state, 48
## AIC = 8768.6, DIC = 8668.9
## deviance = 8716.8
summary(fit.null)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: health ~ (1 | state)
## Data: h8
##
## AIC BIC logLik deviance df.resid
## 8768.6 8782.8 -4382.3 8764.6 9056
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.5623 -0.5022 -0.4724 -0.4208 2.3845
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.03582 0.1893
## Number of obs: 9058, groups: state, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.50456 0.04567 -32.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sjstats::icc(fit.null)
##
## Intraclass Correlation Coefficient for Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: health ~ (1 | state)
##
## ICC (state): 0.0108
fit1 <- glmer(health ~ hhinc + age2 + race + educ + sex + (1|state), family = "binomial",
data = h8,
control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
optCtrl = list(maxfun = 2e9)))
summary(fit1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: health ~ hhinc + age2 + race + educ + sex + (1 | state)
## Data: h8
## Control:
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
##
## AIC BIC logLik deviance df.resid
## 7918.4 7982.4 -3950.2 7900.4 9049
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3183 -0.5012 -0.3715 -0.2137 19.0948
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.03255 0.1804
## Number of obs: 9058, groups: state, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.34968 0.16739 -2.089 0.036705 *
## hhinc -0.63006 0.05439 -11.584 < 2e-16 ***
## age2 0.43630 0.02669 16.348 < 2e-16 ***
## racehisp 0.11839 0.10570 1.120 0.262693
## racenhb 0.25779 0.06995 3.685 0.000228 ***
## raceother 0.28783 0.20074 1.434 0.151612
## educ -0.11316 0.01162 -9.740 < 2e-16 ***
## sexfemale 0.11276 0.06306 1.788 0.073757 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) hhinc age2 rachsp racnhb racthr educ
## hhinc 0.270
## age2 -0.164 -0.008
## racehisp -0.398 0.042 0.173
## racenhb -0.278 0.151 0.221 0.374
## raceother -0.045 0.004 0.020 0.128 0.146
## educ -0.934 -0.278 0.096 0.293 0.113 -0.021
## sexfemale 0.035 0.313 -0.079 -0.018 -0.147 0.023 -0.135
sjstats::icc(fit1)
##
## Intraclass Correlation Coefficient for Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: health ~ hhinc + age2 + race + educ + sex + (1 | state)
##
## ICC (state): 0.0098
It appears as if hhinc and years of education are both significant and protective while being nhb is significant and in less health compared to nhwand age2 shows that people become less healthy as they age
#####b)report the variance components of the model
within states is 0.03299
ICC is 0.0098
not much variance between states
Without group level predictor
#####fit4 no group level predictor
fit4 <- glm(health ~ hhinc + age2 + race + educ + sex, family = "binomial",
data = h8)
summary(fit4)
##
## Call:
## glm(formula = health ~ hhinc + age2 + race + educ + sex, family = "binomial",
## data = h8)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8540 -0.6710 -0.5135 -0.3075 3.4216
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.29435 0.16249 -1.812 0.07006 .
## hhinc -0.62638 0.05401 -11.598 < 2e-16 ***
## age2 0.42843 0.02642 16.214 < 2e-16 ***
## racehisp 0.09608 0.09989 0.962 0.33611
## racenhb 0.22697 0.06560 3.460 0.00054 ***
## raceother 0.27873 0.19843 1.405 0.16010
## educ -0.11429 0.01153 -9.916 < 2e-16 ***
## sexfemale 0.11756 0.06270 1.875 0.06080 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8776.4 on 9057 degrees of freedom
## Residual deviance: 7913.4 on 9050 degrees of freedom
## AIC: 7929.4
##
## Number of Fisher Scoring iterations: 5
anova( fit1,fit4, test = "Chisq")
## Data: h8
## Models:
## fit4: health ~ hhinc + age2 + race + educ + sex
## fit1: health ~ hhinc + age2 + race + educ + sex + (1 | state)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit4 8 7929.4 7986.3 -3956.7 7913.4
## fit1 9 7918.4 7982.4 -3950.2 7900.4 12.951 1 0.0003198 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It appears as if the the model with the group level predictor fits the data better as its AIC is 10 lower, however, it does not appear to be statistically significantly so as the Chi Square test shows no significance.
I want to check if controlling for the percentage of race by state fits a better model
#####fit2 RE=states FE=hhinc, age2, race, educ, pnhw, pnhb, pjisp, sex
fit2 <- glmer(health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp + pothr + (1|state), family = "binomial",
data = h8,
control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
optCtrl = list(maxfun = 2e9)))
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
summary(fit2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp +
## pothr + (1 | state)
## Data: h8
## Control:
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
##
## AIC BIC logLik deviance df.resid
## 7923.0 8008.4 -3949.5 7899.0 9046
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3472 -0.5005 -0.3708 -0.2139 18.2610
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.02953 0.1718
## Number of obs: 9058, groups: state, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.36581 0.16831 -2.173 0.029749 *
## hhinc -0.62808 0.05441 -11.543 < 2e-16 ***
## age2 0.43645 0.02669 16.353 < 2e-16 ***
## racehisp 0.14665 0.10891 1.347 0.178124
## racenhb 0.27259 0.07465 3.652 0.000261 ***
## raceother 0.30463 0.20180 1.510 0.131156
## educ -0.11312 0.01162 -9.734 < 2e-16 ***
## sexfemale 0.11371 0.06306 1.803 0.071377 .
## pnhw -0.32868 0.75343 -0.436 0.662655
## pnhb -0.39377 0.82775 -0.476 0.634278
## phisp -0.25403 0.45404 -0.559 0.575834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) hhinc age2 rachsp racnhb racthr educ sexfml pnhw
## hhinc 0.265
## age2 -0.165 -0.008
## racehisp -0.404 0.048 0.170
## racenhb -0.290 0.146 0.206 0.363
## raceother -0.054 0.007 0.021 0.146 0.151
## educ -0.929 -0.278 0.096 0.285 0.103 -0.020
## sexfemale 0.033 0.313 -0.079 -0.015 -0.132 0.024 -0.135
## pnhw -0.034 0.004 0.009 0.014 0.015 0.049 0.014 -0.003
## pnhb -0.028 0.003 0.009 0.008 -0.005 0.046 0.014 -0.004 0.998
## phisp -0.022 0.000 0.007 -0.017 0.004 0.037 0.013 -0.004 0.993
## pnhb
## hhinc
## age2
## racehisp
## racenhb
## raceother
## educ
## sexfemale
## pnhw
## pnhb
## phisp 0.994
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
fit3 <- glmer(health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr + (1|state) + (1|race), family = "binomial",
data = h8,
control = glmerControl(optimizer = c("Nelder_Mead", "bobyqa"),
optCtrl = list(maxfun = 2e9)))
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
summary(fit3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr +
## (1 | state) + (1 | race)
## Data: h8
## Control:
## glmerControl(optimizer = c("Nelder_Mead", "bobyqa"), optCtrl = list(maxfun = 2e+09))
##
## AIC BIC logLik deviance df.resid
## 7926.5 7997.7 -3953.3 7906.5 9048
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2131 -0.5011 -0.3719 -0.2148 18.6204
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.03102 0.1761
## race (Intercept) 0.01080 0.1039
## Number of obs: 9058, groups: state, 48; race, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.21010 0.16390 -1.282 0.1999
## hhinc -0.63370 0.05449 -11.629 <2e-16 ***
## age2 0.43311 0.02666 16.246 <2e-16 ***
## educ -0.11364 0.01143 -9.939 <2e-16 ***
## sexfemale 0.11815 0.06301 1.875 0.0608 .
## pnhw -0.35503 0.76168 -0.466 0.6411
## pnhb -0.40980 0.83673 -0.490 0.6243
## phisp -0.26416 0.45887 -0.576 0.5648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) hhinc age2 educ sexfml pnhw pnhb
## hhinc 0.306
## age2 -0.093 -0.004
## educ -0.878 -0.281 0.087
## sexfemale 0.013 0.312 -0.081 -0.140
## pnhw -0.026 0.005 0.011 0.016 -0.006
## pnhb -0.025 0.004 0.010 0.016 -0.006 0.998
## phisp -0.025 0.001 0.009 0.017 -0.006 0.993 0.994
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
anova( fit.null,fit1, fit2, fit3, fit4, test = "Chisq")
## Data: h8
## Models:
## fit.null: health ~ (1 | state)
## fit4: health ~ hhinc + age2 + race + educ + sex
## fit1: health ~ hhinc + age2 + race + educ + sex + (1 | state)
## fit3: health ~ hhinc + age2 + educ + sex + pnhw + pnhb + phisp + pothr +
## fit3: (1 | state) + (1 | race)
## fit2: health ~ hhinc + age2 + race + educ + sex + pnhw + pnhb + phisp +
## fit2: pothr + (1 | state)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.null 2 8768.6 8782.8 -4382.3 8764.6
## fit4 8 7929.4 7986.3 -3956.7 7913.4 851.2359 6 < 2.2e-16 ***
## fit1 9 7918.4 7982.4 -3950.2 7900.4 12.9508 1 0.0003198 ***
## fit3 10 7926.5 7997.7 -3953.3 7906.5 0.0000 1 1.0000000
## fit2 12 7923.0 8008.4 -3949.5 7899.0 7.5202 2 0.0232810 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
stargazer::stargazer(fit.null, fit1, fit2, fit3, fit4, type = "html", style = "demography")
| health | |||||
| generalized linear | logistic | ||||
| mixed-effects | |||||
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | |
| hhinc | -0.630*** | -0.628*** | -0.634*** | -0.626*** | |
| (0.054) | (0.054) | (0.054) | (0.054) | ||
| age2 | 0.436*** | 0.436*** | 0.433*** | 0.428*** | |
| (0.027) | (0.027) | (0.027) | (0.026) | ||
| racehisp | 0.118 | 0.147 | 0.096 | ||
| (0.106) | (0.109) | (0.100) | |||
| racenhb | 0.258*** | 0.273*** | 0.227*** | ||
| (0.070) | (0.075) | (0.066) | |||
| raceother | 0.288 | 0.305 | 0.279 | ||
| (0.201) | (0.202) | (0.198) | |||
| educ | -0.113*** | -0.113*** | -0.114*** | -0.114*** | |
| (0.012) | (0.012) | (0.011) | (0.012) | ||
| sexfemale | 0.113 | 0.114 | 0.118 | 0.118 | |
| (0.063) | (0.063) | (0.063) | (0.063) | ||
| pnhw | -0.329 | -0.355 | |||
| (0.753) | (0.762) | ||||
| pnhb | -0.394 | -0.410 | |||
| (0.828) | (0.837) | ||||
| phisp | -0.254 | -0.264 | |||
| (0.454) | (0.459) | ||||
| Constant | -1.505*** | -0.350* | -0.366* | -0.210 | -0.294 |
| (0.046) | (0.167) | (0.168) | (0.164) | (0.162) | |
| N | 9,058 | 9,058 | 9,058 | 9,058 | 9,058 |
| Log Likelihood | -4,382.312 | -3,950.218 | -3,949.509 | -3,953.269 | -3,956.694 |
| AIC | 8,768.624 | 7,918.437 | 7,923.017 | 7,926.538 | 7,929.388 |
| BIC | 8,782.846 | 7,982.439 | 8,008.354 | 7,997.652 | |
| p < .05; p < .01; p < .001 | |||||
From the models, we can see that first, health outcomes do vary by state. As for the models, it when looking at the ANOVA, only model 2 has problems, while all the other models appear to fit the data better than the GLM model with no Random Effects.
When looking at the comparison of all the models we can see that all models do better than the NULL and all models with RE do better than the purely GLM model. Of the models that included RE, Model 1 appears to have the best fit, with the lowest AIC (and lowest number of predictors, and thereby most parsimonious). The results of the outcomes do not change model by model, so it would appear that fit1 (aka model 1 fits best)