library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
My outcome for this assignment is the number of days the respondent reported poor mental health in the past month:
Question: Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
Research Question: Does age, race/ethnicity, smoking and educational level affect mental health outcomes?
Yes, because this will allow the model to represent rates rather than counts. For example, since that the count variable of interest is the number of days respondent poor mental health for varying amount of days (30 days), it means that we would expect to observe larger numbers of respondents to experience poor mental health for longer periods of time. In this case, it may be more appropriate to model the rate of respondents experiencing poor mental health per unit per day rather than the number of respondents that experienced poor mental health.
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
brfss_17$healthdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
hist(brfss_17$healthdays)
summary(brfss_17$healthdays)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 3.54 2.00 30.00 3532
Other variables:
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA",
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
First, we will subset our data to have complete cases for our variables in our model and make our survey design object
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sub<-brfss_17%>%
select(healthdays, mmsaname, bmi,
agec,race_eth, marst, educ,white, black, hispanic,
other, smoke, ins, mmsawt, ststr) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr,
weights=~mmsawt,
data = brfss_17[is.na(brfss_17$mmsawt)==F,])
To fit a Poisson GLM to survey data in R, we use the svyglm function in the survey library.
#First I do some simple descriptives
svyhist(~healthdays, des)
svyby(~healthdays, ~race_eth+educ, des, svymean, na.rm=T)
## race_eth educ healthdays se
## nhwhite.2hsgrad nhwhite 2hsgrad 4.441731 0.09198523
## hispanic.2hsgrad hispanic 2hsgrad 3.649486 0.18958833
## nh black.2hsgrad nh black 2hsgrad 4.341862 0.18131078
## nh multirace.2hsgrad nh multirace 2hsgrad 7.964629 0.71215383
## nh other.2hsgrad nh other 2hsgrad 3.523677 0.36777682
## nhwhite.0Prim nhwhite 0Prim 5.559214 0.52992501
## hispanic.0Prim hispanic 0Prim 3.581327 0.27627739
## nh black.0Prim nh black 0Prim 4.641120 0.76283362
## nh multirace.0Prim nh multirace 0Prim 8.125647 2.40950214
## nh other.0Prim nh other 0Prim 5.520753 1.68261646
## nhwhite.1somehs nhwhite 1somehs 6.635648 0.29983912
## hispanic.1somehs hispanic 1somehs 3.735959 0.27038025
## nh black.1somehs nh black 1somehs 5.940522 0.42093247
## nh multirace.1somehs nh multirace 1somehs 6.874169 1.20156885
## nh other.1somehs nh other 1somehs 7.462211 1.44358241
## nhwhite.3somecol nhwhite 3somecol 4.334890 0.08096589
## hispanic.3somecol hispanic 3somecol 4.147646 0.20461440
## nh black.3somecol nh black 3somecol 3.805794 0.17095197
## nh multirace.3somecol nh multirace 3somecol 5.664337 0.45706397
## nh other.3somecol nh other 3somecol 4.088507 0.40320647
## nhwhite.4colgrad nhwhite 4colgrad 2.732519 0.04206487
## hispanic.4colgrad hispanic 4colgrad 3.089244 0.14245709
## nh black.4colgrad nh black 4colgrad 2.660009 0.12743373
## nh multirace.4colgrad nh multirace 4colgrad 4.652389 0.54388564
## nh other.4colgrad nh other 4colgrad 2.174086 0.14773551
svyby(~healthdays, ~agec, des, svymean, na.rm=T)
## agec healthdays se
## (0,24] (0,24] 5.272664 0.12444480
## (24,39] (24,39] 4.169103 0.06953472
## (39,59] (39,59] 3.883674 0.06395369
## (59,79] (59,79] 2.959103 0.06155984
## (79,99] (79,99] 2.270664 0.13385754
#Poisson glm fit to survey data
fit1<-svyglm(healthdays~factor(race_eth)+factor(educ)+factor(agec), design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = healthdays ~ factor(race_eth) + factor(educ) +
## factor(agec), design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17[is.na(brfss_17$mmsawt) ==
## F, ])
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.79212 0.02885 62.127 < 2e-16 ***
## factor(race_eth)hispanic -0.29875 0.03143 -9.505 < 2e-16 ***
## factor(race_eth)nh black -0.11262 0.02733 -4.121 3.77e-05 ***
## factor(race_eth)nh multirace 0.32195 0.05188 6.205 5.47e-10 ***
## factor(race_eth)nh other -0.25077 0.05150 -4.869 1.12e-06 ***
## factor(educ)0Prim 0.15101 0.06331 2.385 0.0171 *
## factor(educ)1somehs 0.29481 0.03882 7.595 3.09e-14 ***
## factor(educ)3somecol -0.02575 0.02382 -1.081 0.2797
## factor(educ)4colgrad -0.46001 0.02278 -20.197 < 2e-16 ***
## factor(agec)(24,39] -0.14171 0.02927 -4.841 1.29e-06 ***
## factor(agec)(39,59] -0.24552 0.02887 -8.504 < 2e-16 ***
## factor(agec)(59,79] -0.56190 0.03235 -17.370 < 2e-16 ***
## factor(agec)(79,99] -0.88684 0.06465 -13.718 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 16.32728)
##
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## factor(race_eth)hispanic factor(race_eth)nh black
## 0.742 0.893
## factor(race_eth)nh multirace factor(race_eth)nh other
## 1.380 0.778
## factor(educ)0Prim factor(educ)1somehs
## 1.163 1.343
## factor(educ)3somecol factor(educ)4colgrad
## 0.975 0.631
## factor(agec)(24,39] factor(agec)(39,59]
## 0.868 0.782
## factor(agec)(59,79] factor(agec)(79,99]
## 0.570 0.412
The result shows that NH Multirace respondents had higher mean counts of poor mental health days than NH Whites, while hispanics had a lower mean count of poor mental health days in exception of respondents with some college completion. As education increases, the the mean count of poor mental health days decreases. On the other hand, as age increases, the mean count of poor health decreases.
An easy check on this is to compare the residual deviance to the residual degrees of freedom. They ratio should be 1 if the model fits the data.
fit2<-glm(healthdays~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)
##
## Call:
## glm(formula = healthdays ~ factor(race_eth) + factor(educ) +
## factor(agec), family = poisson, data = brfss_17)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8185 -2.5729 -2.1278 -0.6355 11.6925
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.779975 0.004132 430.80 <2e-16 ***
## factor(race_eth)hispanic -0.142360 0.003955 -35.99 <2e-16 ***
## factor(race_eth)nh black -0.042015 0.003685 -11.40 <2e-16 ***
## factor(race_eth)nh multirace 0.350208 0.006776 51.68 <2e-16 ***
## factor(race_eth)nh other -0.092791 0.005791 -16.02 <2e-16 ***
## factor(educ)0Prim 0.178867 0.007458 23.98 <2e-16 ***
## factor(educ)1somehs 0.321579 0.004947 65.00 <2e-16 ***
## factor(educ)3somecol -0.047151 0.002942 -16.03 <2e-16 ***
## factor(educ)4colgrad -0.476330 0.002970 -160.38 <2e-16 ***
## factor(agec)(24,39] -0.098742 0.004426 -22.31 <2e-16 ***
## factor(agec)(39,59] -0.192433 0.004160 -46.26 <2e-16 ***
## factor(agec)(59,79] -0.583089 0.004284 -136.10 <2e-16 ***
## factor(agec)(79,99] -0.999473 0.006685 -149.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2425055 on 222096 degrees of freedom
## Residual deviance: 2324182 on 222084 degrees of freedom
## (8778 observations deleted due to missingness)
## AIC: 2595781
##
## Number of Fisher Scoring iterations: 7
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 3.235016
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
The Dispersion parameter for poisson family was 16.32728 which is overdispersed since its suppose to equal 1.Therefore, it is an evidence that the model is not a good fit.
##. 2.b. Is the Poisson model a good choice?
The test of model fit, using a chi-square distribution, with degrees of freedom equal to the residual produced a result of 0. So, this p value is 0, which means the model does not fit the data. That means the poisson model is not a good choice for this model.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit.nb1<-glm.nb(healthdays~factor(race_eth)+factor(educ)+factor(agec),
data=sub,
weights=mmsawt/mean(mmsawt, na.rm=T))
summary(fit.nb1)
##
## Call:
## glm.nb(formula = healthdays ~ factor(race_eth) + factor(educ) +
## factor(agec), data = sub, weights = mmsawt/mean(mmsawt, na.rm = T),
## init.theta = 0.1455502131, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.3581 -0.7510 -0.3927 -0.0611 9.0452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.82172 0.02041 89.251 < 2e-16 ***
## factor(race_eth)hispanic -0.18992 0.01761 -10.783 < 2e-16 ***
## factor(race_eth)nh black -0.05977 0.01826 -3.273 0.00106 **
## factor(race_eth)nh multirace 0.29235 0.05130 5.699 1.21e-08 ***
## factor(race_eth)nh other -0.25554 0.02368 -10.789 < 2e-16 ***
## factor(educ)0Prim 0.21989 0.03359 6.547 5.89e-11 ***
## factor(educ)1somehs 0.26405 0.02525 10.458 < 2e-16 ***
## factor(educ)3somecol -0.04284 0.01599 -2.679 0.00738 **
## factor(educ)4colgrad -0.47759 0.01641 -29.101 < 2e-16 ***
## factor(agec)(24,39] -0.14116 0.02107 -6.700 2.08e-11 ***
## factor(agec)(39,59] -0.28785 0.02033 -14.160 < 2e-16 ***
## factor(agec)(59,79] -0.59776 0.02156 -27.730 < 2e-16 ***
## factor(agec)(79,99] -0.94428 0.03474 -27.183 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.1456) family taken to be 1)
##
## Null deviance: 147894 on 198380 degrees of freedom
## Residual deviance: 144866 on 198368 degrees of freedom
## AIC: 746958
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.145550
## Std. Err.: 0.000676
##
## 2 x log-likelihood: -746930.390000
The AIC of 746,958 of the alternative model is lower than the AIC of the Poisson model, Therefore, the alternative (Negative binomial model) is a good fit for this model.