Load libraries

library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)

1. Define a count outcome for the dataset of your choosing

My outcome for this assignment is the number of days the respondent reported poor mental health in the past month:

Question: Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?

1.a. State a research question about your outcome

Research Question: Does age, race/ethnicity, smoking and educational level affect mental health outcomes?

1.b. Is an offset term necessary? why or why not?

Yes, because this will allow the model to represent rates rather than counts. For example, since that the count variable of interest is the number of days respondent poor mental health for varying amount of days (30 days), it means that we would expect to observe larger numbers of respondents to experience poor mental health for longer periods of time. In this case, it may be more appropriate to model the rate of respondents experiencing poor mental health per unit per day rather than the number of respondents that experienced poor mental health.

Load brfss data

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

brfss_17$healthdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
hist(brfss_17$healthdays)

summary(brfss_17$healthdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    3.54    2.00   30.00    3532

Other variables:

#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

2. Consider a Poisson regression model for the outcome

Analysis

First, we will subset our data to have complete cases for our variables in our model and make our survey design object

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sub<-brfss_17%>%
  select(healthdays, mmsaname, bmi,
         agec,race_eth, marst, educ,white, black, hispanic,
         other, smoke, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))


#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, 
               weights=~mmsawt,
               data = brfss_17[is.na(brfss_17$mmsawt)==F,])

Poisson regression

To fit a Poisson GLM to survey data in R, we use the svyglm function in the survey library.

#First I do some simple descriptives
svyhist(~healthdays, des)

svyby(~healthdays, ~race_eth+educ, des, svymean, na.rm=T)
##                           race_eth     educ healthdays         se
## nhwhite.2hsgrad            nhwhite  2hsgrad   4.441731 0.09198523
## hispanic.2hsgrad          hispanic  2hsgrad   3.649486 0.18958833
## nh black.2hsgrad          nh black  2hsgrad   4.341862 0.18131078
## nh multirace.2hsgrad  nh multirace  2hsgrad   7.964629 0.71215383
## nh other.2hsgrad          nh other  2hsgrad   3.523677 0.36777682
## nhwhite.0Prim              nhwhite    0Prim   5.559214 0.52992501
## hispanic.0Prim            hispanic    0Prim   3.581327 0.27627739
## nh black.0Prim            nh black    0Prim   4.641120 0.76283362
## nh multirace.0Prim    nh multirace    0Prim   8.125647 2.40950214
## nh other.0Prim            nh other    0Prim   5.520753 1.68261646
## nhwhite.1somehs            nhwhite  1somehs   6.635648 0.29983912
## hispanic.1somehs          hispanic  1somehs   3.735959 0.27038025
## nh black.1somehs          nh black  1somehs   5.940522 0.42093247
## nh multirace.1somehs  nh multirace  1somehs   6.874169 1.20156885
## nh other.1somehs          nh other  1somehs   7.462211 1.44358241
## nhwhite.3somecol           nhwhite 3somecol   4.334890 0.08096589
## hispanic.3somecol         hispanic 3somecol   4.147646 0.20461440
## nh black.3somecol         nh black 3somecol   3.805794 0.17095197
## nh multirace.3somecol nh multirace 3somecol   5.664337 0.45706397
## nh other.3somecol         nh other 3somecol   4.088507 0.40320647
## nhwhite.4colgrad           nhwhite 4colgrad   2.732519 0.04206487
## hispanic.4colgrad         hispanic 4colgrad   3.089244 0.14245709
## nh black.4colgrad         nh black 4colgrad   2.660009 0.12743373
## nh multirace.4colgrad nh multirace 4colgrad   4.652389 0.54388564
## nh other.4colgrad         nh other 4colgrad   2.174086 0.14773551
svyby(~healthdays, ~agec, des, svymean, na.rm=T)
##            agec healthdays         se
## (0,24]   (0,24]   5.272664 0.12444480
## (24,39] (24,39]   4.169103 0.06953472
## (39,59] (39,59]   3.883674 0.06395369
## (59,79] (59,79]   2.959103 0.06155984
## (79,99] (79,99]   2.270664 0.13385754
#Poisson glm fit to survey data
fit1<-svyglm(healthdays~factor(race_eth)+factor(educ)+factor(agec), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17[is.na(brfss_17$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.79212    0.02885  62.127  < 2e-16 ***
## factor(race_eth)hispanic     -0.29875    0.03143  -9.505  < 2e-16 ***
## factor(race_eth)nh black     -0.11262    0.02733  -4.121 3.77e-05 ***
## factor(race_eth)nh multirace  0.32195    0.05188   6.205 5.47e-10 ***
## factor(race_eth)nh other     -0.25077    0.05150  -4.869 1.12e-06 ***
## factor(educ)0Prim             0.15101    0.06331   2.385   0.0171 *  
## factor(educ)1somehs           0.29481    0.03882   7.595 3.09e-14 ***
## factor(educ)3somecol         -0.02575    0.02382  -1.081   0.2797    
## factor(educ)4colgrad         -0.46001    0.02278 -20.197  < 2e-16 ***
## factor(agec)(24,39]          -0.14171    0.02927  -4.841 1.29e-06 ***
## factor(agec)(39,59]          -0.24552    0.02887  -8.504  < 2e-16 ***
## factor(agec)(59,79]          -0.56190    0.03235 -17.370  < 2e-16 ***
## factor(agec)(79,99]          -0.88684    0.06465 -13.718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 16.32728)
## 
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
##     factor(race_eth)hispanic     factor(race_eth)nh black 
##                        0.742                        0.893 
## factor(race_eth)nh multirace     factor(race_eth)nh other 
##                        1.380                        0.778 
##            factor(educ)0Prim          factor(educ)1somehs 
##                        1.163                        1.343 
##         factor(educ)3somecol         factor(educ)4colgrad 
##                        0.975                        0.631 
##          factor(agec)(24,39]          factor(agec)(39,59] 
##                        0.868                        0.782 
##          factor(agec)(59,79]          factor(agec)(79,99] 
##                        0.570                        0.412

The result shows that NH Multirace respondents had higher mean counts of poor mental health days than NH Whites, while hispanics had a lower mean count of poor mental health days in exception of respondents with some college completion. As education increases, the the mean count of poor mental health days decreases. On the other hand, as age increases, the mean count of poor health decreases.

2.1. Evaluate the level of dispersion in the outcome

An easy check on this is to compare the residual deviance to the residual degrees of freedom. They ratio should be 1 if the model fits the data.

fit2<-glm(healthdays~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = poisson, data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8185  -2.5729  -2.1278  -0.6355  11.6925  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   1.779975   0.004132  430.80   <2e-16 ***
## factor(race_eth)hispanic     -0.142360   0.003955  -35.99   <2e-16 ***
## factor(race_eth)nh black     -0.042015   0.003685  -11.40   <2e-16 ***
## factor(race_eth)nh multirace  0.350208   0.006776   51.68   <2e-16 ***
## factor(race_eth)nh other     -0.092791   0.005791  -16.02   <2e-16 ***
## factor(educ)0Prim             0.178867   0.007458   23.98   <2e-16 ***
## factor(educ)1somehs           0.321579   0.004947   65.00   <2e-16 ***
## factor(educ)3somecol         -0.047151   0.002942  -16.03   <2e-16 ***
## factor(educ)4colgrad         -0.476330   0.002970 -160.38   <2e-16 ***
## factor(agec)(24,39]          -0.098742   0.004426  -22.31   <2e-16 ***
## factor(agec)(39,59]          -0.192433   0.004160  -46.26   <2e-16 ***
## factor(agec)(59,79]          -0.583089   0.004284 -136.10   <2e-16 ***
## factor(agec)(79,99]          -0.999473   0.006685 -149.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2425055  on 222096  degrees of freedom
## Residual deviance: 2324182  on 222084  degrees of freedom
##   (8778 observations deleted due to missingness)
## AIC: 2595781
## 
## Number of Fisher Scoring iterations: 7
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 3.235016
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0

The Dispersion parameter for poisson family was 16.32728 which is overdispersed since its suppose to equal 1.Therefore, it is an evidence that the model is not a good fit.

##. 2.b. Is the Poisson model a good choice?

The test of model fit, using a chi-square distribution, with degrees of freedom equal to the residual produced a result of 0. So, this p value is 0, which means the model does not fit the data. That means the poisson model is not a good choice for this model.

3. Consider a Negative binomial model

Fitting the Negative Binomial GLM

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.nb1<-glm.nb(healthdays~factor(race_eth)+factor(educ)+factor(agec),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))
summary(fit.nb1)
## 
## Call:
## glm.nb(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), data = sub, weights = mmsawt/mean(mmsawt, na.rm = T), 
##     init.theta = 0.1455502131, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.3581  -0.7510  -0.3927  -0.0611   9.0452  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   1.82172    0.02041  89.251  < 2e-16 ***
## factor(race_eth)hispanic     -0.18992    0.01761 -10.783  < 2e-16 ***
## factor(race_eth)nh black     -0.05977    0.01826  -3.273  0.00106 ** 
## factor(race_eth)nh multirace  0.29235    0.05130   5.699 1.21e-08 ***
## factor(race_eth)nh other     -0.25554    0.02368 -10.789  < 2e-16 ***
## factor(educ)0Prim             0.21989    0.03359   6.547 5.89e-11 ***
## factor(educ)1somehs           0.26405    0.02525  10.458  < 2e-16 ***
## factor(educ)3somecol         -0.04284    0.01599  -2.679  0.00738 ** 
## factor(educ)4colgrad         -0.47759    0.01641 -29.101  < 2e-16 ***
## factor(agec)(24,39]          -0.14116    0.02107  -6.700 2.08e-11 ***
## factor(agec)(39,59]          -0.28785    0.02033 -14.160  < 2e-16 ***
## factor(agec)(59,79]          -0.59776    0.02156 -27.730  < 2e-16 ***
## factor(agec)(79,99]          -0.94428    0.03474 -27.183  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1456) family taken to be 1)
## 
##     Null deviance: 147894  on 198380  degrees of freedom
## Residual deviance: 144866  on 198368  degrees of freedom
## AIC: 746958
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.145550 
##           Std. Err.:  0.000676 
## 
##  2 x log-likelihood:  -746930.390000

4. Compare the model fits of the alternative models using AIC

The AIC of 746,958 of the alternative model is lower than the AIC of the Poisson model, Therefore, the alternative (Negative binomial model) is a good fit for this model.