HW Count Data Models

#load brfss
library(car)
library(stargazer)
library(survey)

library(ggplot2)

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

#1) The count outcome is depression. The question asked is "(Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

Do veterans have depression more than non-veterans? This does not need an offset because every person in the survey was askd the sam equestions.

#outcome variable is Depression
brfss_17$Depression<-Recode(brfss_17$addepev2, recodes = "7=NA; 9=NA")
hist(brfss_17$Depression)

summary(brfss_17$Depression)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   2.000   1.804   2.000   2.000    1140

#Veteran status
brfss_17$vetstatus<-Recode(brfss_17$veteran3, recodes="1= 'Veteran'; 2= 'Nonveteran'; else=NA", as.factor = T)
brfss_17$vetstatus<-relevel(brfss_17$vetstatus, ref = "Veteran")

#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

#Here I keep complete cases on my key variables,
#just for speed (the survey procedures can run for a long time)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

sub<-brfss_17%>%
  select(Depression, vetstatus,
         agec,race_eth, marst, educ,white, black, hispanic,
         other, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#OR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, 
               weights=~mmsawt,
               data = brfss_17[is.na(brfss_17$mmsawt)==F,])

svyhist(~Depression, des)

svyby(~Depression, ~race_eth+educ, des, svymean, na.rm=T)

##                           race_eth     educ Depression          se
## nhwhite.2hsgrad            nhwhite  2hsgrad   1.784797 0.004328881
## hispanic.2hsgrad          hispanic  2hsgrad   1.867142 0.008439934
## nh black.2hsgrad          nh black  2hsgrad   1.844332 0.008613719
## nh multirace.2hsgrad  nh multirace  2hsgrad   1.687759 0.029969545
## nh other.2hsgrad          nh other  2hsgrad   1.871327 0.015959179
## nhwhite.0Prim              nhwhite    0Prim   1.737838 0.023072885
## hispanic.0Prim            hispanic    0Prim   1.867133 0.010537138
## nh black.0Prim            nh black    0Prim   1.786697 0.032510736
## nh multirace.0Prim    nh multirace    0Prim   1.641944 0.091136662
## nh other.0Prim            nh other    0Prim   1.879223 0.040643220
## nhwhite.1somehs            nhwhite  1somehs   1.681027 0.012785141
## hispanic.1somehs          hispanic  1somehs   1.858145 0.011125185
## nh black.1somehs          nh black  1somehs   1.744416 0.020865539
## nh multirace.1somehs  nh multirace  1somehs   1.637382 0.061932560
## nh other.1somehs          nh other  1somehs   1.675660 0.065141753
## nhwhite.3somecol           nhwhite 3somecol   1.769938 0.003978383
## hispanic.3somecol         hispanic 3somecol   1.821423 0.009926594
## nh black.3somecol         nh black 3somecol   1.844672 0.009317801
## nh multirace.3somecol nh multirace 3somecol   1.747794 0.020142887
## nh other.3somecol         nh other 3somecol   1.840616 0.019249206
## nhwhite.4colgrad           nhwhite 4colgrad   1.830927 0.002416563
## hispanic.4colgrad         hispanic 4colgrad   1.877249 0.007322868
## nh black.4colgrad         nh black 4colgrad   1.896054 0.005971260
## nh multirace.4colgrad nh multirace 4colgrad   1.770049 0.022425456
## nh other.4colgrad         nh other 4colgrad   1.907185 0.009541618

svyby(~Depression, ~agec, des, svymean, na.rm=T)

##            agec Depression          se
## (0,24]   (0,24]   1.816360 0.005550975
## (24,39] (24,39]   1.819274 0.003274101
## (39,59] (39,59]   1.808873 0.002946819
## (59,79] (59,79]   1.813606 0.003334247
## (79,99] (79,99]   1.890416 0.006993593

svyby(~Depression, ~vetstatus, des, svymean, na.rm=T)

##             vetstatus Depression          se
## Veteran       Veteran   1.821962 0.005489156
## Nonveteran Nonveteran   1.816495 0.001786697

#Poisson glm fit to survey data
fit1<-svyglm(Depression~factor(race_eth)+factor(educ)+factor(agec), design=des, family=poisson)
summary(fit1)

## 
## Call:
## svyglm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17[is.na(brfss_17$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.577168   0.003664 157.538  < 2e-16 ***
## factor(race_eth)hispanic      0.049362   0.002970  16.618  < 2e-16 ***
## factor(race_eth)nh black      0.037690   0.002937  12.831  < 2e-16 ***
## factor(race_eth)nh multirace -0.028501   0.007811  -3.649 0.000263 ***
## factor(race_eth)nh other      0.044133   0.004481   9.849  < 2e-16 ***
## factor(educ)0Prim            -0.008152   0.005501  -1.482 0.138370    
## factor(educ)1somehs          -0.038697   0.005071  -7.631 2.34e-14 ***
## factor(educ)3somecol         -0.008130   0.002702  -3.009 0.002620 ** 
## factor(educ)4colgrad          0.026397   0.002280  11.579  < 2e-16 ***
## factor(agec)(24,39]          -0.004870   0.003605  -1.351 0.176669    
## factor(agec)(39,59]          -0.006623   0.003526  -1.878 0.060380 .  
## factor(agec)(59,79]           0.003123   0.003714   0.841 0.400438    
## factor(agec)(79,99]           0.050374   0.005028  10.018  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 0.08173664)
## 
## Number of Fisher Scoring iterations: 4

#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)

##     factor(race_eth)hispanic     factor(race_eth)nh black 
##                        1.051                        1.038 
## factor(race_eth)nh multirace     factor(race_eth)nh other 
##                        0.972                        1.045 
##            factor(educ)0Prim          factor(educ)1somehs 
##                        0.992                        0.962 
##         factor(educ)3somecol         factor(educ)4colgrad 
##                        0.992                        1.027 
##          factor(agec)(24,39]          factor(agec)(39,59] 
##                        0.995                        0.993 
##          factor(agec)(59,79]          factor(agec)(79,99] 
##                        1.003                        1.052

#2) Consider a Poisson regression model for the outcomea. Evaluate the level of dispersion in the outcomeb. Is the Poisson model a good choice?

fit2<-glm(Depression~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)

## 
## Call:
## glm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = poisson, data = brfss_17)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.78671   0.08718   0.13098   0.15835   0.28078  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   0.572589   0.006942  82.486  < 2e-16 ***
## factor(race_eth)hispanic      0.035217   0.005721   6.156 7.46e-10 ***
## factor(race_eth)nh black      0.037540   0.005213   7.201 5.96e-13 ***
## factor(race_eth)nh multirace -0.030867   0.012105  -2.550   0.0108 *  
## factor(race_eth)nh other      0.038391   0.007894   4.863 1.15e-06 ***
## factor(educ)0Prim            -0.012931   0.011436  -1.131   0.2582    
## factor(educ)1somehs          -0.041253   0.008582  -4.807 1.54e-06 ***
## factor(educ)3somecol         -0.006808   0.004419  -1.541   0.1234    
## factor(educ)4colgrad          0.026491   0.004059   6.527 6.73e-11 ***
## factor(agec)(24,39]          -0.010687   0.007386  -1.447   0.1479    
## factor(agec)(39,59]          -0.012658   0.006960  -1.819   0.0690 .  
## factor(agec)(59,79]           0.006461   0.006902   0.936   0.3492    
## factor(agec)(79,99]           0.058271   0.008467   6.882 5.90e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22582  on 224369  degrees of freedom
## Residual deviance: 22273  on 224357  degrees of freedom
##   (6505 observations deleted due to missingness)
## AIC: 581669
## 
## Number of Fisher Scoring iterations: 4

scale<-sqrt(fit2$deviance/fit2$df.residual)
scale

## [1] 0.315076

fit3<-glm(Depression~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=quasipoisson)
summary(fit3)

## 
## Call:
## glm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = quasipoisson, data = brfss_17)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.78671   0.08718   0.13098   0.15835   0.28078  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.572589   0.002044 280.105  < 2e-16 ***
## factor(race_eth)hispanic      0.035217   0.001685  20.904  < 2e-16 ***
## factor(race_eth)nh black      0.037540   0.001535  24.455  < 2e-16 ***
## factor(race_eth)nh multirace -0.030867   0.003565  -8.659  < 2e-16 ***
## factor(race_eth)nh other      0.038391   0.002325  16.515  < 2e-16 ***
## factor(educ)0Prim            -0.012931   0.003368  -3.839 0.000123 ***
## factor(educ)1somehs          -0.041253   0.002527 -16.322  < 2e-16 ***
## factor(educ)3somecol         -0.006808   0.001301  -5.231 1.69e-07 ***
## factor(educ)4colgrad          0.026491   0.001195  22.163  < 2e-16 ***
## factor(agec)(24,39]          -0.010687   0.002175  -4.914 8.95e-07 ***
## factor(agec)(39,59]          -0.012658   0.002050  -6.176 6.60e-10 ***
## factor(agec)(59,79]           0.006461   0.002032   3.179 0.001480 ** 
## factor(agec)(79,99]           0.058271   0.002493  23.370  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.08671977)
## 
##     Null deviance: 22582  on 224369  degrees of freedom
## Residual deviance: 22273  on 224357  degrees of freedom
##   (6505 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

#3) Consider a Negative binomial model

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(lmtest)

## Warning: package 'lmtest' was built under R version 3.6.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.6.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(sandwich)

## Warning: package 'sandwich' was built under R version 3.6.3

fit.nb1<-glm.nb(Depression~factor(race_eth),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

fit.nb2<-glm.nb(Depression~factor(race_eth)+factor(educ)+factor(agec)+factor(vetstatus),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in glm.nb(Depression ~ factor(race_eth) + factor(educ) + factor(agec)
## + : alternation limit reached

#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)

stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )

## 
## --------------------------------------------------------------------------------
##                                                  Depression                     
##                                       Model 1                   Model 2         
## --------------------------------------------------------------------------------
## factor(race_eth)hispanic             0.036***                  0.049***         
##                                       (0.003)                   (0.003)         
## factor(race_eth)nh black             0.030***                  0.038***         
##                                       (0.003)                   (0.003)         
## factor(race_eth)nh multirace         -0.035***                 -0.029***        
##                                       (0.008)                   (0.008)         
## factor(race_eth)nh other             0.047***                  0.045***         
##                                       (0.004)                   (0.004)         
## factor(educ)0Prim                                               -0.008          
##                                                                 (0.006)         
## factor(educ)1somehs                                            -0.037***        
##                                                                 (0.005)         
## factor(educ)3somecol                                           -0.008**         
##                                                                 (0.003)         
## factor(educ)4colgrad                                           0.027***         
##                                                                 (0.002)         
## factor(agec)(24,39]                                             -0.004          
##                                                                 (0.004)         
## factor(agec)(39,59]                                             -0.006          
##                                                                 (0.004)         
## factor(agec)(59,79]                                              0.004          
##                                                                 (0.004)         
## factor(agec)(79,99]                                            0.051***         
##                                                                 (0.005)         
## factor(vetstatus)Nonveteran                                     -0.004          
##                                                                 (0.003)         
## Constant                             0.582***                  0.579***         
##                                       (0.005)                   (0.005)         
## N                                     222,225                   222,225         
## Log Likelihood                     -288,399.600              -288,313.200       
## theta                        381,536.400 (210,793.400) 382,303.500 (211,324.300)
## AIC                                 576,809.200               576,654.300       
## --------------------------------------------------------------------------------
## *p < .05; **p < .01; ***p < .001

#4) Comparing the models, the AIC of the negative binomial model is lower which suggest that it is the better fit.

More veterans have depression than nonveterans.

HW Count Data Models

Candace Jennings

4/5/2020