#load brfss
library(car)
library(stargazer)
library(survey)

library(ggplot2)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

#1) The count outcome is depression. The question asked is "(Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

Do veterans have depression more than non-veterans? This does not need an offset because every person in the survey was askd the sam equestions.

#outcome variable is Depression
brfss_17$Depression<-Recode(brfss_17$addepev2, recodes = "7=NA; 9=NA")
hist(brfss_17$Depression)

summary(brfss_17$Depression)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   2.000   1.804   2.000   2.000    1140
#Veteran status
brfss_17$vetstatus<-Recode(brfss_17$veteran3, recodes="1= 'Veteran'; 2= 'Nonveteran'; else=NA", as.factor = T)
brfss_17$vetstatus<-relevel(brfss_17$vetstatus, ref = "Veteran")
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
#Here I keep complete cases on my key variables,
#just for speed (the survey procedures can run for a long time)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sub<-brfss_17%>%
  select(Depression, vetstatus,
         agec,race_eth, marst, educ,white, black, hispanic,
         other, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#OR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, 
               weights=~mmsawt,
               data = brfss_17[is.na(brfss_17$mmsawt)==F,])
svyhist(~Depression, des)

svyby(~Depression, ~race_eth+educ, des, svymean, na.rm=T)
##                           race_eth     educ Depression          se
## nhwhite.2hsgrad            nhwhite  2hsgrad   1.784797 0.004328881
## hispanic.2hsgrad          hispanic  2hsgrad   1.867142 0.008439934
## nh black.2hsgrad          nh black  2hsgrad   1.844332 0.008613719
## nh multirace.2hsgrad  nh multirace  2hsgrad   1.687759 0.029969545
## nh other.2hsgrad          nh other  2hsgrad   1.871327 0.015959179
## nhwhite.0Prim              nhwhite    0Prim   1.737838 0.023072885
## hispanic.0Prim            hispanic    0Prim   1.867133 0.010537138
## nh black.0Prim            nh black    0Prim   1.786697 0.032510736
## nh multirace.0Prim    nh multirace    0Prim   1.641944 0.091136662
## nh other.0Prim            nh other    0Prim   1.879223 0.040643220
## nhwhite.1somehs            nhwhite  1somehs   1.681027 0.012785141
## hispanic.1somehs          hispanic  1somehs   1.858145 0.011125185
## nh black.1somehs          nh black  1somehs   1.744416 0.020865539
## nh multirace.1somehs  nh multirace  1somehs   1.637382 0.061932560
## nh other.1somehs          nh other  1somehs   1.675660 0.065141753
## nhwhite.3somecol           nhwhite 3somecol   1.769938 0.003978383
## hispanic.3somecol         hispanic 3somecol   1.821423 0.009926594
## nh black.3somecol         nh black 3somecol   1.844672 0.009317801
## nh multirace.3somecol nh multirace 3somecol   1.747794 0.020142887
## nh other.3somecol         nh other 3somecol   1.840616 0.019249206
## nhwhite.4colgrad           nhwhite 4colgrad   1.830927 0.002416563
## hispanic.4colgrad         hispanic 4colgrad   1.877249 0.007322868
## nh black.4colgrad         nh black 4colgrad   1.896054 0.005971260
## nh multirace.4colgrad nh multirace 4colgrad   1.770049 0.022425456
## nh other.4colgrad         nh other 4colgrad   1.907185 0.009541618
svyby(~Depression, ~agec, des, svymean, na.rm=T)
##            agec Depression          se
## (0,24]   (0,24]   1.816360 0.005550975
## (24,39] (24,39]   1.819274 0.003274101
## (39,59] (39,59]   1.808873 0.002946819
## (59,79] (59,79]   1.813606 0.003334247
## (79,99] (79,99]   1.890416 0.006993593
svyby(~Depression, ~vetstatus, des, svymean, na.rm=T)
##             vetstatus Depression          se
## Veteran       Veteran   1.821962 0.005489156
## Nonveteran Nonveteran   1.816495 0.001786697
#Poisson glm fit to survey data
fit1<-svyglm(Depression~factor(race_eth)+factor(educ)+factor(agec), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17[is.na(brfss_17$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.577168   0.003664 157.538  < 2e-16 ***
## factor(race_eth)hispanic      0.049362   0.002970  16.618  < 2e-16 ***
## factor(race_eth)nh black      0.037690   0.002937  12.831  < 2e-16 ***
## factor(race_eth)nh multirace -0.028501   0.007811  -3.649 0.000263 ***
## factor(race_eth)nh other      0.044133   0.004481   9.849  < 2e-16 ***
## factor(educ)0Prim            -0.008152   0.005501  -1.482 0.138370    
## factor(educ)1somehs          -0.038697   0.005071  -7.631 2.34e-14 ***
## factor(educ)3somecol         -0.008130   0.002702  -3.009 0.002620 ** 
## factor(educ)4colgrad          0.026397   0.002280  11.579  < 2e-16 ***
## factor(agec)(24,39]          -0.004870   0.003605  -1.351 0.176669    
## factor(agec)(39,59]          -0.006623   0.003526  -1.878 0.060380 .  
## factor(agec)(59,79]           0.003123   0.003714   0.841 0.400438    
## factor(agec)(79,99]           0.050374   0.005028  10.018  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 0.08173664)
## 
## Number of Fisher Scoring iterations: 4
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
##     factor(race_eth)hispanic     factor(race_eth)nh black 
##                        1.051                        1.038 
## factor(race_eth)nh multirace     factor(race_eth)nh other 
##                        0.972                        1.045 
##            factor(educ)0Prim          factor(educ)1somehs 
##                        0.992                        0.962 
##         factor(educ)3somecol         factor(educ)4colgrad 
##                        0.992                        1.027 
##          factor(agec)(24,39]          factor(agec)(39,59] 
##                        0.995                        0.993 
##          factor(agec)(59,79]          factor(agec)(79,99] 
##                        1.003                        1.052

#2) Consider a Poisson regression model for the outcomea. Evaluate the level of dispersion in the outcomeb. Is the Poisson model a good choice?

fit2<-glm(Depression~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = poisson, data = brfss_17)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.78671   0.08718   0.13098   0.15835   0.28078  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   0.572589   0.006942  82.486  < 2e-16 ***
## factor(race_eth)hispanic      0.035217   0.005721   6.156 7.46e-10 ***
## factor(race_eth)nh black      0.037540   0.005213   7.201 5.96e-13 ***
## factor(race_eth)nh multirace -0.030867   0.012105  -2.550   0.0108 *  
## factor(race_eth)nh other      0.038391   0.007894   4.863 1.15e-06 ***
## factor(educ)0Prim            -0.012931   0.011436  -1.131   0.2582    
## factor(educ)1somehs          -0.041253   0.008582  -4.807 1.54e-06 ***
## factor(educ)3somecol         -0.006808   0.004419  -1.541   0.1234    
## factor(educ)4colgrad          0.026491   0.004059   6.527 6.73e-11 ***
## factor(agec)(24,39]          -0.010687   0.007386  -1.447   0.1479    
## factor(agec)(39,59]          -0.012658   0.006960  -1.819   0.0690 .  
## factor(agec)(59,79]           0.006461   0.006902   0.936   0.3492    
## factor(agec)(79,99]           0.058271   0.008467   6.882 5.90e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22582  on 224369  degrees of freedom
## Residual deviance: 22273  on 224357  degrees of freedom
##   (6505 observations deleted due to missingness)
## AIC: 581669
## 
## Number of Fisher Scoring iterations: 4
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 0.315076
fit3<-glm(Depression~factor(race_eth)+factor(educ)+factor(agec), data=brfss_17, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = Depression ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = quasipoisson, data = brfss_17)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.78671   0.08718   0.13098   0.15835   0.28078  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.572589   0.002044 280.105  < 2e-16 ***
## factor(race_eth)hispanic      0.035217   0.001685  20.904  < 2e-16 ***
## factor(race_eth)nh black      0.037540   0.001535  24.455  < 2e-16 ***
## factor(race_eth)nh multirace -0.030867   0.003565  -8.659  < 2e-16 ***
## factor(race_eth)nh other      0.038391   0.002325  16.515  < 2e-16 ***
## factor(educ)0Prim            -0.012931   0.003368  -3.839 0.000123 ***
## factor(educ)1somehs          -0.041253   0.002527 -16.322  < 2e-16 ***
## factor(educ)3somecol         -0.006808   0.001301  -5.231 1.69e-07 ***
## factor(educ)4colgrad          0.026491   0.001195  22.163  < 2e-16 ***
## factor(agec)(24,39]          -0.010687   0.002175  -4.914 8.95e-07 ***
## factor(agec)(39,59]          -0.012658   0.002050  -6.176 6.60e-10 ***
## factor(agec)(59,79]           0.006461   0.002032   3.179 0.001480 ** 
## factor(agec)(79,99]           0.058271   0.002493  23.370  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.08671977)
## 
##     Null deviance: 22582  on 224369  degrees of freedom
## Residual deviance: 22273  on 224357  degrees of freedom
##   (6505 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

#3) Consider a Negative binomial model

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.6.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.6.3
fit.nb1<-glm.nb(Depression~factor(race_eth),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb2<-glm.nb(Depression~factor(race_eth)+factor(educ)+factor(agec)+factor(vetstatus),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in glm.nb(Depression ~ factor(race_eth) + factor(educ) + factor(agec)
## + : alternation limit reached
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)

stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )
## 
## --------------------------------------------------------------------------------
##                                                  Depression                     
##                                       Model 1                   Model 2         
## --------------------------------------------------------------------------------
## factor(race_eth)hispanic             0.036***                  0.049***         
##                                       (0.003)                   (0.003)         
## factor(race_eth)nh black             0.030***                  0.038***         
##                                       (0.003)                   (0.003)         
## factor(race_eth)nh multirace         -0.035***                 -0.029***        
##                                       (0.008)                   (0.008)         
## factor(race_eth)nh other             0.047***                  0.045***         
##                                       (0.004)                   (0.004)         
## factor(educ)0Prim                                               -0.008          
##                                                                 (0.006)         
## factor(educ)1somehs                                            -0.037***        
##                                                                 (0.005)         
## factor(educ)3somecol                                           -0.008**         
##                                                                 (0.003)         
## factor(educ)4colgrad                                           0.027***         
##                                                                 (0.002)         
## factor(agec)(24,39]                                             -0.004          
##                                                                 (0.004)         
## factor(agec)(39,59]                                             -0.006          
##                                                                 (0.004)         
## factor(agec)(59,79]                                              0.004          
##                                                                 (0.004)         
## factor(agec)(79,99]                                            0.051***         
##                                                                 (0.005)         
## factor(vetstatus)Nonveteran                                     -0.004          
##                                                                 (0.003)         
## Constant                             0.582***                  0.579***         
##                                       (0.005)                   (0.005)         
## N                                     222,225                   222,225         
## Log Likelihood                     -288,399.600              -288,313.200       
## theta                        381,536.400 (210,793.400) 382,303.500 (211,324.300)
## AIC                                 576,809.200               576,654.300       
## --------------------------------------------------------------------------------
## *p < .05; **p < .01; ***p < .001

#4) Comparing the models, the AIC of the negative binomial model is lower which suggest that it is the better fit.

More veterans have depression than nonveterans.