#Data Prep
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
##Get TX Cities
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
brfss_17$menthlth<-recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
#1. Define a count outcome for the dataset of your choosing
# a. State a research question about your outcome
#b. Is an offset term necessary? why or why not?
##Outcome: Mental Health - Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
##Question: Do men or women with insurance and at particular age ranges have more or less "not good" mental health days?
##No, an offset term is not necessary
#2) Consider a Poisson regression model for the outcome
#a. Evaluate the level of dispersion in the outcome
##So, I compared the residual deviance to the residual degrees of freedom and my result was 3.252148. According to your lecture, the ratio should be 1 if the data fits because the the mean and the variance are functions of one another.
#b. Is the Poisson model a good choice?
## Because my result was not a 1, the Poisson is not a good model choice for my data. (I also ran a chi square test and the result was 0).
sub<-brfss_17%>%
select(menthlth, sex, ins, agec, mmsawt, ststr) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#First I do some simple descriptives
svyhist(~menthlth, des)

svyby(~menthlth, ~sex+ins+agec, des, svymean, na.rm=T)
## sex ins agec menthlth se
## Female.0.(0,24] Female 0 (0,24] 3.939403268 1.981056418
## Male.0.(0,24] Male 0 (0,24] 6.072709386 1.584050333
## Female.1.(0,24] Female 1 (0,24] 4.667179270 0.907141173
## Male.1.(0,24] Male 1 (0,24] 2.570133595 0.502195518
## Female.0.(24,39] Female 0 (24,39] 5.231458851 1.032839628
## Male.0.(24,39] Male 0 (24,39] 2.622008209 0.466837089
## Female.1.(24,39] Female 1 (24,39] 4.971400214 0.552072619
## Male.1.(24,39] Male 1 (24,39] 2.855081565 0.520631693
## Female.0.(39,59] Female 0 (39,59] 4.171160086 0.751796249
## Male.0.(39,59] Male 0 (39,59] 3.224623402 1.228378557
## Female.1.(39,59] Female 1 (39,59] 5.559298422 0.706171516
## Male.1.(39,59] Male 1 (39,59] 2.304955880 0.331595245
## Female.0.(59,79] Female 0 (59,79] 4.828986899 2.237288861
## Male.0.(59,79] Male 0 (59,79] 0.683075231 0.361273931
## Female.1.(59,79] Female 1 (59,79] 3.218155057 0.468024114
## Male.1.(59,79] Male 1 (59,79] 3.165727627 0.563740504
## Female.0.(79,99] Female 0 (79,99] 9.506009579 6.121732633
## Male.0.(79,99] Male 0 (79,99] 0.002207588 0.002915385
## Female.1.(79,99] Female 1 (79,99] 2.014637976 0.643916954
## Male.1.(79,99] Male 1 (79,99] 2.749476909 1.345274826
#Poisson glm fit to survey data
fit1<-svyglm(menthlth~factor(sex)+factor(ins)+factor(agec), design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = menthlth ~ factor(sex) + factor(ins) + factor(agec),
## design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69743 0.16934 10.024 < 2e-16 ***
## factor(sex)Male -0.47714 0.09911 -4.815 1.5e-06 ***
## factor(ins)1 -0.04908 0.11656 -0.421 0.6737
## factor(agec)(24,39] -0.09084 0.15325 -0.593 0.5534
## factor(agec)(39,59] -0.08835 0.15681 -0.563 0.5732
## factor(agec)(59,79] -0.31249 0.16898 -1.849 0.0645 .
## factor(agec)(79,99] -0.73653 0.30811 -2.390 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 16.76934)
##
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## factor(sex)Male factor(ins)1 factor(agec)(24,39] factor(agec)(39,59]
## 0.621 0.952 0.913 0.915
## factor(agec)(59,79] factor(agec)(79,99]
## 0.732 0.479
fit2<-glm(menthlth~factor(sex)+factor(ins)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)
##
## Call:
## glm(formula = menthlth ~ factor(sex) + factor(ins) + factor(agec),
## family = poisson, data = brfss_17)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8541 -2.5479 -2.1199 -0.7795 11.2575
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.00512 0.02366 84.731 <2e-16 ***
## factor(sex)Male -0.43240 0.01257 -34.389 <2e-16 ***
## factor(ins)1 -0.26444 0.01584 -16.696 <2e-16 ***
## factor(agec)(24,39] -0.26604 0.02410 -11.041 <2e-16 ***
## factor(agec)(39,59] -0.22342 0.02270 -9.842 <2e-16 ***
## factor(agec)(59,79] -0.56329 0.02315 -24.328 <2e-16 ***
## factor(agec)(79,99] -0.97265 0.03256 -29.870 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 92434 on 8446 degrees of freedom
## Residual deviance: 89265 on 8440 degrees of freedom
## (186 observations deleted due to missingness)
## AIC: 99088
##
## Number of Fisher Scoring iterations: 7
#For the Poisson distribution, the mean and the variance are functions of one another (variance = mean for Poisson). So when you have more variability than you expect in your data, you have overdispersion. An easy check on this is to compare the residual deviance to the residual degrees of freedom. They ratio should be 1 if the model fits the data.
#So here is the test.
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 3.252148
#So used the chi sq test as well which give me a p of 0
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
#3) Consider a Negative binomial model
library(lmtest)
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.005124 0.092395 21.7016 < 2.2e-16 ***
## factor(sex)Male -0.432401 0.052985 -8.1608 3.328e-16 ***
## factor(ins)1 -0.264441 0.065234 -4.0537 5.040e-05 ***
## factor(agec)(24,39] -0.266040 0.092103 -2.8885 0.003871 **
## factor(agec)(39,59] -0.223421 0.088028 -2.5381 0.011147 *
## factor(agec)(59,79] -0.563287 0.091187 -6.1773 6.521e-10 ***
## factor(agec)(79,99] -0.972650 0.135672 -7.1691 7.547e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit.nb1<-glm.nb(menthlth~factor(sex),
data=sub,
weights=mmsawt/mean(mmsawt, na.rm=T))
fit.nb2<-glm.nb(menthlth~factor(sex)+factor(ins)+factor(agec),
data=sub,
weights=mmsawt/mean(mmsawt, na.rm=T))
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]), se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4]) )
##
## -----------------------------------------------------
## menthlth
## Model 1 Model 2
## -----------------------------------------------------
## factor(sex)Male -0.462*** -0.471***
## (0.100) (0.100)
## factor(ins)1 -0.063
## (0.121)
## factor(agec)(24,39] -0.138
## (0.154)
## factor(agec)(39,59] -0.148
## (0.157)
## factor(agec)(59,79] -0.300
## (0.171)
## factor(agec)(79,99] -0.706*
## (0.334)
## Constant 1.523*** 1.737***
## (0.172) (0.172)
## N 8,447 8,447
## Log Likelihood -15,355.070 -15,346.140
## theta 0.132*** (0.003) 0.132*** (0.003)
## AIC 30,714.130 30,706.290
## -----------------------------------------------------
## *p < .05; **p < .01; ***p < .001
#4) Compare the model fits of the alternative models using AIC
AIC(fit1)
## eff.p AIC deltabar
## 412.6704 92473.0919 68.7784
AIC(fit2)
## [1] 99088.35
AIC(fit.nb1)
## [1] 30714.13
AIC(fit.nb2)
## [1] 30706.29
#The best fit model, based on AIC is the second negative binomial model, which includes the predictors of sex, ins, and agec. This model has the lowest number and therefore is the most parsimonious fit.