#Data Prep
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##Get TX Cities 
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)

##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)

brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
                   
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

brfss_17$menthlth<-recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
#1. Define a count outcome for the dataset of your choosing
  # a. State a research question about your outcome
   #b. Is an offset term necessary? why or why not?

##Outcome: Mental Health -  Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
##Question: Do men or women with insurance and at particular age ranges have more or less "not good" mental health days?
##No, an offset term is not necessary 
#2) Consider a Poisson regression model for the outcome
   #a. Evaluate the level of dispersion in the outcome
        ##So, I compared the residual deviance to the residual degrees of freedom and my result was 3.252148. According to your lecture, the ratio should be 1 if the data fits because the the mean and the variance are functions of one another. 
   #b. Is the Poisson model a good choice?
     ## Because my result was not a 1, the Poisson is not a good model choice for my data. (I also ran a chi square test and the result was 0). 


sub<-brfss_17%>%
  select(menthlth, sex, ins, agec, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#First I do some simple descriptives
svyhist(~menthlth, des)

svyby(~menthlth, ~sex+ins+agec, des, svymean, na.rm=T)
##                     sex ins    agec    menthlth          se
## Female.0.(0,24]  Female   0  (0,24] 3.939403268 1.981056418
## Male.0.(0,24]      Male   0  (0,24] 6.072709386 1.584050333
## Female.1.(0,24]  Female   1  (0,24] 4.667179270 0.907141173
## Male.1.(0,24]      Male   1  (0,24] 2.570133595 0.502195518
## Female.0.(24,39] Female   0 (24,39] 5.231458851 1.032839628
## Male.0.(24,39]     Male   0 (24,39] 2.622008209 0.466837089
## Female.1.(24,39] Female   1 (24,39] 4.971400214 0.552072619
## Male.1.(24,39]     Male   1 (24,39] 2.855081565 0.520631693
## Female.0.(39,59] Female   0 (39,59] 4.171160086 0.751796249
## Male.0.(39,59]     Male   0 (39,59] 3.224623402 1.228378557
## Female.1.(39,59] Female   1 (39,59] 5.559298422 0.706171516
## Male.1.(39,59]     Male   1 (39,59] 2.304955880 0.331595245
## Female.0.(59,79] Female   0 (59,79] 4.828986899 2.237288861
## Male.0.(59,79]     Male   0 (59,79] 0.683075231 0.361273931
## Female.1.(59,79] Female   1 (59,79] 3.218155057 0.468024114
## Male.1.(59,79]     Male   1 (59,79] 3.165727627 0.563740504
## Female.0.(79,99] Female   0 (79,99] 9.506009579 6.121732633
## Male.0.(79,99]     Male   0 (79,99] 0.002207588 0.002915385
## Female.1.(79,99] Female   1 (79,99] 2.014637976 0.643916954
## Male.1.(79,99]     Male   1 (79,99] 2.749476909 1.345274826
#Poisson glm fit to survey data
fit1<-svyglm(menthlth~factor(sex)+factor(ins)+factor(agec), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = menthlth ~ factor(sex) + factor(ins) + factor(agec), 
##     design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.69743    0.16934  10.024  < 2e-16 ***
## factor(sex)Male     -0.47714    0.09911  -4.815  1.5e-06 ***
## factor(ins)1        -0.04908    0.11656  -0.421   0.6737    
## factor(agec)(24,39] -0.09084    0.15325  -0.593   0.5534    
## factor(agec)(39,59] -0.08835    0.15681  -0.563   0.5732    
## factor(agec)(59,79] -0.31249    0.16898  -1.849   0.0645 .  
## factor(agec)(79,99] -0.73653    0.30811  -2.390   0.0168 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 16.76934)
## 
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
##     factor(sex)Male        factor(ins)1 factor(agec)(24,39] factor(agec)(39,59] 
##               0.621               0.952               0.913               0.915 
## factor(agec)(59,79] factor(agec)(79,99] 
##               0.732               0.479
fit2<-glm(menthlth~factor(sex)+factor(ins)+factor(agec), data=brfss_17, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = menthlth ~ factor(sex) + factor(ins) + factor(agec), 
##     family = poisson, data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8541  -2.5479  -2.1199  -0.7795  11.2575  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          2.00512    0.02366  84.731   <2e-16 ***
## factor(sex)Male     -0.43240    0.01257 -34.389   <2e-16 ***
## factor(ins)1        -0.26444    0.01584 -16.696   <2e-16 ***
## factor(agec)(24,39] -0.26604    0.02410 -11.041   <2e-16 ***
## factor(agec)(39,59] -0.22342    0.02270  -9.842   <2e-16 ***
## factor(agec)(59,79] -0.56329    0.02315 -24.328   <2e-16 ***
## factor(agec)(79,99] -0.97265    0.03256 -29.870   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 92434  on 8446  degrees of freedom
## Residual deviance: 89265  on 8440  degrees of freedom
##   (186 observations deleted due to missingness)
## AIC: 99088
## 
## Number of Fisher Scoring iterations: 7
#For the Poisson distribution, the mean and the variance are functions of one another (variance = mean for Poisson). So when you have more variability than you expect in your data, you have overdispersion. An easy check on this is to compare the residual deviance to the residual degrees of freedom. They ratio should be 1 if the model fits the data.
#So here is the test. 
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 3.252148
#So used the chi sq test as well which give me a p of 0
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
#3) Consider a Negative binomial model
library(lmtest)
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                      Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)          2.005124   0.092395 21.7016 < 2.2e-16 ***
## factor(sex)Male     -0.432401   0.052985 -8.1608 3.328e-16 ***
## factor(ins)1        -0.264441   0.065234 -4.0537 5.040e-05 ***
## factor(agec)(24,39] -0.266040   0.092103 -2.8885  0.003871 ** 
## factor(agec)(39,59] -0.223421   0.088028 -2.5381  0.011147 *  
## factor(agec)(59,79] -0.563287   0.091187 -6.1773 6.521e-10 ***
## factor(agec)(79,99] -0.972650   0.135672 -7.1691 7.547e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.nb1<-glm.nb(menthlth~factor(sex),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))

fit.nb2<-glm.nb(menthlth~factor(sex)+factor(ins)+factor(agec),
              data=sub,
              weights=mmsawt/mean(mmsawt, na.rm=T))
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )
## 
## -----------------------------------------------------
##                                 menthlth             
##                         Model 1          Model 2     
## -----------------------------------------------------
## factor(sex)Male        -0.462***        -0.471***    
##                         (0.100)          (0.100)     
## factor(ins)1                              -0.063     
##                                          (0.121)     
## factor(agec)(24,39]                       -0.138     
##                                          (0.154)     
## factor(agec)(39,59]                       -0.148     
##                                          (0.157)     
## factor(agec)(59,79]                       -0.300     
##                                          (0.171)     
## factor(agec)(79,99]                      -0.706*     
##                                          (0.334)     
## Constant                1.523***         1.737***    
##                         (0.172)          (0.172)     
## N                        8,447            8,447      
## Log Likelihood        -15,355.070      -15,346.140   
## theta               0.132*** (0.003) 0.132*** (0.003)
## AIC                    30,714.130       30,706.290   
## -----------------------------------------------------
## *p < .05; **p < .01; ***p < .001
#4) Compare the model fits of the alternative models using AIC
AIC(fit1)
##      eff.p        AIC   deltabar 
##   412.6704 92473.0919    68.7784
AIC(fit2)
## [1] 99088.35
AIC(fit.nb1)
## [1] 30714.13
AIC(fit.nb2)
## [1] 30706.29
#The best fit model, based on AIC is the second negative binomial model, which includes the predictors of sex, ins, and agec. This model has the lowest number and therefore is the most parsimonious fit.