##  QUESTION 1) For this assignment we will look at self reported mental health (reported by "bad" days in a month) and how it is influenced by marriage, sexual orientation, self reported stress (Likert scale) and self reported depression (yes or no).  
#Load packages required 
# a) How does marriage and sexual orientation influence an individuals mental health?
# b) An offset term is not necessary.  You use an offset term when the units of observation are different in some dimension, such as count comparisons over different times.
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(foreign)
brfss2017=read.xport("~/Desktop/DataFolder/brfss.XPT ")
#Rename variables 
brfss2017$marital<-Recode(brfss2017$MARITAL,recodes="1=1;5=0;else=NA",as.factor=T)
brfss2017$sxorient<-Recode(brfss2017$SXORIENT,recodes="1='a) Straight';2='b) Gay';else=NA")
brfss2017$genh<-Recode(brfss2017$GENHLTH,recodes="1:3=0;4:5=1;else=NA")
brfss2017$int<-interaction(brfss2017$marital,brfss2017$sxorient)
brfss2017$age<-Recode(brfss2017$X_AGE_G, recodes="1='18-24';2='25-34';3='35-44';4='45-54';5='55-64';6='65+';else=NA",as.factor=T)
brfss2017$educ<-Recode(brfss2017$EDUCA, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss2017$educ<-relevel(brfss2017$educ, ref='0Prim')
brfss2017$hyper<-Recode(brfss2017$BPMEDS,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$choles<-Recode(brfss2017$TOLDHI2,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$blpressure<-Recode(brfss2017$BPMEDS,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$heart<-Recode(brfss2017$CVDCRHD4,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$stroke<-Recode(brfss2017$CVDSTRK3,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$chronic<-Recode(brfss2017$CHCCOPD1,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$depressed<-Recode(brfss2017$ADDEPEV2,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$hlthinsur<-Recode(brfss2017$HLTHPLN1,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$checkup<-Recode(brfss2017$CHECKUP1,recodes="1='1 year';2='2 years';3='5 years';4='5+ years';else=NA",as.factor=T)
brfss2017$stress<-Recode(brfss2017$SDHSTRES,recodes="1='a-None';2='b-Little';3='c-Some';4='d-Most';5='e-Always';else=NA",as.factor=T)
brfss2017$emosupport<-Recode(brfss2017$EMTSUPRT,recodes="1='a-Always';2='b-Usually';3='c-Sometimes';4='d-Rarely';5='e-Never';else=NA",as.factor=T)
brfss2017$satisfed<-Recode(brfss2017$LSATISFY,recodes="1:2='a-Satisfied';3:4='b-Dissatisfied';else=NA",as.factor=T)
brfss2017$safeneigh<-Recode(brfss2017$HOWSAFE1,recodes="1:2='a-Safe';3:4='b-Unsafe';else=NA",as.factor=T)
brfss2017$menthlth<-Recode(brfss2017$MENTHLTH,recodes="88=0;77=NA;99=NA")
brfss2017$educ<-Recode(brfss2017$EDUCA, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss2017$educ<-relevel(brfss2017$educ, ref='0Prim')
brfss2017$age<-Recode(brfss2017$X_AGE_G, recodes="1='18-24';2='25-34';3='35-44';4='45-54';5='55-64';6='65+';else=NA",as.factor=T)
brfss2017$poorh<-Recode(brfss2017$POORHLTH,recodes="88=0;77=NA;99=NA")
brfss2017$physh<-Recode(brfss2017$PHYSHLTH,recodes="88=0;77=NA;99=NA")
brfss2017$menth<-Recode(brfss2017$MENTHLTH,recodes="88=0;77=NA;99=NA")
#Our outcome is self reported mental health.
hist(brfss2017$menthlth)

summary(brfss2017$menthlth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   3.595   2.000  30.000    7203
library(dplyr)
#Choose variables of interest and tell R our survey design
library(dplyr)
sub<-brfss2017%>%
  select(menthlth,marital,sxorient,X_LLCPWT,X_STRWT,hyper,choles,blpressure,depressed,stress,emosupport)%>%
  filter(complete.cases(.))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids = ~1,strata=~X_STRWT,weights = ~X_LLCPWT,data = sub)
#descriptive statistics
svyhist(~menthlth,des)

svyby(~menthlth,~sxorient+marital,des,svymean,na.rm=T)
##                  sxorient marital menthlth        se
## a) Straight.0 a) Straight       0 6.773951 0.9743884
## b) Gay.0           b) Gay       0 3.727829 1.6683535
## a) Straight.1 a) Straight       1 2.455619 0.1836451
## b) Gay.1           b) Gay       1 1.476447 0.6146046
# QUESTION 2) Poisson glm fit to survey data
fit1<-svyglm(menthlth~factor(sxorient)+factor(marital)+factor(stress)+factor(depressed),design=des,family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = menthlth ~ factor(sxorient) + factor(marital) + 
##     factor(stress) + factor(depressed), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~X_STRWT, weights = ~X_LLCPWT, data = sub)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.5186     0.3693   1.404 0.160409    
## factor(sxorient)b) Gay  -0.6468     0.2568  -2.518 0.011840 *  
## factor(marital)1        -0.6196     0.1826  -3.393 0.000699 ***
## factor(stress)b-Little   0.4826     0.2791   1.729 0.083806 .  
## factor(stress)c-Some     1.0922     0.2746   3.978 7.08e-05 ***
## factor(stress)d-Most     1.8090     0.2799   6.464 1.15e-10 ***
## factor(stress)e-Always   1.9871     0.3252   6.111 1.09e-09 ***
## factor(depressed)Y       0.9928     0.1345   7.383 1.89e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 13.63436)
## 
## Number of Fisher Scoring iterations: 7
#"Risk ratios" which show the change in the mean
round(exp(summary(fit1)$coef[-1,1]),3)
## factor(sxorient)b) Gay       factor(marital)1 factor(stress)b-Little 
##                  0.524                  0.538                  1.620 
##   factor(stress)c-Some   factor(stress)d-Most factor(stress)e-Always 
##                  2.981                  6.104                  7.295 
##     factor(depressed)Y 
##                  2.699
#a) 
fit2<-glm(menthlth~factor(sxorient)+factor(marital)+factor(stress)+factor(depressed),data = brfss2017,family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = menthlth ~ factor(sxorient) + factor(marital) + 
##     factor(stress) + factor(depressed), family = poisson, data = brfss2017)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.427  -1.767  -1.135  -1.047  13.107  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -0.236443   0.009712 -24.345  < 2e-16 ***
## factor(sxorient)b) Gay  0.096053   0.016635   5.774 7.74e-09 ***
## factor(marital)1       -0.204349   0.006076 -33.631  < 2e-16 ***
## factor(stress)b-Little  0.885911   0.010678  82.968  < 2e-16 ***
## factor(stress)c-Some    1.642011   0.010252 160.172  < 2e-16 ***
## factor(stress)d-Most    2.256092   0.010861 207.719  < 2e-16 ***
## factor(stress)e-Always  2.495321   0.011553 215.989  < 2e-16 ***
## factor(depressed)Y      0.962199   0.006343 151.684  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 411285  on 40880  degrees of freedom
## Residual deviance: 247715  on 40873  degrees of freedom
##   (409135 observations deleted due to missingness)
## AIC: 292552
## 
## Number of Fisher Scoring iterations: 7
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.461831
#The p value is 0 so the model is not a good fit.
1-pchisq(fit2$deviance,df=fit2$df.residual)
## [1] 0
fit3<-glm(menthlth~factor(sxorient)+factor(marital)+factor(stress)+factor(depressed),data = brfss2017,family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = menthlth ~ factor(sxorient) + factor(marital) + 
##     factor(stress) + factor(depressed), family = quasipoisson, 
##     data = brfss2017)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.427  -1.767  -1.135  -1.047  13.107  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.23644    0.03496  -6.762 1.37e-11 ***
## factor(sxorient)b) Gay  0.09605    0.05989   1.604    0.109    
## factor(marital)1       -0.20435    0.02187  -9.342  < 2e-16 ***
## factor(stress)b-Little  0.88591    0.03844  23.047  < 2e-16 ***
## factor(stress)c-Some    1.64201    0.03691  44.492  < 2e-16 ***
## factor(stress)d-Most    2.25609    0.03910  57.699  < 2e-16 ***
## factor(stress)e-Always  2.49532    0.04159  59.997  < 2e-16 ***
## factor(depressed)Y      0.96220    0.02284  42.134  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 12.96011)
## 
##     Null deviance: 411285  on 40880  degrees of freedom
## Residual deviance: 247715  on 40873  degrees of freedom
##   (409135 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7
# b) the Poisson model is not a good fit.
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
coeftest(fit2,vcov=vcovHC(fit2,type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                         Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)            -0.236443   0.037476  -6.3092 2.804e-10 ***
## factor(sxorient)b) Gay  0.096053   0.050049   1.9192   0.05496 .  
## factor(marital)1       -0.204349   0.020130 -10.1514 < 2.2e-16 ***
## factor(stress)b-Little  0.885911   0.041202  21.5017 < 2.2e-16 ***
## factor(stress)c-Some    1.642011   0.040992  40.0565 < 2.2e-16 ***
## factor(stress)d-Most    2.256092   0.043907  51.3838 < 2.2e-16 ***
## factor(stress)e-Always  2.495321   0.045487  54.8580 < 2.2e-16 ***
## factor(depressed)Y      0.962199   0.024348  39.5189 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# QUESTION 3  Consider a negative binomial model
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.nb2<-glm.nb(menthlth~factor(sxorient)+factor(marital)+factor(stress)+factor(depressed),
                data = sub,
                weights=X_LLCPWT/mean(X_LLCPWT,na.rm=T))
summary(fit.nb2)
## 
## Call:
## glm.nb(formula = menthlth ~ factor(sxorient) + factor(marital) + 
##     factor(stress) + factor(depressed), data = sub, weights = X_LLCPWT/mean(X_LLCPWT, 
##     na.rm = T), init.theta = 0.1577185586, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2131  -0.7515  -0.4993  -0.2074   7.3268  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              0.7161     0.1193   6.002 1.95e-09 ***
## factor(sxorient)b) Gay  -1.2072     0.3441  -3.508 0.000451 ***
## factor(marital)1        -1.0159     0.1150  -8.836  < 2e-16 ***
## factor(stress)b-Little   0.5146     0.1021   5.041 4.62e-07 ***
## factor(stress)c-Some     1.2056     0.1276   9.451  < 2e-16 ***
## factor(stress)d-Most     2.0072     0.1942  10.334  < 2e-16 ***
## factor(stress)e-Always   2.0302     0.2789   7.280 3.35e-13 ***
## factor(depressed)Y       1.1083     0.1140   9.720  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1577) family taken to be 1)
## 
##     Null deviance: 3204.8  on 3889  degrees of freedom
## Residual deviance: 2533.4  on 3882  degrees of freedom
## AIC: 12031
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.15772 
##           Std. Err.:  0.00614 
## 
##  2 x log-likelihood:  -12013.02300
coeftest(fit.nb2,vcov=vcovHC(fit.nb2,type="HC1",cluster="X_STRWT"))
## 
## z test of coefficients:
## 
##                        Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)             0.71609    0.48876  1.4651  0.142890    
## factor(sxorient)b) Gay -1.20715    0.45477 -2.6544  0.007945 ** 
## factor(marital)1       -1.01587    0.40009 -2.5391  0.011114 *  
## factor(stress)b-Little  0.51464    0.24823  2.0733  0.038148 *  
## factor(stress)c-Some    1.20563    0.24679  4.8852 1.033e-06 ***
## factor(stress)d-Most    2.00721    0.26087  7.6944 1.422e-14 ***
## factor(stress)e-Always  2.03015    0.33717  6.0211 1.733e-09 ***
## factor(depressed)Y      1.10827    0.13997  7.9178 2.418e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# QUESTION 4  The negative binomial model gave the lowest AIC.