Dependent variable

#Mental health

## I use 2017 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART metro area survey data for this homework. My research question is how various socio-demographic variables may affect the average number of days in a month a person can tell his mental health was not good (which includes stress, depression, and problems with emotions). Therefore, the dependent variable is mental health (count variable measured as a number of days out of 30 days when mental health being not good). Predictors include income and employment status as socio-economic variables, age and health insurance coverage and bmi as health related variables.
brfss_17$men_health<-Recode(brfss_17$menthlth,recodes="77=NA;88=NA;99=NA")
head(brfss_17$men_health)
## [1] NA NA  5  3 NA NA

Predictors

#income
brfss_17$income<-Recode(brfss_17$income2,recodes="77=NA;99=NA")
head(brfss_17$income)
## [1] 5 8 1 4 6 6
str(brfss_17$income)
##  num [1:230875] 5 8 1 4 6 6 NA NA 6 NA ...
##  - attr(*, "label")= chr "INCOME LEVEL"
#age; reference group - 18-24 old
brfss_17$age<-Recode(brfss_17$age80,recodes="18:24='Age 18 to 24';
                                             25:29='Age 25 to 29';
                                             30:34='Age 30 to 34';
                                             35:39='Age 35 to 39';
                                             40:44='Age 40 to 44';
                                             45:49='Age 45 to 49';
                                             50:54='Age 50 to 54';
                                             55:59='Age 55 to 59';
                                             60:64='Age 60 to 64';
                                             65:69='Age 65 to 69';
                                             70:74='Age 70 to 74';
                                             75:79='Age 75 to 79';
                                             80:99='Age 80 or older'",as.factor=TRUE)

#bmi
brfss_17$bmi<-brfss_17$bmi5/100
#health_care; reference group - "1" has healthcare coverage
brfss_17$health_care<-Recode(brfss_17$hlthpln1,recodes="7=NA;9=NA",as.factor=TRUE)
levels(brfss_17$health_care)
## [1] "1" "2"
#employment Status; 1 - employed (reference), 2 - unemployed, 3 - other (homemaker, student, retired, unable to work)
table(brfss_17$employ1)
## 
##     1     2     3     4     5     6     7     8     9 
## 99410 19204  5422  4933 12129  6975 66157 14504  2138
brfss_17$employ<-Recode(brfss_17$employ1,recodes="1:2='Employed';3:4='Unemployed';5:8='Other';9=NA", as.factor=T)
str(brfss_17$employ)
##  Factor w/ 3 levels "Employed","Other",..: 1 1 1 1 2 1 1 2 2 2 ...
levels(brfss_17$employ)
## [1] "Employed"   "Other"      "Unemployed"

##Analysis

sub<-brfss_17%>%
  select(men_health, income, bmi,
         age,health_care,employ, mmsawt, ststr) %>%
  filter( complete.cases(.))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, 
               weights=~mmsawt,
               data =sub[is.na(sub$mmsawt)==F,])  

#Using Poisson glm fit the survey design               
fit1<-svyglm(men_health~income+age+bmi+health_care+employ, design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = men_health ~ income + age + bmi + health_care + 
##     employ, design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub[is.na(sub$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.300967   0.044745  51.424  < 2e-16 ***
## income             -0.064747   0.003715 -17.428  < 2e-16 ***
## ageAge 25 to 29     0.057943   0.033037   1.754 0.079458 .  
## ageAge 30 to 34     0.053423   0.034101   1.567 0.117214    
## ageAge 35 to 39     0.075320   0.033328   2.260 0.023828 *  
## ageAge 40 to 44     0.063029   0.035320   1.784 0.074351 .  
## ageAge 45 to 49     0.066648   0.035262   1.890 0.058755 .  
## ageAge 50 to 54     0.139957   0.035569   3.935 8.34e-05 ***
## ageAge 55 to 59     0.142778   0.032574   4.383 1.17e-05 ***
## ageAge 60 to 64     0.117346   0.033649   3.487 0.000488 ***
## ageAge 65 to 69     0.021603   0.040399   0.535 0.592834    
## ageAge 70 to 74    -0.044115   0.048290  -0.914 0.360955    
## ageAge 75 to 79    -0.085165   0.055015  -1.548 0.121622    
## ageAge 80 or older -0.026883   0.049614  -0.542 0.587930    
## bmi                 0.008518   0.001010   8.436  < 2e-16 ***
## health_care2        0.075150   0.025106   2.993 0.002761 ** 
## employOther         0.217753   0.020177  10.792  < 2e-16 ***
## employUnemployed    0.226700   0.029143   7.779 7.44e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 9.048215)
## 
## Number of Fisher Scoring iterations: 5
#The results show that income has a negative realtionship with average number of days reporting bad mental health. As for age variable the older respondents get the larger is the average number of days with bad mental health (this trend is true until the age of 64), respondents of older age groups have on average fewer bad mental health days than the reference group (but the results are not significant for these groups). Those who do not have an insurance coverage on avearge report more days of bad mental health; unemployed or other categories report on avearge more days of having bad mental health than employed.
fit2<-glm(men_health~income+age+bmi+health_care+employ, data=sub, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = men_health ~ income + age + bmi + health_care + 
##     employ, family = poisson, data = sub)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -6.278  -2.717  -1.367   1.697   6.860  
## 
## Coefficients:
##                      Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)         2.4492995  0.0075956  322.463  < 2e-16 ***
## income             -0.0831352  0.0005953 -139.661  < 2e-16 ***
## ageAge 25 to 29     0.0467084  0.0064293    7.265 3.73e-13 ***
## ageAge 30 to 34     0.0700591  0.0064544   10.854  < 2e-16 ***
## ageAge 35 to 39     0.1006858  0.0063601   15.831  < 2e-16 ***
## ageAge 40 to 44     0.0831107  0.0065464   12.696  < 2e-16 ***
## ageAge 45 to 49     0.0958100  0.0062466   15.338  < 2e-16 ***
## ageAge 50 to 54     0.1277875  0.0059426   21.504  < 2e-16 ***
## ageAge 55 to 59     0.1391562  0.0057667   24.131  < 2e-16 ***
## ageAge 60 to 64     0.0857108  0.0058879   14.557  < 2e-16 ***
## ageAge 65 to 69     0.0119522  0.0062354    1.917   0.0553 .  
## ageAge 70 to 74    -0.0481779  0.0069511   -6.931 4.18e-12 ***
## ageAge 75 to 79    -0.1086489  0.0082983  -13.093  < 2e-16 ***
## ageAge 80 or older -0.0905993  0.0080089  -11.312  < 2e-16 ***
## bmi                 0.0066148  0.0001750   37.808  < 2e-16 ***
## health_care2        0.0867025  0.0042977   20.174  < 2e-16 ***
## employOther         0.2257106  0.0032301   69.877  < 2e-16 ***
## employUnemployed    0.2086925  0.0050497   41.327  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 553509  on 60043  degrees of freedom
## Residual deviance: 506161  on 60026  degrees of freedom
## AIC: 728751
## 
## Number of Fisher Scoring iterations: 5
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.903854
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
##Regular glm does not fit the data well, since p value is 0 when we use deviance to test the model fit. 
fit3<-glm(men_health~income+age+bmi+health_care+employ, data=sub, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = men_health ~ income + age + bmi + health_care + 
##     employ, family = quasipoisson, data = sub)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -6.278  -2.717  -1.367   1.697   6.860  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.4492995  0.0229795 106.586  < 2e-16 ***
## income             -0.0831352  0.0018009 -46.163  < 2e-16 ***
## ageAge 25 to 29     0.0467084  0.0194509   2.401 0.016338 *  
## ageAge 30 to 34     0.0700591  0.0195269   3.588 0.000334 ***
## ageAge 35 to 39     0.1006858  0.0192415   5.233 1.68e-07 ***
## ageAge 40 to 44     0.0831107  0.0198051   4.196 2.72e-05 ***
## ageAge 45 to 49     0.0958100  0.0188982   5.070 3.99e-07 ***
## ageAge 50 to 54     0.1277875  0.0179785   7.108 1.19e-12 ***
## ageAge 55 to 59     0.1391562  0.0174462   7.976 1.53e-15 ***
## ageAge 60 to 64     0.0857108  0.0178130   4.812 1.50e-06 ***
## ageAge 65 to 69     0.0119522  0.0188644   0.634 0.526354    
## ageAge 70 to 74    -0.0481779  0.0210295  -2.291 0.021969 *  
## ageAge 75 to 79    -0.1086489  0.0251054  -4.328 1.51e-05 ***
## ageAge 80 or older -0.0905993  0.0242299  -3.739 0.000185 ***
## bmi                 0.0066148  0.0005293  12.497  < 2e-16 ***
## health_care2        0.0867025  0.0130020   6.668 2.61e-11 ***
## employOther         0.2257106  0.0097723  23.097  < 2e-16 ***
## employUnemployed    0.2086925  0.0152773  13.660  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 9.152833)
## 
##     Null deviance: 553509  on 60043  degrees of freedom
## Residual deviance: 506161  on 60026  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
##We use quasi poisson distribution to scale the variance and get the results very similar to survey design model (however, coefficients are more significant than in survey).
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                       Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept)         2.44929954  0.02268729 107.9591 < 2.2e-16 ***
## income             -0.08313517  0.00178003 -46.7043 < 2.2e-16 ***
## ageAge 25 to 29     0.04670836  0.01889952   2.4714 0.0134584 *  
## ageAge 30 to 34     0.07005909  0.01915658   3.6572 0.0002550 ***
## ageAge 35 to 39     0.10068576  0.01888837   5.3306 9.791e-08 ***
## ageAge 40 to 44     0.08311071  0.01928458   4.3097 1.635e-05 ***
## ageAge 45 to 49     0.09580996  0.01840718   5.2050 1.940e-07 ***
## ageAge 50 to 54     0.12778755  0.01748185   7.3097 2.677e-13 ***
## ageAge 55 to 59     0.13915616  0.01692073   8.2240 < 2.2e-16 ***
## ageAge 60 to 64     0.08571079  0.01753926   4.8868 1.025e-06 ***
## ageAge 65 to 69     0.01195216  0.01879096   0.6361 0.5247378    
## ageAge 70 to 74    -0.04817792  0.02128211  -2.2638 0.0235879 *  
## ageAge 75 to 79    -0.10864890  0.02560260  -4.2437 2.199e-05 ***
## ageAge 80 or older -0.09059926  0.02477666  -3.6566 0.0002555 ***
## bmi                 0.00661480  0.00051901  12.7451 < 2.2e-16 ***
## health_care2        0.08670253  0.01278110   6.7837 1.172e-11 ***
## employOther         0.22571058  0.00987013  22.8681 < 2.2e-16 ***
## employUnemployed    0.20869251  0.01494534  13.9637 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Negative binomial results are very close to complex survey design.