library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

Fix variable names

#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

1. Define a binary outcome variable of your choosing and define how you recode the original variable.

Depression - (Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

2. State a research question about what factors you believe will affect your outcome variable.

What relationship do veteran status and labor participation have with depression? 

3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

EMPLOY1 - Employment Status; VETERAN3 - Are You A Veteran

Recoding of variables

#depression
brfss_17$depr<-Recode(brfss_17$addepev2, recodes ="7:9=NA; 1=1;2=0")

#veteran
#brfss_17$vet<-Recode(brfss_17$veteran3, recodes ="7:9=NA; 1=1;2=0", as.factor=T)

brfss_17$vet<-Recode(brfss_17$veteran3, recodes="1='Veteran'; 2='NV'; else=NA", as.factor=T)

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

Filter cases

brfss_17<-brfss_17%>%
  filter(sex!=9, is.na(vet)==F,is.na(employ)==F)

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

#First we will do some tables
#Raw frequencies
table(brfss_17$depr, brfss_17$vet )
##    
##         NV Veteran
##   0 158907   23743
##   1  39937    4691
table(brfss_17$depr, brfss_17$employ)
##    
##     Employed  nilf retired unable
##   0    98732 22363   54988   6567
##   1    19175  6892   10783   7778
#column percentages
prop.table(table(brfss_17$depr, brfss_17$vet  ), margin=2)
##    
##            NV   Veteran
##   0 0.7991541 0.8350215
##   1 0.2008459 0.1649785
prop.table(table(brfss_17$depr, brfss_17$employ  ), margin=2)
##    
##      Employed      nilf   retired    unable
##   0 0.8373718 0.7644163 0.8360524 0.4577902
##   1 0.1626282 0.2355837 0.1639476 0.5422098
#basic chi square test of independence
chisq.test(table(brfss_17$depr, brfss_17$vet))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(brfss_17$depr, brfss_17$vet)
## X-squared = 202.58, df = 1, p-value < 2.2e-16
chisq.test(table(brfss_17$depr, brfss_17$employ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$depr, brfss_17$employ)
## X-squared = 12447, df = 3, p-value < 2.2e-16

Create survey design object

brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(depr)==F)

#
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17 )
  • 4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

sv.table<-svyby(formula = ~depr, by = ~vet, design = des, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Depression by Veteran Status",
      align = 'c',  
      format = "html")
Survey Estimates of Depression by Veteran Status
vet depr se
NV NV 0.1605685 0.0091099
Veteran Veteran 0.2085911 0.0300425
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss_17)

sv.table<-svyby(formula = ~depr, by = ~vet, design = nodes, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Estimates of Depression by Veteran Status - No survey design",
      align = 'c',  
      format = "html")
Estimates of Depression by Veteran Status - No survey design
vet depr se
NV NV 0.1804511 0.0044966
Veteran Veteran 0.1565957 0.0106027
sv.table<-svyby(formula = ~depr, by = ~employ, design = des, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Depression by Employment Status",
      align = 'c',  
      format = "html")
Survey Estimates of Depression by Employment Status
employ depr se
Employed Employed 0.1191093 0.0086004
nilf nilf 0.1788280 0.0239301
retired retired 0.1509674 0.0184346
unable unable 0.6205544 0.0469430
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss_17)

sv.table<-svyby(formula = ~depr, by = ~employ, design = nodes, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Estimates of Depression by Employment Status - No survey design",
      align = 'c',  
      format = "html")
Estimates of Depression by Employment Status - No survey design
employ depr se
Employed Employed 0.1394644 0.0055069
nilf nilf 0.1841693 0.0108520
retired retired 0.1582840 0.0070198
unable unable 0.5235507 0.0212590
  • 4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                    Stratified by depr
##                     0    1    p      test
##   n                 6986 1504            
##   employ (%)                  <0.001     
##      Employed       48.8 36.7            
##      nilf           14.9 15.6            
##      retired        32.6 28.5            
##      unable          3.8 19.2            
##   vet = Veteran (%) 14.2 12.2  0.052
#using survey design
st1<-svyCreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##                    Stratified by depr
##                     0          1         p      test
##   n                 12597218.4 2498940.0            
##   employ (%)                             <0.001     
##      Employed             61.4      41.8            
##      nilf                 21.6      23.7            
##      retired              14.4      12.9            
##      unable                2.6      21.5            
##   vet = Veteran (%)        9.8      13.0  0.097
  • 4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

    Yes, the percent estimates are differet for for both independent variables across all levels and when looking at depression by veteran status, the survey design results show a greater proportion of veterans report depression whereas the non-survey design results show the opposite.