library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")

names(brfss20sm) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(brfss20sm)))

Question 1: Define a binary outcome variable of your choosing and define how you recode the original variable

My binary outcome variable is Depressive disorder (ADDEPEV3).

brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")

Question 2: State a research question about what factors you believe will affect your outcome variable

What are the effects of employment and marital status in depressive disorder?

Question 3: Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

Here 2 predictor variables are: Employment (employ1) and Marital Status (marital).

#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
                       recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
                       as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ, ref='Employed')

#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
                      recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
                      as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst, ref='married')
brfss20sm<-brfss20sm%>%
  filter(is.na(marst)==F,
         is.na(employ)==F,
         is.na(depression)==F)

Question 4: Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

1) Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights:

#Raw frequencies of depression by employment
table(brfss20sm$depression, brfss20sm$employ)
##    
##     Employed  nilf retired unable
##   0    84849 19674   43204   5280
##   1    16601  6302    7830   5082
#column percentages
100*prop.table(table(brfss20sm$depression, brfss20sm$employ), margin=2)
##    
##     Employed     nilf  retired   unable
##   0 83.63627 75.73914 84.65729 50.95541
##   1 16.36373 24.26086 15.34271 49.04459

Here we can see higher proportion of people who are unable to work have depressive disorder compared to who are retired or employed.

#Raw frequencies of depression by marital status
table(brfss20sm$depression, brfss20sm$marst)
##    
##     married cohab divorced    nm separated widowed
##   0   82617  5954    17404 29260      2735   15037
##   1   14162  1961     6465  8968      1122    3137
#column percentages
100*prop.table(table(brfss20sm$depression, brfss20sm$marst), margin=2)
##    
##      married    cohab divorced       nm separated  widowed
##   0 85.36666 75.22426 72.91466 76.54076  70.91003 82.73908
##   1 14.63334 24.77574 27.08534 23.45924  29.08997 17.26092

Here we observe a relative higher proportion of people who are seperated or divorced have depressive disorder rather than who are in married status.

#Bivariate tests of independence:

#basic chi square test of independence
chisq.test(table(brfss20sm$depression, brfss20sm$employ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss20sm$depression, brfss20sm$employ)
## X-squared = 7456.1, df = 3, p-value < 2.2e-16

So the \(\chi^2\) statistic is large and the p - value is less than 0.05, suggesting that there is an association between depressive disorder and employment.

#basic chi square test of independence
chisq.test(table(brfss20sm$depression, brfss20sm$marst))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss20sm$depression, brfss20sm$marst)
## X-squared = 3173.3, df = 5, p-value < 2.2e-16

So the \(\chi^2\) statistic is large and the p - value is less than 0.05, suggesting that there is an association between the depressive disorder and marital status.

Creating a survey design object:

library(srvyr)
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss20sm )
des
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss20sm)

Simple weighted analysis

Now , we re-do the analysis from above using only weights

Weight1<-wtd.table(brfss20sm$depression,
               brfss20sm$employ,
               weights = brfss20sm$mmsawt)

#proportions
prop.table(
  Weight1,
  margin=2)
##    Employed      nilf   retired    unable
## 0 0.8493814 0.7842569 0.8549209 0.5605408
## 1 0.1506186 0.2157431 0.1450791 0.4394592
#compare that with the original, unweighted proportions
prop.table(table(brfss20sm$depression,
                 brfss20sm$employ),
           margin=2)
##    
##      Employed      nilf   retired    unable
##   0 0.8363627 0.7573914 0.8465729 0.5095541
##   1 0.1636373 0.2426086 0.1534271 0.4904459

Here, the prevalence of depressive disorder is higher in the sample than the population.

Weight2<-wtd.table(brfss20sm$depression,
               brfss20sm$marst,
               weights = brfss20sm$mmsawt)

#proportions
prop.table(
  Weight2,
  margin=2)
##     married     cohab  divorced        nm separated   widowed
## 0 0.8652365 0.7625222 0.7482149 0.7864186 0.7329320 0.8301481
## 1 0.1347635 0.2374778 0.2517851 0.2135814 0.2670680 0.1698519
#compare that with the original, unweighted proportions
prop.table(table(brfss20sm$depression,
                 brfss20sm$marst),
           margin=2)
##    
##       married     cohab  divorced        nm separated   widowed
##   0 0.8536666 0.7522426 0.7291466 0.7654076 0.7091003 0.8273908
##   1 0.1463334 0.2477574 0.2708534 0.2345924 0.2908997 0.1726092

2)Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics:

Proper survey design analysis

#Now consider the full sample design + weights
Weight1<-svyby(formula = ~depression,
           by=~employ,
           design = des,
           FUN=svymean)

svychisq(~depression+employ, design = des)
Pearson's X^2: Rao & Scott adjustment

data: svychisq(~depression + employ, design = des) F = 431.42, ndf = 2.988, ddf = 560673.041, p-value < 2.2e-16

knitr::kable(Weight1,
      caption = "Survey Estimates of Depressive Disorder by Employment",
      align = 'c',  
      format = "html")
Survey Estimates of Depressive Disorder by Employment
employ depression se
Employed Employed 0.1506186 0.0022782
nilf nilf 0.2157431 0.0049397
retired retired 0.1450791 0.0039544
unable unable 0.4394592 0.0107443
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss20sm)

Weight1<-svyby(formula = ~factor(depression),
                by = ~employ,
                design = nodes,
                FUN = svymean,
                na.rm=T)
knitr::kable(Weight1,
      caption = "Estimates of Depressive Disorder by Employment - No survey design",
      align = 'c',  
      format = "html")
Estimates of Depressive Disorder by Employment - No survey design
employ factor(depression)0 factor(depression)1 se.factor(depression)0 se.factor(depression)1
Employed Employed 0.8363627 0.1636373 0.0011615 0.0011615
nilf nilf 0.7573914 0.2426086 0.0026597 0.0026597
retired retired 0.8465729 0.1534271 0.0015953 0.0015953
unable unable 0.5095541 0.4904459 0.0049110 0.0049110
Weight2<-svyby(formula = ~depression,
                by = ~marst,
                design = des,
                FUN = svymean,
                na.rm=T)


knitr::kable(Weight2,
      caption = "Survey Estimates of Depressive Disorder by Marital Status",
      align = 'c',  
      format = "html")
Survey Estimates of Depressive Disorder by Marital Status
marst depression se
married married 0.1347635 0.0023107
cohab cohab 0.2374778 0.0093567
divorced divorced 0.2517851 0.0064294
nm nm 0.2135814 0.0041967
separated separated 0.2670680 0.0135078
widowed widowed 0.1698519 0.0067230
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss20sm)

Weight2<-svyby(formula = ~factor(depression),
                by = ~marst,
                design = nodes,
                FUN = svymean,
                na.rm=T)
knitr::kable(Weight2,
      caption = "Estimates of Depressive Disorder by Marital Status - No survey design",
      align = 'c',  
      format = "html")
Estimates of Depressive Disorder by Marital Status - No survey design
marst factor(depression)0 factor(depression)1 se.factor(depression)0 se.factor(depression)1
married married 0.8536666 0.1463334 0.0011361 0.0011361
cohab cohab 0.7522426 0.2477574 0.0048525 0.0048525
divorced divorced 0.7291466 0.2708534 0.0028765 0.0028765
nm nm 0.7654076 0.2345924 0.0021673 0.0021673
separated separated 0.7091003 0.2908997 0.0073131 0.0073131
widowed widowed 0.8273908 0.1726092 0.0028033 0.0028033

##By Using tableone package:

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "depression"),
                   strata="employ",
                   data = brfss20sm)



#t1<-print(t1, format="p")
print(t1,format="p" )
##                         Stratified by employ
##                          Employed      nilf         retired      unable      
##   n                      101450        25976        51034        10362       
##   depression (mean (SD))   0.16 (0.37)  0.24 (0.43)  0.15 (0.36)  0.49 (0.50)
##                         Stratified by employ
##                          p      test
##   n                                 
##   depression (mean (SD)) <0.001
#, strata = "depression", test = T,
#using survey design
st1<-svyCreateTableOne(vars = c("depression"),
                       strata = "employ",
                       test = T,
                       data = des)

print(st1,
      format="p")
##                         Stratified by employ
##                          Employed           nilf              
##   n                      83396000.23        27089184.05       
##   depression (mean (SD))        0.15 (0.36)        0.22 (0.41)
##                         Stratified by employ
##                          retired            unable            p      test
##   n                      26257816.55        8203329.94                   
##   depression (mean (SD))        0.15 (0.35)       0.44 (0.50) <0.001
library(tableone)
#not using survey design
t2<-CreateTableOne(vars = c( "depression"),
                   strata="marst",
                   data = brfss20sm)



#t2<-print(t2, format="p")
print(t2,format="p" )
##                         Stratified by marst
##                          married      cohab       divorced     nm          
##   n                      96779        7915        23869        38228       
##   depression (mean (SD))  0.15 (0.35) 0.25 (0.43)  0.27 (0.44)  0.23 (0.42)
##                         Stratified by marst
##                          separated   widowed      p      test
##   n                      3857        18174                   
##   depression (mean (SD)) 0.29 (0.45)  0.17 (0.38) <0.001
#, strata = "depression", test = T,
#using survey design
st2<-svyCreateTableOne(vars = c("depression"),
                       strata = "marst",
                       test = T,
                       data = des)

print(st2,
      format="p")
##                         Stratified by marst
##                          married            cohab            
##   n                      71971669.67        7424479.64       
##   depression (mean (SD))        0.13 (0.34)       0.24 (0.43)
##                         Stratified by marst
##                          divorced           nm                
##   n                      14908990.05        37729721.34       
##   depression (mean (SD))        0.25 (0.43)        0.21 (0.41)
##                         Stratified by marst
##                          separated         widowed           p      test
##   n                      3622958.24        9288511.82                   
##   depression (mean (SD))       0.27 (0.44)       0.17 (0.38) <0.001

3)Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

**Yes, there are substantive differences.Comparing standard errors, SE of estimates is higher in the survey design analysis than they are in the random sampling because samples are nested as I have dependency of observations and lower variance. If we don’t use survey design, the estimates of all variances will be too small. It will help us to know either it is statistically significant or not.