library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")
names(brfss20sm) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20sm)))
My binary outcome variable is Depressive disorder (ADDEPEV3).
brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
What are the effects of employment and marital status in depressive disorder?
Here 2 predictor variables are: Employment (employ1) and Marital Status (marital).
#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ, ref='Employed')
#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst, ref='married')
brfss20sm<-brfss20sm%>%
filter(is.na(marst)==F,
is.na(employ)==F,
is.na(depression)==F)
#Raw frequencies of depression by employment
table(brfss20sm$depression, brfss20sm$employ)
##
## Employed nilf retired unable
## 0 84849 19674 43204 5280
## 1 16601 6302 7830 5082
#column percentages
100*prop.table(table(brfss20sm$depression, brfss20sm$employ), margin=2)
##
## Employed nilf retired unable
## 0 83.63627 75.73914 84.65729 50.95541
## 1 16.36373 24.26086 15.34271 49.04459
Here we can see higher proportion of people who are unable to work have depressive disorder compared to who are retired or employed.
#Raw frequencies of depression by marital status
table(brfss20sm$depression, brfss20sm$marst)
##
## married cohab divorced nm separated widowed
## 0 82617 5954 17404 29260 2735 15037
## 1 14162 1961 6465 8968 1122 3137
#column percentages
100*prop.table(table(brfss20sm$depression, brfss20sm$marst), margin=2)
##
## married cohab divorced nm separated widowed
## 0 85.36666 75.22426 72.91466 76.54076 70.91003 82.73908
## 1 14.63334 24.77574 27.08534 23.45924 29.08997 17.26092
Here we observe a relative higher proportion of people who are seperated or divorced have depressive disorder rather than who are in married status.
#Bivariate tests of independence:
#basic chi square test of independence
chisq.test(table(brfss20sm$depression, brfss20sm$employ))
##
## Pearson's Chi-squared test
##
## data: table(brfss20sm$depression, brfss20sm$employ)
## X-squared = 7456.1, df = 3, p-value < 2.2e-16
So the \(\chi^2\) statistic is large and the p - value is less than 0.05, suggesting that there is an association between depressive disorder and employment.
#basic chi square test of independence
chisq.test(table(brfss20sm$depression, brfss20sm$marst))
##
## Pearson's Chi-squared test
##
## data: table(brfss20sm$depression, brfss20sm$marst)
## X-squared = 3173.3, df = 5, p-value < 2.2e-16
So the \(\chi^2\) statistic is large and the p - value is less than 0.05, suggesting that there is an association between the depressive disorder and marital status.
Creating a survey design object:
library(srvyr)
##
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
##
## filter
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss20sm )
des
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss20sm)
Now , we re-do the analysis from above using only weights
Weight1<-wtd.table(brfss20sm$depression,
brfss20sm$employ,
weights = brfss20sm$mmsawt)
#proportions
prop.table(
Weight1,
margin=2)
## Employed nilf retired unable
## 0 0.8493814 0.7842569 0.8549209 0.5605408
## 1 0.1506186 0.2157431 0.1450791 0.4394592
#compare that with the original, unweighted proportions
prop.table(table(brfss20sm$depression,
brfss20sm$employ),
margin=2)
##
## Employed nilf retired unable
## 0 0.8363627 0.7573914 0.8465729 0.5095541
## 1 0.1636373 0.2426086 0.1534271 0.4904459
Here, the prevalence of depressive disorder is higher in the sample than the population.
Weight2<-wtd.table(brfss20sm$depression,
brfss20sm$marst,
weights = brfss20sm$mmsawt)
#proportions
prop.table(
Weight2,
margin=2)
## married cohab divorced nm separated widowed
## 0 0.8652365 0.7625222 0.7482149 0.7864186 0.7329320 0.8301481
## 1 0.1347635 0.2374778 0.2517851 0.2135814 0.2670680 0.1698519
#compare that with the original, unweighted proportions
prop.table(table(brfss20sm$depression,
brfss20sm$marst),
margin=2)
##
## married cohab divorced nm separated widowed
## 0 0.8536666 0.7522426 0.7291466 0.7654076 0.7091003 0.8273908
## 1 0.1463334 0.2477574 0.2708534 0.2345924 0.2908997 0.1726092
#Now consider the full sample design + weights
Weight1<-svyby(formula = ~depression,
by=~employ,
design = des,
FUN=svymean)
svychisq(~depression+employ, design = des)
Pearson's X^2: Rao & Scott adjustment
data: svychisq(~depression + employ, design = des) F = 431.42, ndf = 2.988, ddf = 560673.041, p-value < 2.2e-16
knitr::kable(Weight1,
caption = "Survey Estimates of Depressive Disorder by Employment",
align = 'c',
format = "html")
| employ | depression | se | |
|---|---|---|---|
| Employed | Employed | 0.1506186 | 0.0022782 |
| nilf | nilf | 0.2157431 | 0.0049397 |
| retired | retired | 0.1450791 | 0.0039544 |
| unable | unable | 0.4394592 | 0.0107443 |
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1, weights = ~1, data = brfss20sm)
Weight1<-svyby(formula = ~factor(depression),
by = ~employ,
design = nodes,
FUN = svymean,
na.rm=T)
knitr::kable(Weight1,
caption = "Estimates of Depressive Disorder by Employment - No survey design",
align = 'c',
format = "html")
| employ | factor(depression)0 | factor(depression)1 | se.factor(depression)0 | se.factor(depression)1 | |
|---|---|---|---|---|---|
| Employed | Employed | 0.8363627 | 0.1636373 | 0.0011615 | 0.0011615 |
| nilf | nilf | 0.7573914 | 0.2426086 | 0.0026597 | 0.0026597 |
| retired | retired | 0.8465729 | 0.1534271 | 0.0015953 | 0.0015953 |
| unable | unable | 0.5095541 | 0.4904459 | 0.0049110 | 0.0049110 |
Weight2<-svyby(formula = ~depression,
by = ~marst,
design = des,
FUN = svymean,
na.rm=T)
knitr::kable(Weight2,
caption = "Survey Estimates of Depressive Disorder by Marital Status",
align = 'c',
format = "html")
| marst | depression | se | |
|---|---|---|---|
| married | married | 0.1347635 | 0.0023107 |
| cohab | cohab | 0.2374778 | 0.0093567 |
| divorced | divorced | 0.2517851 | 0.0064294 |
| nm | nm | 0.2135814 | 0.0041967 |
| separated | separated | 0.2670680 | 0.0135078 |
| widowed | widowed | 0.1698519 | 0.0067230 |
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1, weights = ~1, data = brfss20sm)
Weight2<-svyby(formula = ~factor(depression),
by = ~marst,
design = nodes,
FUN = svymean,
na.rm=T)
knitr::kable(Weight2,
caption = "Estimates of Depressive Disorder by Marital Status - No survey design",
align = 'c',
format = "html")
| marst | factor(depression)0 | factor(depression)1 | se.factor(depression)0 | se.factor(depression)1 | |
|---|---|---|---|---|---|
| married | married | 0.8536666 | 0.1463334 | 0.0011361 | 0.0011361 |
| cohab | cohab | 0.7522426 | 0.2477574 | 0.0048525 | 0.0048525 |
| divorced | divorced | 0.7291466 | 0.2708534 | 0.0028765 | 0.0028765 |
| nm | nm | 0.7654076 | 0.2345924 | 0.0021673 | 0.0021673 |
| separated | separated | 0.7091003 | 0.2908997 | 0.0073131 | 0.0073131 |
| widowed | widowed | 0.8273908 | 0.1726092 | 0.0028033 | 0.0028033 |
##By Using tableone package:
library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "depression"),
strata="employ",
data = brfss20sm)
#t1<-print(t1, format="p")
print(t1,format="p" )
## Stratified by employ
## Employed nilf retired unable
## n 101450 25976 51034 10362
## depression (mean (SD)) 0.16 (0.37) 0.24 (0.43) 0.15 (0.36) 0.49 (0.50)
## Stratified by employ
## p test
## n
## depression (mean (SD)) <0.001
#, strata = "depression", test = T,
#using survey design
st1<-svyCreateTableOne(vars = c("depression"),
strata = "employ",
test = T,
data = des)
print(st1,
format="p")
## Stratified by employ
## Employed nilf
## n 83396000.23 27089184.05
## depression (mean (SD)) 0.15 (0.36) 0.22 (0.41)
## Stratified by employ
## retired unable p test
## n 26257816.55 8203329.94
## depression (mean (SD)) 0.15 (0.35) 0.44 (0.50) <0.001
library(tableone)
#not using survey design
t2<-CreateTableOne(vars = c( "depression"),
strata="marst",
data = brfss20sm)
#t2<-print(t2, format="p")
print(t2,format="p" )
## Stratified by marst
## married cohab divorced nm
## n 96779 7915 23869 38228
## depression (mean (SD)) 0.15 (0.35) 0.25 (0.43) 0.27 (0.44) 0.23 (0.42)
## Stratified by marst
## separated widowed p test
## n 3857 18174
## depression (mean (SD)) 0.29 (0.45) 0.17 (0.38) <0.001
#, strata = "depression", test = T,
#using survey design
st2<-svyCreateTableOne(vars = c("depression"),
strata = "marst",
test = T,
data = des)
print(st2,
format="p")
## Stratified by marst
## married cohab
## n 71971669.67 7424479.64
## depression (mean (SD)) 0.13 (0.34) 0.24 (0.43)
## Stratified by marst
## divorced nm
## n 14908990.05 37729721.34
## depression (mean (SD)) 0.25 (0.43) 0.21 (0.41)
## Stratified by marst
## separated widowed p test
## n 3622958.24 9288511.82
## depression (mean (SD)) 0.27 (0.44) 0.17 (0.38) <0.001
**Yes, there are substantive differences.Comparing standard errors, SE of estimates is higher in the survey design analysis than they are in the random sampling because samples are nested as I have dependency of observations and lower variance. If we don’t use survey design, the estimates of all variances will be too small. It will help us to know either it is statistically significant or not.