library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(srvyr)
##
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
##
## filter
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v readr 2.1.1
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x srvyr::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")
names(brfss20sm) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20sm)))
#My binary outcome variable is Depressive disorder (ADDEPEV3).
brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
What are the effects of employment and marital status in depressive disorder? Here 2 predictor variables are: Employment (employ1) and Marital Status (marital).I am interested in answering the following questions: a. Does depression vary with employment status? b. Does depression vary across different marital status? c. Does young adult suffer from poorer mental health than aged people?
#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ, ref='Employed')
#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst, ref='married')
#Age cut into intervals
brfss20sm$agec <- cut(brfss20sm$age80,
breaks = c(0,24,39,59,79,99))
brfss20sm<-brfss20sm%>%
filter(is.na(marst)==F,
is.na(employ)==F,
is.na(depression)==F)
First, we will do some descriptive analysis, such as means and cross tabulations.
sub<-brfss20sm %>%
select(depression,employ, marst,agec, mmsawt, ststr) %>%
filter( complete.cases( . ))
#cat<-sample(1:nrow(sub), size = 1000, replace = FALSE)
#sub<-sub[cat, ]
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
strata= ~ststr,
weights= ~mmsawt,
data = sub )
First, we examine the % of US adults with depression by employment, and do a survey-corrected chi-square test for independence.
cat<-svyby(formula = ~depression,
by = ~employ,
design = des,
FUN = svymean,
na.rm=T)
svychisq(~depression+employ,
design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~depression + employ, design = des)
## F = 431.42, ndf = 2.988, ddf = 560673.041, p-value < 2.2e-16
cat%>%
ggplot()+
geom_point(aes(x=employ,y=depression))+
geom_errorbar(aes(x=employ, ymin = depression-1.96*se,
ymax= depression+1.96*se),
width=.25)+
labs(title = "Percent % of US Adults with Depression",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Employment",
y = "% Depressive Disorder")+
theme_minimal()
Here, we notice that who are unable to work have more chance to have depressive symptoms rather than who are recently employed and who were previously employed (retired).
dog<-svyby(formula = ~depression,
by = ~marst,
design = des,
FUN = svymean,
na.rm=T)
svychisq(~depression+marst,
design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~depression + marst, design = des)
## F = 128.28, ndf = 4.9706e+00, ddf = 9.3268e+05, p-value < 2.2e-16
dog%>%
ggplot()+
geom_point(aes(x=marst,y=depression))+
geom_errorbar(aes(x=marst, ymin = depression-1.96*se,
ymax= depression+1.96*se),
width=.25)+
labs(title = "Percent % of US with Depression by Marital Status",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Marital Status",
y = "% Depressive Disorder ")+
theme_minimal()
Here, we find that who are divorced and separated have more chance to have depressive disorder than who are married or widowed.
catdog<-svyby(formula = ~depression,
by = ~marst+employ,
design = des,
FUN = svymean,
na.rm=T)
#this plot is a little more complicated, but facet_wrap() plots separate plots for groups
catdog%>%
ggplot()+
#geom_point(aes(x=employ, y = depression, color=marst, group=marst), position="dodge")+
geom_errorbar(aes(x=employ,y = depression,
ymin = depression-1.96*se,
ymax= depression+1.96*se,
color=marst,
group=marst),
width=.25,
position="dodge")+
#facet_wrap(~ marst, nrow = 3)+
labs(title = "Percent % of US with Depression by Marital Status and Employment Status",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Employment",
y = "% Depression")+
theme_minimal()
The plot above shows the percentage of depression in US across different employment and marital status. Separated/cohabited people who are unable to work have more chance to have depressive disorder.
To fit the model to our survey data, we use svyglm(), and specify our model equation and name of our survey design object. Since we are using a logistic regression model, specify family = binomial. The default link function is the logit:
#Logit model
fit.logit<-svyglm(depression ~ marst + employ+agec,
design = des,
family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = depression ~ marst + employ + agec, design = des,
## family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.85800 0.05963 -31.158 < 2e-16 ***
## marstcohab 0.54421 0.05952 9.143 < 2e-16 ***
## marstdivorced 0.69568 0.04152 16.756 < 2e-16 ***
## marstnm 0.33305 0.03887 8.568 < 2e-16 ***
## marstseparated 0.65445 0.07083 9.239 < 2e-16 ***
## marstwidowed 0.42639 0.05880 7.251 4.16e-13 ***
## employnilf 0.38525 0.03744 10.291 < 2e-16 ***
## employretired 0.38578 0.04609 8.369 < 2e-16 ***
## employunable 1.51039 0.05028 30.037 < 2e-16 ***
## agec(24,39] 0.05384 0.05416 0.994 0.32014
## agec(39,59] -0.17532 0.05764 -3.041 0.00236 **
## agec(59,79] -0.44613 0.06477 -6.888 5.68e-12 ***
## agec(79,99] -1.17935 0.10805 -10.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9973553)
##
## Number of Fisher Scoring iterations: 4
the tbl_regression function in gtsummary is a good way to make a decent looking table easily and to exponentiate the regression effects to form odds ratios and their confidence intervals.
library(gtsummary)
fit.logit%>%
tbl_regression(exponentiate=TRUE )
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| marst | |||
| married | — | — | |
| cohab | 1.72 | 1.53, 1.94 | <0.001 |
| divorced | 2.01 | 1.85, 2.18 | <0.001 |
| nm | 1.40 | 1.29, 1.51 | <0.001 |
| separated | 1.92 | 1.67, 2.21 | <0.001 |
| widowed | 1.53 | 1.36, 1.72 | <0.001 |
| employ | |||
| Employed | — | — | |
| nilf | 1.47 | 1.37, 1.58 | <0.001 |
| retired | 1.47 | 1.34, 1.61 | <0.001 |
| unable | 4.53 | 4.10, 5.00 | <0.001 |
| agec | |||
| (0,24] | — | — | |
| (24,39] | 1.06 | 0.95, 1.17 | 0.3 |
| (39,59] | 0.84 | 0.75, 0.94 | 0.002 |
| (59,79] | 0.64 | 0.56, 0.73 | <0.001 |
| (79,99] | 0.31 | 0.25, 0.38 | <0.001 |
|
1
OR = Odds Ratio, CI = Confidence Interval
|
|||
Here, those who are divorced and separated are more likely to suffer from depression. And the group of people who are unable to work are more likely to have depression. And in case of age gradient, the older people become less chance of depression.
A sligtly more digestible form can be obtained from the sjPlot library. In this plot, if the error bars overlap 1, the effects are not statistically significant.
library(sjPlot)
## #refugeeswelcome
plot_model(fit.logit,
axis.lim = c(.1, 10),
title = "Odds ratios for Depression")
Here, the error bars overlap 1 for only age(25-39), meaning the effects are statistically significant.
#get a series of predicted probabilites for different "types" of people for each model
#ref_grid will generate all possible combinations of predictors from a model
library(emmeans)
rg<-ref_grid(fit.logit)
marg_logit<-emmeans(object = rg,
specs = c( "marst", "employ"),
type="response" )
knitr::kable(marg_logit, digits = 4)
| marst | employ | prob | SE | df | asymp.LCL | asymp.UCL |
|---|---|---|---|---|---|---|
| married | Employed | 0.0991 | 0.0028 | Inf | 0.0937 | 0.1048 |
| cohab | Employed | 0.1593 | 0.0080 | Inf | 0.1442 | 0.1757 |
| divorced | Employed | 0.1807 | 0.0066 | Inf | 0.1680 | 0.1941 |
| nm | Employed | 0.1330 | 0.0044 | Inf | 0.1247 | 0.1419 |
| separated | Employed | 0.1747 | 0.0105 | Inf | 0.1551 | 0.1962 |
| widowed | Employed | 0.1442 | 0.0070 | Inf | 0.1310 | 0.1584 |
| married | nilf | 0.1392 | 0.0049 | Inf | 0.1299 | 0.1491 |
| cohab | nilf | 0.2179 | 0.0110 | Inf | 0.1972 | 0.2401 |
| divorced | nilf | 0.2448 | 0.0093 | Inf | 0.2270 | 0.2635 |
| nm | nilf | 0.1841 | 0.0064 | Inf | 0.1718 | 0.1969 |
| separated | nilf | 0.2373 | 0.0136 | Inf | 0.2116 | 0.2650 |
| widowed | nilf | 0.1985 | 0.0097 | Inf | 0.1801 | 0.2182 |
| married | retired | 0.1392 | 0.0051 | Inf | 0.1295 | 0.1496 |
| cohab | retired | 0.2180 | 0.0114 | Inf | 0.1965 | 0.2412 |
| divorced | retired | 0.2449 | 0.0107 | Inf | 0.2245 | 0.2666 |
| nm | retired | 0.1841 | 0.0072 | Inf | 0.1705 | 0.1986 |
| separated | retired | 0.2374 | 0.0142 | Inf | 0.2106 | 0.2664 |
| widowed | retired | 0.1986 | 0.0094 | Inf | 0.1807 | 0.2177 |
| married | unable | 0.3325 | 0.0123 | Inf | 0.3089 | 0.3569 |
| cohab | unable | 0.4619 | 0.0183 | Inf | 0.4262 | 0.4979 |
| divorced | unable | 0.4997 | 0.0148 | Inf | 0.4706 | 0.5287 |
| nm | unable | 0.4100 | 0.0130 | Inf | 0.3847 | 0.4358 |
| separated | unable | 0.4894 | 0.0202 | Inf | 0.4500 | 0.5289 |
| widowed | unable | 0.4328 | 0.0167 | Inf | 0.4004 | 0.4657 |
#I can find the probability for a Married person with retirement, compared to a Divorced person with retirement:
comps<-as.data.frame(marg_logit)
comps[comps$marst=="married" & comps$employ == "unable" , ]
comps[comps$marst=="divorced" & comps$employ == "unable" , ]
So,being divorced and unabled persons have higher probability of depression than being still married and unable to work individuals.
fit.logit1 <- svyglm (depression~ employ, design=des,family= binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit2 <- svyglm (depression~ employ+marst, design=des,family= binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit3 <- svyglm (depression~ employ+marst+agec, design=des,family= binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
##
## Call:
## svyglm(formula = depression ~ employ, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.72976 0.01781 -97.136 <2e-16 ***
## employnilf 0.43911 0.03424 12.824 <2e-16 ***
## employretired -0.04397 0.03653 -1.204 0.229
## employunable 1.48640 0.04715 31.523 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000005)
##
## Number of Fisher Scoring iterations: 4
In model 1, we see that not in labor force and unable to work people have a higher odds of depression compared to employed people.
summary(fit.logit2)
##
## Call:
## svyglm(formula = depression ~ employ + marst, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.00158 0.02237 -89.457 < 2e-16 ***
## employnilf 0.38645 0.03570 10.825 < 2e-16 ***
## employretired 0.02580 0.03821 0.675 0.49952
## employunable 1.40087 0.04903 28.571 < 2e-16 ***
## marstcohab 0.65618 0.05818 11.278 < 2e-16 ***
## marstdivorced 0.66317 0.04125 16.075 < 2e-16 ***
## marstnm 0.45797 0.03415 13.409 < 2e-16 ***
## marstseparated 0.69110 0.07073 9.771 < 2e-16 ***
## marstwidowed 0.19339 0.05493 3.521 0.00043 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9973641)
##
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit2, test.terms = ~ employ, method = "Wald", df=NULL)
## Wald test for employ
## in svyglm(formula = depression ~ employ + marst, design = des, family = binomial)
## F = 292.0884 on 3 and 187632 df: p= < 2.22e-16
#After controlling for marital status,
summary(fit.logit3)
##
## Call:
## svyglm(formula = depression ~ employ + marst + agec, design = des,
## family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.85800 0.05963 -31.158 < 2e-16 ***
## employnilf 0.38525 0.03744 10.291 < 2e-16 ***
## employretired 0.38578 0.04609 8.369 < 2e-16 ***
## employunable 1.51039 0.05028 30.037 < 2e-16 ***
## marstcohab 0.54421 0.05952 9.143 < 2e-16 ***
## marstdivorced 0.69568 0.04152 16.756 < 2e-16 ***
## marstnm 0.33305 0.03887 8.568 < 2e-16 ***
## marstseparated 0.65445 0.07083 9.239 < 2e-16 ***
## marstwidowed 0.42639 0.05880 7.251 4.16e-13 ***
## agec(24,39] 0.05384 0.05416 0.994 0.32014
## agec(39,59] -0.17532 0.05764 -3.041 0.00236 **
## agec(59,79] -0.44613 0.06477 -6.888 5.68e-12 ***
## agec(79,99] -1.17935 0.10805 -10.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9973553)
##
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit3, test.terms = ~ employ, method = "Wald", df=NULL)
## Wald test for employ
## in svyglm(formula = depression ~ employ + marst + agec, design = des,
## family = binomial)
## F = 309.2137 on 3 and 187628 df: p= < 2.22e-16
As the F-value is higher, Model 3 is better explaining the data than others.
library(gtsummary)
f1 <- fit.logit1 %>%
tbl_regression(exponentiate = T)
f2 <- fit.logit2 %>%
tbl_regression(exponentiate = T)
f3 <- fit.logit3 %>%
tbl_regression(exponentiate = T)
f_all <- tbl_merge(tbls = list(f1,f2,f3),
tab_spanner = c("**Model 1**", "**Model 2**", "**Model 3**"))
f_all
| Characteristic | Model 1 | Model 2 | Model 3 | ||||||
|---|---|---|---|---|---|---|---|---|---|
| OR1 | 95% CI1 | p-value | OR1 | 95% CI1 | p-value | OR1 | 95% CI1 | p-value | |
| employ | |||||||||
| Employed | — | — | — | — | — | — | |||
| nilf | 1.55 | 1.45, 1.66 | <0.001 | 1.47 | 1.37, 1.58 | <0.001 | 1.47 | 1.37, 1.58 | <0.001 |
| retired | 0.96 | 0.89, 1.03 | 0.2 | 1.03 | 0.95, 1.11 | 0.5 | 1.47 | 1.34, 1.61 | <0.001 |
| unable | 4.42 | 4.03, 4.85 | <0.001 | 4.06 | 3.69, 4.47 | <0.001 | 4.53 | 4.10, 5.00 | <0.001 |
| marst | |||||||||
| married | — | — | — | — | |||||
| cohab | 1.93 | 1.72, 2.16 | <0.001 | 1.72 | 1.53, 1.94 | <0.001 | |||
| divorced | 1.94 | 1.79, 2.10 | <0.001 | 2.01 | 1.85, 2.18 | <0.001 | |||
| nm | 1.58 | 1.48, 1.69 | <0.001 | 1.40 | 1.29, 1.51 | <0.001 | |||
| separated | 2.00 | 1.74, 2.29 | <0.001 | 1.92 | 1.67, 2.21 | <0.001 | |||
| widowed | 1.21 | 1.09, 1.35 | <0.001 | 1.53 | 1.36, 1.72 | <0.001 | |||
| agec | |||||||||
| (0,24] | — | — | |||||||
| (24,39] | 1.06 | 0.95, 1.17 | 0.3 | ||||||
| (39,59] | 0.84 | 0.75, 0.94 | 0.002 | ||||||
| (59,79] | 0.64 | 0.56, 0.73 | <0.001 | ||||||
| (79,99] | 0.31 | 0.25, 0.38 | <0.001 | ||||||
|
1
OR = Odds Ratio, CI = Confidence Interval
|
|||||||||
AIC (fit.logit1, fit.logit2, fit.logit3)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## eff.p AIC deltabar
## [1,] 14.23903 171897.5 4.746345
## [2,] 36.82549 169995.8 4.603187
## [3,] 55.97528 169123.8 4.664607
We see here the third model has an AIC of 169123.8, which is lower than the other two models. So, it is better explaining the data.