library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(srvyr)
##
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
##
## filter
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v readr 2.1.1
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x srvyr::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")
names(brfss20sm) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20sm)))
#My binary outcome variable is Depressive disorder (ADDEPEV3).
brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
What are the effects of employment and marital status in depressive disorder? Here 2 predictor variables are: Employment (employ1) and Marital Status (marital).
#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ, ref='Employed')
#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst, ref='married')
brfss20sm<-brfss20sm%>%
filter(is.na(marst)==F,
is.na(employ)==F,
is.na(depression)==F)
First, we will do some descriptive analysis, such as means and cross tabulations.
sub<-brfss20sm %>%
select(depression,employ, marst, mmsawt, ststr) %>%
filter( complete.cases( . ))
#cat<-sample(1:nrow(sub), size = 1000, replace = FALSE)
#sub<-sub[cat, ]
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
strata= ~ststr,
weights= ~mmsawt,
data = sub )
First, we examine the % of US adults with depression by employment, and do a survey-corrected chi-square test for independence.
cat<-svyby(formula = ~depression,
by = ~employ,
design = des,
FUN = svymean,
na.rm=T)
svychisq(~depression+employ,
design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~depression + employ, design = des)
## F = 431.42, ndf = 2.988, ddf = 560673.041, p-value < 2.2e-16
cat%>%
ggplot()+
geom_point(aes(x=employ,y=depression))+
geom_errorbar(aes(x=employ, ymin = depression-1.96*se,
ymax= depression+1.96*se),
width=.25)+
labs(title = "Percent % of US Adults with Depression",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Employment",
y = "% Depressive Disorder")+
theme_minimal()
dog<-svyby(formula = ~depression,
by = ~marst,
design = des,
FUN = svymean,
na.rm=T)
svychisq(~depression+marst,
design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~depression + marst, design = des)
## F = 128.28, ndf = 4.9706e+00, ddf = 9.3268e+05, p-value < 2.2e-16
dog%>%
ggplot()+
geom_point(aes(x=marst,y=depression))+
geom_errorbar(aes(x=marst, ymin = depression-1.96*se,
ymax= depression+1.96*se),
width=.25)+
labs(title = "Percent % of US with Depression by Marital Status",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Marital Status",
y = "% Depressive Disorder ")+
theme_minimal()
catdog<-svyby(formula = ~depression,
by = ~marst+employ,
design = des,
FUN = svymean,
na.rm=T)
#this plot is a little more complicated, but facet_wrap() plots separate plots for groups
catdog%>%
ggplot()+
#geom_point(aes(x=employ, y = depression, color=marst, group=marst), position="dodge")+
geom_errorbar(aes(x=employ,y = depression,
ymin = depression-1.96*se,
ymax= depression+1.96*se,
color=marst,
group=marst),
width=.25,
position="dodge")+
#facet_wrap(~ marst, nrow = 3)+
labs(title = "Percent % of US with Depression by Marital Status and Employment Status",
caption = "Source: CDC BRFSS - SMART Data, 2020 \n Calculations by Mahmuda",
x = "Employment",
y = "% Depression")+
theme_minimal()
To fit the model to our survey data, we use svyglm(), and specify our model equation and name of our survey design object. Since we are using a logistic regression model, specify family = binomial. The default link function is the logit:
#Logit model
fit.logit<-svyglm(depression ~ marst + employ,
design = des,
family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = depression ~ marst + employ, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.00158 0.02237 -89.457 < 2e-16 ***
## marstcohab 0.65618 0.05818 11.278 < 2e-16 ***
## marstdivorced 0.66317 0.04125 16.075 < 2e-16 ***
## marstnm 0.45797 0.03415 13.409 < 2e-16 ***
## marstseparated 0.69110 0.07073 9.771 < 2e-16 ***
## marstwidowed 0.19339 0.05493 3.521 0.00043 ***
## employnilf 0.38645 0.03570 10.825 < 2e-16 ***
## employretired 0.02580 0.03821 0.675 0.49952
## employunable 1.40087 0.04903 28.571 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9973641)
##
## Number of Fisher Scoring iterations: 4
the tbl_regression function in gtsummary is a good way to make a decent looking table easily and to exponentiate the regression effects to form odds ratios and their confidence intervals.
library(gtsummary)
fit.logit%>%
tbl_regression(exponentiate=TRUE )
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| marst | |||
| married | — | — | |
| cohab | 1.93 | 1.72, 2.16 | <0.001 |
| divorced | 1.94 | 1.79, 2.10 | <0.001 |
| nm | 1.58 | 1.48, 1.69 | <0.001 |
| separated | 2.00 | 1.74, 2.29 | <0.001 |
| widowed | 1.21 | 1.09, 1.35 | <0.001 |
| employ | |||
| Employed | — | — | |
| nilf | 1.47 | 1.37, 1.58 | <0.001 |
| retired | 1.03 | 0.95, 1.11 | 0.5 |
| unable | 4.06 | 3.69, 4.47 | <0.001 |
|
1
OR = Odds Ratio, CI = Confidence Interval
|
|||
A sligtly more digestible form can be obtained from the sjPlot library. In this plot, if the error bars overlap 1, the effects are not statistically significant.
library(sjPlot)
plot_model(fit.logit,
axis.lim = c(.1, 10),
title = "Odds ratios for Depression")
#get a series of predicted probabilites for different "types" of people for each model
#ref_grid will generate all possible combinations of predictors from a model
library(emmeans)
rg<-ref_grid(fit.logit)
marg_logit<-emmeans(object = rg,
specs = c( "marst", "employ"),
type="response" )
knitr::kable(marg_logit, digits = 4)
| marst | employ | prob | SE | df | asymp.LCL | asymp.UCL |
|---|---|---|---|---|---|---|
| married | Employed | 0.1190 | 0.0023 | Inf | 0.1145 | 0.1237 |
| cohab | Employed | 0.2066 | 0.0090 | Inf | 0.1895 | 0.2249 |
| divorced | Employed | 0.2078 | 0.0062 | Inf | 0.1960 | 0.2201 |
| nm | Employed | 0.1760 | 0.0044 | Inf | 0.1676 | 0.1848 |
| separated | Employed | 0.2124 | 0.0116 | Inf | 0.1906 | 0.2359 |
| widowed | Employed | 0.1409 | 0.0067 | Inf | 0.1282 | 0.1546 |
| married | nilf | 0.1659 | 0.0051 | Inf | 0.1562 | 0.1761 |
| cohab | nilf | 0.2771 | 0.0120 | Inf | 0.2542 | 0.3012 |
| divorced | nilf | 0.2785 | 0.0092 | Inf | 0.2607 | 0.2970 |
| nm | nilf | 0.2392 | 0.0061 | Inf | 0.2274 | 0.2514 |
| separated | nilf | 0.2841 | 0.0148 | Inf | 0.2561 | 0.3140 |
| widowed | nilf | 0.1944 | 0.0094 | Inf | 0.1766 | 0.2136 |
| married | retired | 0.1218 | 0.0037 | Inf | 0.1147 | 0.1292 |
| cohab | retired | 0.2109 | 0.0108 | Inf | 0.1905 | 0.2328 |
| divorced | retired | 0.2120 | 0.0084 | Inf | 0.1961 | 0.2289 |
| nm | retired | 0.1798 | 0.0065 | Inf | 0.1674 | 0.1928 |
| separated | retired | 0.2168 | 0.0127 | Inf | 0.1929 | 0.2427 |
| widowed | retired | 0.1440 | 0.0063 | Inf | 0.1321 | 0.1568 |
| married | unable | 0.3542 | 0.0114 | Inf | 0.3321 | 0.3769 |
| cohab | unable | 0.5139 | 0.0179 | Inf | 0.4788 | 0.5488 |
| divorced | unable | 0.5156 | 0.0132 | Inf | 0.4897 | 0.5414 |
| nm | unable | 0.4644 | 0.0124 | Inf | 0.4401 | 0.4888 |
| separated | unable | 0.5226 | 0.0193 | Inf | 0.4846 | 0.5603 |
| widowed | unable | 0.3996 | 0.0157 | Inf | 0.3691 | 0.4308 |
Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question
Answer: I will find the probability for a Married person with retirement, compared to a Divorced person with retirement:
comps<-as.data.frame(marg_logit)
comps[comps$marst=="married" & comps$employ == "retired" , ]
comps[comps$marst=="divorced" & comps$employ == "retired" , ]
So,being divorced and retired persons have higher probability of depression than being still married and retired individuals.