Stats II Homework 5

Author

Jacob M. Souch

Initialize

setwd("C:/Users/jacob/OneDrive - University of Texas at San Antonio/Courses/Stats for Demographic Data 2/Homework 4")
library(purrr)
library(haven)
library(janitor)

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:purrr':

    some
library(usmap)
library(ggthemes)
library(tigris)
To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
──
✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
✔ tibble  3.1.8      ✔ stringr 1.5.0 
✔ tidyr   1.3.0      ✔ forcats 0.5.2 
✔ readr   2.1.3      
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::recode() masks car::recode()
✖ car::some()     masks purrr::some()
library(tidycensus)
library(lme4)
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack
library(haven)
library(foreign)
library(FactoMineR)
library(ggrepel)
library(margins)
library(emmeans)
library(stargazer)

Please cite as: 

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(stringr)
library(gtsummary)
#BlackLivesMatter

Data Import

brfss21 <- haven::read_xpt("LLCP2021.XPT_")

attr(brfss21, which = "labels")
NULL
names(brfss21) <- str_replace(names(brfss21),"_","")

Explore

tabyl(brfss21$SEXVAR) %>% gt::gt()
brfss21$SEXVAR n percent
1 203810 0.4645846
2 234883 0.5354154
tabyl(brfss21$EDUCAG) %>% gt::gt()
brfss21$EDUCAG n percent
1 25991 0.059246443
2 111545 0.254266651
3 120102 0.273772319
4 178577 0.407065989
9 2478 0.005648597
tabyl(brfss21$RFSMOK3) %>% gt::gt()
brfss21$RFSMOK3 n percent
1 359891 0.82037097
2 53832 0.12270996
9 24970 0.05691908
tabyl(brfss21$CHCSCNCR) %>% gt::gt()
brfss21$CHCSCNCR n percent valid_percent
1 41112 9.371474e-02 0.0937151663
2 396140 9.030005e-01 0.9030046206
7 1151 2.623703e-03 0.0026237146
9 288 6.564955e-04 0.0006564985
NA 2 4.558997e-06 NA

Codebook

Skin Cancer (IV)

Educational Attainment (DV)

Smoking Behavior (DV)

Lifetime Depression Diagnosis (IV)

Cleaning

#Remove invalid values. Creates new dataframe.

brfss21.c <- brfss21 %>% filter(EDUCAG %in% c(1:4)) 


brfss21.c <-filter(brfss21.c, AGEG5YR != 14)

brfss21.c <- filter(brfss21.c, !CHCSCNCR %in% c(7:9))

brfss21.c <- filter(brfss21.c, RFSMOK3 !=9)

brfss21.c <- filter(brfss21.c, !ADDEPEV3 %in% c(7:9))
#Data check.

tabyl(brfss21.c$SEXVAR)   %>% gt::gt()
brfss21.c$SEXVAR n percent
1 186342 0.4638315
2 215403 0.5361685
tabyl(brfss21.c$EDUCAG)   %>% gt::gt()
brfss21.c$EDUCAG n percent
1 23654 0.05887814
2 102092 0.25412140
3 111065 0.27645646
4 164934 0.41054400
tabyl(brfss21.c$RFSMOK3)  %>% gt::gt()
brfss21.c$RFSMOK3 n percent
1 349171 0.8691359
2 52574 0.1308641
tabyl(brfss21.c$CHCSCNCR) %>% gt::gt()
brfss21.c$CHCSCNCR n percent
1 38426 0.09564774
2 363319 0.90435226
[1] "There were 401745 observations found."

Recode

brfss21.c$SEXVAR.female  <- car::recode(brfss21.c$SEXVAR, recodes = "1 = 0;2=1") 

brfss21.c$CHCSCNCR.skincancer<- car::recode(brfss21.c$CHCSCNCR, recodes = "1=1;2=0; else=NA")

brfss21.c$RFSMOK3.smoke <- car::recode(brfss21.c$RFSMOK3, recodes=  "1=0;2=1; else=NA")

brfss21.c$EDUCAG.graduate <- car::recode(brfss21.c$EDUCAG, recodes=  "1:3 = 0; 4=1")

brfss21.c$ADDEPEV3.depressed <-car::recode(brfss21.c$ADDEPEV3, recodes = "1=1; 2=0")
tabyl(brfss21.c$SEXVAR.female)   %>% gt::gt()
brfss21.c$SEXVAR.female n percent
0 186342 0.4638315
1 215403 0.5361685
tabyl(brfss21.c$EDUCAG.graduate)   %>% gt::gt()
brfss21.c$EDUCAG.graduate n percent
0 236811 0.589456
1 164934 0.410544
tabyl(brfss21.c$RFSMOK3.smoke)  %>% gt::gt()
brfss21.c$RFSMOK3.smoke n percent
0 349171 0.8691359
1 52574 0.1308641
tabyl(brfss21.c$CHCSCNCR.skincancer) %>% gt::gt()
brfss21.c$CHCSCNCR.skincancer n percent
0 363319 0.90435226
1 38426 0.09564774
mean(brfss21.c$AGE80)
[1] 54.74752
mean(brfss21.c$SEXVAR.female)
[1] 0.5361685
mean(brfss21.c$CHCSCNCR.skincancer)
[1] 0.09564774
mean(brfss21.c$EDUCAG.graduate)
[1] 0.410544
mean(brfss21.c$ADDEPEV3.depressed)
[1] 0.2006074

Specifications of Age

brfss21.c$AGE80SQ<- brfss21.c$AGE80**2


brfss21.c$AGE80CU<- brfss21.c$AGE80**3

Lifetime Depression

ageprobs<- brfss21.c %>%
    group_by(AGE80) %>%
    summarize(p=mean(ADDEPEV3.depressed),n=n())
              
ageprobs$num <- ageprobs$p*(1-ageprobs$p)

ageprobs$sep <- sqrt(ageprobs$num/ageprobs$n)

ageprobs$me <- 2*ageprobs$sep
  
ggplot(data =ageprobs, aes(x = AGE80, y = p, ymin=p-me, ymax=p+me)) +
     geom_line() + ylim(0,1) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Lifetime Depression by Age", 
       subtitle="Respondents in Sample", 
       caption="Source: 2021 BRFSS Data") +
       xlab(label="Age at Survey") +
  ylab(label="Proportion of Respondents with Lifetime Depression Diagnosis") 

model0 <- glm(ADDEPEV3.depressed ~ AGE80, family = "binomial", data = brfss21.c)
summary(model0)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80, family = "binomial", 
    data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8295  -0.7008  -0.6224  -0.5657   1.9552  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.6399188  0.0121749  -52.56   <2e-16 ***
AGE80       -0.0138933  0.0002207  -62.95   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 398783  on 401743  degrees of freedom
AIC: 398787

Number of Fisher Scoring iterations: 4
model0$coefficients = exp(model0$coefficients)


summary(model0)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80, family = "binomial", 
    data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8295  -0.7008  -0.6224  -0.5657   1.9552  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) 0.5273353  0.0121749   43.31   <2e-16 ***
AGE80       0.9862028  0.0002207 4468.11   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 398783  on 401743  degrees of freedom
AIC: 398787

Number of Fisher Scoring iterations: 4
Note

Interpretation:

There is a 1.5% reduction in the odds of reporting depression for every additional year of life, holding all else constant.

Marginal Effects at Representative Values

data <- brfss21.c %>% select(AGE80,ADDEPEV3.depressed)


model0 <- glm(ADDEPEV3.depressed ~ AGE80, family = "binomial", data = data)
summary(model0)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80, family = "binomial", 
    data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8295  -0.7008  -0.6224  -0.5657   1.9552  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.6399188  0.0121749  -52.56   <2e-16 ***
AGE80       -0.0138933  0.0002207  -62.95   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 398783  on 401743  degrees of freedom
AIC: 398787

Number of Fisher Scoring iterations: 4
summary(model0)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80, family = "binomial", 
    data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8295  -0.7008  -0.6224  -0.5657   1.9552  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.6399188  0.0121749  -52.56   <2e-16 ***
AGE80       -0.0138933  0.0002207  -62.95   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 398783  on 401743  degrees of freedom
AIC: 398787

Number of Fisher Scoring iterations: 4
MERs<-margins(model0,variables="AGE80", at=list(AGE80=c(18,20,30,40,50,60,70,80)))

MERS.Depressed <- summary(MERs)

MERS.Depressed
 factor   AGE80     AME     SE        z      p   lower   upper
  AGE80 18.0000 -0.0029 0.0001 -52.3316 0.0000 -0.0030 -0.0028
  AGE80 20.0000 -0.0028 0.0001 -52.6052 0.0000 -0.0029 -0.0027
  AGE80 30.0000 -0.0027 0.0000 -54.4802 0.0000 -0.0028 -0.0026
  AGE80 40.0000 -0.0025 0.0000 -57.2950 0.0000 -0.0026 -0.0024
  AGE80 50.0000 -0.0023 0.0000 -61.2403 0.0000 -0.0024 -0.0022
  AGE80 60.0000 -0.0021 0.0000 -66.6283 0.0000 -0.0022 -0.0020
  AGE80 70.0000 -0.0019 0.0000 -73.9670 0.0000 -0.0020 -0.0019
  AGE80 80.0000 -0.0018 0.0000 -84.0966 0.0000 -0.0018 -0.0017
plot(MERS.Depressed$AGE80,MERS.Depressed$AME)

#Will if not using subset data.
Note

Interpretation

MERs.depressed1YR <-margins(model0,variables="AGE80", at=list(AGE80=c(39,40,41)))

MERs.depressed1YR <- summary(MERs.depressed1YR)

plot(MERs.depressed1YR$AGE80,MERs.depressed1YR$AME)

Note

Interpretation

The marginal effect of aging on lifetime depression diagnosis diminishes across the lifespan.

ageprobs<- brfss21.c %>%
    group_by(AGE80) %>%
    summarize(p=mean(ADDEPEV3.depressed),n=n())
              
ageprobs$num <- ageprobs$p*(1-ageprobs$p)

ageprobs$sep <- sqrt(ageprobs$num/ageprobs$n)

ageprobs$me <- 2*ageprobs$sep
  
ggplot(data =ageprobs, aes(x = AGE80, y = p, ymin=p-me, ymax=p+me)) +
     geom_line() + ylim(0,1) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Lifetime Depression by Age", 
       subtitle="Respondents in Sample", 
       caption="Source: 2021 BRFSS Data") +
       xlab(label="Age at Survey") +
  ylab(label="Proportion of Respondents with Lifetime Depression Diagnosis")+
  theme_bw()

AMEs<-margins(model0,variables=c("AGE80"))

summary(AMEs)
 factor     AME     SE        z      p   lower   upper
  AGE80 -0.0022 0.0000 -63.5499 0.0000 -0.0023 -0.0021
Note

Interpretation:

The average marginal effect of age across the lifespan is 0.22% reduction in odds of reporting depression.

model1 <- glm(ADDEPEV3.depressed ~ AGE80 + AGE80SQ, family = "binomial", data = brfss21.c)
summary(model1)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80 + AGE80SQ, family = "binomial", 
    data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7591  -0.7348  -0.6338  -0.5154   2.0418  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.466e+00  3.321e-02  -44.14   <2e-16 ***
AGE80        2.328e-02  1.399e-03   16.64   <2e-16 ***
AGE80SQ     -3.669e-04  1.363e-05  -26.92   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 398045  on 401742  degrees of freedom
AIC: 398051

Number of Fisher Scoring iterations: 4
model1 <- glm(ADDEPEV3.depressed ~ AGE80 + AGE80SQ+AGE80CU, family = "binomial", data = brfss21.c)
summary(model1)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80 + AGE80SQ + AGE80CU, 
    family = "binomial", data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8032  -0.7222  -0.6537  -0.4861   2.0949  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.269e-01  8.632e-02  -1.471    0.141    
AGE80       -7.274e-02  5.906e-03 -12.315   <2e-16 ***
AGE80SQ      1.697e-03  1.244e-04  13.646   <2e-16 ***
AGE80CU     -1.366e-05  8.195e-07 -16.667   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 397767  on 401741  degrees of freedom
AIC: 397775

Number of Fisher Scoring iterations: 4
Note

Interpretation:

ageprobs<- brfss21.c %>%
    group_by(EDUCAG.graduate, SEXVAR.female,AGE80) %>%
    summarize(pDepressed =mean(ADDEPEV3.depressed),n=n())
`summarise()` has grouped output by 'EDUCAG.graduate', 'SEXVAR.female'. You can
override using the `.groups` argument.
ggplot(data =ageprobs, aes(x = AGE80, y = pDepressed, color =  factor(EDUCAG.graduate), group = EDUCAG.graduate)) + geom_line() + facet_wrap(~ SEXVAR.female) + ylim(0,.5)+
  ggtitle("Lifetime Depression Diagnosis by Sex and College Graduate Status")+
  theme_bw()+
  theme(legend.position="bottom")

Note

Interpretation: College education is protective against depression for both men and women. The probability of reporting depression diminishes across the lifespan for both men and women. Women report depression at higher rates at corresponding ages to that of men. The effect of college education on mental health is greater for women than men.

model <- glm(ADDEPEV3.depressed ~ AGE80 + EDUCAG.graduate + SEXVAR.female, family = "binomial", data = brfss21.c)
summary(model)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80 + EDUCAG.graduate + 
    SEXVAR.female, family = "binomial", data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9986  -0.7126  -0.5969  -0.4619   2.2178  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.8925466  0.0133414  -66.90   <2e-16 ***
AGE80           -0.0154828  0.0002238  -69.19   <2e-16 ***
EDUCAG.graduate -0.2386695  0.0082599  -28.89   <2e-16 ***
SEXVAR.female    0.7349933  0.0083700   87.81   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 389959  on 401741  degrees of freedom
AIC: 389967

Number of Fisher Scoring iterations: 4
modelExp = model
modelExp$coefficients = exp(modelExp$coefficients)
summary(modelExp)

Call:
glm(formula = ADDEPEV3.depressed ~ AGE80 + EDUCAG.graduate + 
    SEXVAR.female, family = "binomial", data = brfss21.c)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9986  -0.7126  -0.5969  -0.4619   2.2178  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.4096113  0.0133414   30.70   <2e-16 ***
AGE80           0.9846364  0.0002238 4400.10   <2e-16 ***
EDUCAG.graduate 0.7876751  0.0082599   95.36   <2e-16 ***
SEXVAR.female   2.0854679  0.0083700  249.16   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402744  on 401744  degrees of freedom
Residual deviance: 389959  on 401741  degrees of freedom
AIC: 389967

Number of Fisher Scoring iterations: 4
Note

Interpretation:

MEMS

emmeans(model, specs = "EDUCAG.graduate",
regrid = "response")
 EDUCAG.graduate  prob       SE  df asymp.LCL asymp.UCL
               0 0.209 0.000826 Inf     0.207     0.210
               1 0.173 0.000917 Inf     0.171     0.174

Results are averaged over the levels of: SEXVAR.female 
Confidence level used: 0.95 
Note

Interpretation: The probability of reporting depression is associated 0.036 pp reduction when evaluated at the means. That is, across all respondents, the effect of being a college graduate is protective and associated with a reduced probability of reporting depression.

for (i in c(20,40,50,60,80))
  
{
  print(paste("At age = ", i))
emmeans(model, specs = "EDUCAG.graduate", by = "SEXVAR.female", at = list(AGE80=i),
regrid = "response") |>
contrast(method = "revpairwise") |>
confint() %>% print()

  
}
[1] "At age =  20"
SEXVAR.female = 0:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0397 0.00136 Inf   -0.0423   -0.0370

SEXVAR.female = 1:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0548 0.00188 Inf   -0.0585   -0.0511

Confidence level used: 0.95 
[1] "At age =  40"
SEXVAR.female = 0:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0327 0.00112 Inf   -0.0349   -0.0305

SEXVAR.female = 1:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0491 0.00168 Inf   -0.0524   -0.0458

Confidence level used: 0.95 
[1] "At age =  50"
SEXVAR.female = 0:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0294 0.00100 Inf   -0.0313   -0.0274

SEXVAR.female = 1:
 contrast                            estimate      SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0458 0.00157 Inf   -0.0489   -0.0427

Confidence level used: 0.95 
[1] "At age =  60"
SEXVAR.female = 0:
 contrast                            estimate       SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0262 0.000897 Inf   -0.0280   -0.0245

SEXVAR.female = 1:
 contrast                            estimate       SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0423 0.001448 Inf   -0.0452   -0.0395

Confidence level used: 0.95 
[1] "At age =  80"
SEXVAR.female = 0:
 contrast                            estimate       SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0206 0.000713 Inf   -0.0220   -0.0192

SEXVAR.female = 1:
 contrast                            estimate       SE  df asymp.LCL asymp.UCL
 EDUCAG.graduate1 - EDUCAG.graduate0  -0.0353 0.001212 Inf   -0.0376   -0.0329

Confidence level used: 0.95 
Note

Interpretation

Holding all else constant, for 40 year old women, the effect of college graduation is -0.0327 pp reduction in the odds of reporting depression.

grad_effect <- predict(model, newdata = data.frame(EDUCAG.graduate=1, SEXVAR.female=brfss21.c$SEXVAR.female,AGE80=brfss21.c$AGE80), 
type = "response")

nongrad_effect <- predict(model, newdata = data.frame(EDUCAG.graduate=0, SEXVAR.female=brfss21.c$SEXVAR.female, AGE80=brfss21.c$AGE80), 
type = "response")

mean(grad_effect - nongrad_effect)
[1] -0.03659929
Note

Interpretation:
The marginal effect graduating college is -0.03659929 pp reduction in the odds of reporting depression.

Excel

Marginal Effects at Representative Values

Note.
Coefficients:
b x bx
(Intercept) -0.89255 1 -0.89255
X_AGE80 -0.01548 40 -0.61931
EDUCAG.graduate -0.23867 0 0
SEXVAR.female 0.734993 1 0.734993
Graduate Nongraduate
Prob 0.265898 0.314996
Hand R studio
Delta -0.0491 -0.0491

Marginal Effects at Mean

Coefficients:
b x bx
(Intercept) -0.89255 1 -0.89255
AGE80 -0.01548 57.74 -0.89398
EDUCAG.graduate -0.23867 0 0
SEXVAr.female 0.734993 0.535316 0.393453
0.77687 -1.39307
0.459854 0.248312
0.314996 0.198918
Graduates Nongraduates
Prob 0.163592 0.198918
Hand R studio
Delta -0.03533 -0.0366
Note.

Note

Interpretation:

Excel produced very similar output to that of R. This shows that the methods between R and Excel are convergent.