Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

The BRFSS 2020 Dataset

We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.

Research question:

What individual, behavioral, and socioeconomic factors are associated with frequent mental distress among U.S. adults?

brfss_full <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
  clean_names()
brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                  ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic,
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.rds")

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Overview
Metric Value
Observations 5000
Variables 10
FMD Cases 757
FMD Prevalence 15.1%

Descriptive Overview

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
                income_cat, smoker),
    type = list(
      c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
No
N = 4,243
1
Yes
N = 757
1
p-value
Physical unhealthy days 4 (9) 3 (8) 10 (13)
Sleep hours 7.00 (1.48) 7.09 (1.40) 6.51 (1.83)
Age (years) 56 (16) 57 (16) 50 (16)
Sex



    Male 2,701 (54%) 2,378 (56%) 323 (43%)
    Female 2,299 (46%) 1,865 (44%) 434 (57%)
BMI 28.5 (6.3) 28.4 (6.2) 29.3 (7.0)
Exercise in past 30 days 3,673 (73%) 3,192 (75%) 481 (64%)
Income category (1-8) 5.85 (2.11) 6.00 (2.04) 5.05 (2.29)
Smoking status



    Former/Never 3,280 (66%) 2,886 (68%) 394 (52%)
    Current 1,720 (34%) 1,357 (32%) 363 (48%)
1 Mean (SD); n (%)

In-Class Lab Activity

Data

Variable Description Type
fmd Frequent mental distress (No/Yes) Factor (outcome)
menthlth_days Mentally unhealthy days (0-30) Numeric
physhlth_days Physically unhealthy days (0-30) Numeric
sleep_hrs Hours of sleep per night Numeric
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income category (1-8) Numeric
smoker Former/Never vs. Current Factor

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

# Create frequency table (counts)
freq_table <- table(brfss_logistic$fmd)

# Convert to percentages
percent_table <- prop.table(freq_table) * 100

# Combine into one table
result <- data.frame(
  Category = names(freq_table),
  Count = as.vector(freq_table),
  Percentage = round(as.vector(percent_table), 2)
)

# View result
print(result)
##   Category Count Percentage
## 1       No  4243      84.86
## 2      Yes   757      15.14

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(exercise, age, sex, bmi),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      exercise      ~ "Exercise",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
No
N = 4,243
1
Yes
N = 757
1
p-value
Exercise 3,673 (73%) 3,192 (75%) 481 (64%)
Age (years) 56 (16) 57 (16) 50 (16)
Sex



    Male 2,701 (54%) 2,378 (56%) 323 (43%)
    Female 2,299 (46%) 1,865 (44%) 434 (57%)
BMI 28.5 (6.3) 28.4 (6.2) 29.3 (7.0)
1 n (%); Mean (SD)

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

ggplot(brfss_logistic, aes(x = fmd, fill = exercise)) +
  geom_bar(color = "black", position = "dodge") +
  geom_text(stat = "count", aes(label = ..count..),
            position = position_dodge(width = 0.9), vjust = -0.5) +
  labs(title = "Frequent mental distress by exercise",
       x = "Frequent mental distress",
       y = "Count") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

mod_ex <- glm(fmd ~ exercise, data = brfss_logistic,
                      family = binomial(link = "logit"))
tidy(mod_ex, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) -1.337 0.068 -19.769 0 -1.471 -1.206
exerciseYes -0.555 0.083 -6.655 0 -0.718 -0.391

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

mod_ex <- glm(
  fmd ~ exercise,
  data = brfss_logistic,
  family = binomial(link = "logit")
)

tidy(mod_ex, conf.int = TRUE, exponentiate = TRUE)  |>
  kable(
    digits = 3,
    caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)"
  )  |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.263 0.068 -19.769 0 0.230 0.299
exerciseYes 0.574 0.083 -6.655 0 0.488 0.676

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.

The odds ratio for exerciseYes is 0.574(95%CI: 0.488-0.676). This shows us that individuals who exercised in the past 30 days have about 43% lower odds of exerpeincing frequent mental distress compared to those who do not exercise.

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))

ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       x = "Age (years)", y = "Predicted Probability of FMD") +
  theme_minimal()

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_smoker <- glm(fmd ~ smoker, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_smoker, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Smoking (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Smoking (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.137 0.054 -37.076 0 0.123 0.151
smokerCurrent 1.959 0.080 8.424 0 1.675 2.291
mod_bmi <- glm(fmd ~ bmi, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_bmi, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ BMI (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ BMI (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.094 0.178 -13.299 0 0.066 0.133
bmi 1.023 0.006 3.745 0 1.011 1.034
mod_age <- glm(fmd ~ age, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_age, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.725 0.131 -2.451 0.014 0.559 0.936
age 0.974 0.002 -10.703 0.000 0.970 0.979

3b. (10 pts) Create a table comparing the odds ratios from all three models.

m1 <- glm(fmd ~ age, family = binomial, data = brfss_logistic)
m2 <- glm(fmd ~ bmi, family = binomial, data = brfss_logistic)
m3 <- glm(fmd ~ smoker, family = binomial, data = brfss_logistic)

tribble(
  ~Model, ~`Odds Ratio`, ~`95% CI`,
  
  "M1 (age)",
  round(exp(coef(m1)[2]), 3),
  paste0("(", round(exp(confint(m1)[2,1]),3), ", ", round(exp(confint(m1)[2,2]),3), ")"),

  
  "M2 (bmi)",
  round(exp(coef(m2)[2]), 3),
  paste0("(", round(exp(confint(m2)[2,1]),3), ", ", round(exp(confint(m2)[2,2]),3), ")"),
 
  
  "M3 (smoker)",
  round(exp(coef(m3)[2]), 3),
  paste0("(", round(exp(confint(m3)[2,1]),3), ", ", round(exp(confint(m3)[2,2]),3), ")"),
) |>
  kable(caption = "Comparing 3 Logistic Regression Models") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE)
Comparing 3 Logistic Regression Models
Model Odds Ratio 95% CI
M1 (age) 0.974 (0.97, 0.979)
M2 (bmi) 1.023 (1.011, 1.034)
M3 (smoker) 1.959 (1.675, 2.291)

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer.

Among the three predictors, smoking has the strongest association with FMD. The OR is 1.959, meaning that current smokers have nearly twice the odds of experiencing frequent mental distress compared to non-smokers. Age and BMI have a weaker association but are still statistically significant.

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.

4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi <- glm(fmd ~ age + bmi + smoker,
                 data = brfss_logistic,
                 family = binomial(link = "logit"))

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      age    ~ "Age",
      bmi    ~ "BMI",
      smoker ~ "Smoking status"
    )
  ) |>
  bold_labels() |>
  bold_p()
Characteristic OR 95% CI p-value
Age 0.98 0.97, 0.98 <0.001
BMI 1.02 1.01, 1.03 <0.001
Smoking status


    Former/Never
    Current 1.67 1.42, 1.97 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

# Crude ORs 

crude_smoker <- tidy(mod_smoker, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "smokerCurrent") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

# Adjusted ORs 

adj_smoker <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "smokerCurrent") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(crude_smoker, adj_smoker) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Crude vs. Adjusted Odds Ratios
Predictor OR 95% CI Lower 95% CI Upper Type
smokerCurrent 1.959 1.675 2.291 Crude
smokerCurrent 1.672 1.421 1.967 Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean?

Confounding is present for smoking status. The crude ratio for current smokers is 1.959, but after adjusting for age and BMI, the adjusted odds ratio decreased to 1.672. The shows that part of the association between smoking and FMD is explained by other predictors. The direction of the OR decreased showing us that age and BMI are positive confounders.

Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.

End of Lab Activity