In-Class Lab Activity

EPI 553 — Logistic Regression Part 1 Lab Due: April 13, 2026

Instructions

Complete the four tasks below using the BRFSS 2020 dataset (brfss_logistic_2020.rds). Submit a knitted HTML file via Brightspace.


Data

The BRFSS 2020 Dataset

We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.

Variable Description Type
fmd Frequent mental distress (No/Yes) Factor (outcome)
menthlth_days Mentally unhealthy days (0-30) Numeric
physhlth_days Physically unhealthy days (0-30) Numeric
sleep_hrs Hours of sleep per night Numeric
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income category (1-8) Numeric
smoker Former/Never vs. Current Factor
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)
library(gridExtra)
library(scales)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_full <- read_xpt(
  "/Users/nataliasmall/Downloads/EPI 553/LLCP2020.XPT "
) |>
  clean_names()
brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                 ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic,
  "/Users/nataliasmall/Downloads/EPI 553/brfss_logistic_2020.rds")

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Overview
Metric Value
Observations 5000
Variables 10
FMD Cases 757
FMD Prevalence 15.1%

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

Frequency_mentdis = cbind(freq = table(brfss_logistic$fmd), percent = prop.table(table(brfss_logistic$fmd)))

Frequency_mentdis |>
  kable(caption = "Frequency Table: Frequent Mental Distress") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Frequency Table: Frequent Mental Distress
freq percent
No 4243 0.8486
Yes 757 0.1514

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker),
    type = list(
      c(menthlth_days, physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      menthlth_days ~ "Mentally unhealthy days",
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
No
N = 4,243
1
Yes
N = 757
1
p-value2
Mentally unhealthy days 5 (9) 1 (3) 24 (6) <0.001
Physical unhealthy days 4 (9) 3 (8) 10 (13) <0.001
Sleep hours 7.00 (1.48) 7.09 (1.40) 6.51 (1.83) <0.001
Age (years) 56 (16) 57 (16) 50 (16) <0.001
Sex


<0.001
    Male 2,701 (54%) 2,378 (56%) 323 (43%)
    Female 2,299 (46%) 1,865 (44%) 434 (57%)
BMI 28.5 (6.3) 28.4 (6.2) 29.3 (7.0) 0.001
Exercise in past 30 days 3,673 (73%) 3,192 (75%) 481 (64%) <0.001
Income category (1-8) 5.85 (2.11) 6.00 (2.04) 5.05 (2.29) <0.001
Smoking status


<0.001
    Former/Never 3,280 (66%) 2,886 (68%) 394 (52%)
    Current 1,720 (34%) 1,357 (32%) 363 (48%)
1 Mean (SD); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

# Summary statistics by exercise status
mean_fmd_ex <- brfss_logistic %>%
  mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>% 
  group_by(exercise) %>%
  summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
  filter(!is.na(exercise))

# Exercise status bar chart
p1 <- ggplot(mean_fmd_ex, aes(x = exercise, y = Mean, fill = exercise)) +
  geom_col(alpha = 0.85) + 
  geom_text(aes(label = round(Mean, 2)), vjust = -0.3) + 
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Proportion of FMD by Exercise Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Exercised in past 30 days",
    y = "Proportion FMD"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Summary statistics by smoking status
mean_fmd_smoke <- brfss_logistic %>%
  mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>% 
  group_by(smoker) %>%
  summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
  filter(!is.na(smoker))

# Smoking status bar chart
p2 <- ggplot(mean_fmd_smoke, aes(x = smoker, y = Mean, fill = smoker)) +
  geom_col(alpha = 0.85) + 
  geom_text(aes(label = round(Mean, 2)), vjust = -0.3) + 
  scale_fill_brewer(palette = "Greens") +
  labs(
    title = "Proportion of FMD by Smoking Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Smoking Status",
    y = "Proportion with FMD"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

grid.arrange(p1, p2, ncol = 2)

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

# Fit a simple logistic regression model predicting FMD from exercise
mod_fmd_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
                    family = binomial(link = "logit"))

tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) -1.337 0.068 -19.769 0 -1.471 -1.206
exerciseYes -0.555 0.083 -6.655 0 -0.718 -0.391

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.263 0.068 -19.769 0 0.230 0.299
exerciseYes 0.574 0.083 -6.655 0 0.488 0.676

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question. Compared to those who have not exercised in the past 30 days, those who have exercised have 0.574 times the odds of displaying frequent mental distress (FMD). This represents a 42.6% reduction in odds (1 - 0.574 = 0.426).

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))

ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
       x = "Age (years)", 
       y = "Predicted Probability of FMD") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_A  <- glm(fmd ~ age,  data = brfss_logistic, 
              family = binomial)
mod_B  <- glm(fmd ~ bmi,        data = brfss_logistic, 
              family = binomial)
mod_C  <- glm(fmd ~ income_cat, data = brfss_logistic, 
              family = binomial)

tidy(mod_A, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 1 - FMD ~ Age (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model 1 - FMD ~ Age (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
age 0.974 0.002 -10.703 0 0.97 0.979
tidy(mod_B, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 2 - FMD ~ BMI (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model 2 - FMD ~ BMI (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
bmi 1.023 0.006 3.745 0 1.011 1.034
tidy(mod_C, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 3 - FMD ~ Household Income Category (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model 3 - FMD ~ Household Income Category (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
income_cat 0.821 0.018 -11.102 0 0.793 0.85

3b. (10 pts) Create a table comparing the odds ratios from all three models.

tbl_or <- function(model, predictor_label) {
  tidy(model, conf.int = TRUE, exponentiate = TRUE) |>
    filter(term != "(Intercept)") |>
    mutate(Predictor = predictor_label) |>
    dplyr::select(Predictor, OR = estimate, `95% CI Lower` = conf.low,
                  `95% CI Upper` = conf.high, `p-value` = p.value)
}

bind_rows(
  tbl_or(mod_A, "Age (per 1 year)"),
  tbl_or(mod_B, "BMI (per 1 unit)"),
  tbl_or(mod_C, "Income category (per 1 unit)")
) |>
  mutate(
    across(c(OR, `95% CI Lower`, `95% CI Upper`), ~ round(.x, 3)),
    `p-value` = ifelse(`p-value` < 0.001, "< 0.001", round(`p-value`, 3))
  ) |>
  kable(caption = "Comparison: Crude Odds Ratios Across Three Simple Logistic Regression Models") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Comparison: Crude Odds Ratios Across Three Simple Logistic Regression Models
Predictor OR 95% CI Lower 95% CI Upper p-value
Age (per 1 year) 0.974 0.970 0.979 < 0.001
BMI (per 1 unit) 1.023 1.011 1.034 < 0.001
Income category (per 1 unit) 0.821 0.793 0.850 < 0.001

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer. The predictor with the strongest crude association with FMD is the income category variable. Income category is the furthest away from 1.0 (the null value). For every 1-unit increase in income category, the odds of FMD decrease by 17.9% (1-0.821). For every 1-unit increase in BMI, the odds of FMD increase by 2.3% (1.023-1). For every 1-year increase in age, the odds of FMD decrease by 2.6% (1-0.974).

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.

mod_multi  <- glm(fmd ~ age + bmi + income_cat,  data = brfss_logistic, 
                family = binomial(link = "logit"))

tidy(mod_multi, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3,
        caption = "Multiple Logistic Regression: FMD ~ 3 Predictors (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Multiple Logistic Regression: FMD ~ 3 Predictors (Log-Odds Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.412 0.249 1.654 0.098 -0.076 0.900
age -0.027 0.002 -10.804 0.000 -0.031 -0.022
bmi 0.015 0.006 2.467 0.014 0.003 0.026
income_cat -0.202 0.018 -11.257 0.000 -0.237 -0.167

4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      age        ~ "Age (per year)",
      bmi        ~ "Body mass index",
      income_cat ~ "Income category (per unit)"
    )
  ) |>
  bold_labels() |>
  bold_p()
Characteristic OR 95% CI p-value
Age (per year) 0.97 0.97, 0.98 <0.001
Body mass index 1.01 1.00, 1.03 0.014
Income category (per unit) 0.82 0.79, 0.85 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

# Crude ORs from simple models
crude_age <- tidy(mod_A, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

# Adjusted ORs from multiple model
adj_age <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(crude_age, adj_age) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios for Age") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Crude vs. Adjusted Odds Ratios for Age
Predictor OR 95% CI Lower 95% CI Upper Type
age 0.974 0.970 0.979 Crude
age 0.974 0.969 0.978 Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean? Confounding is not present for the age predictor. The Crude OR and the Adjusted OR are identical (OR = 0.974) and the 95% CIs are relatively similar (95% CI: 0.970-0.979) vs. (95% CI:0.969-0.978). This indicates that adjusting for BMI and income category did not change the relationship between age and frequent mental distress (FMD).

Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.

End of Lab Activity