In-Class Lab Activity

EPI 553 — Logistic Regression Part 1 Lab Due: April 13, 2026

Instructions

Complete the four tasks below using the BRFSS 2020 dataset (brfss_logistic_2020.rds). Submit a knitted HTML file via Brightspace.

Data

The BRFSS 2020 Dataset

We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.

Variable	Description	Type
`fmd`	Frequent mental distress (No/Yes)	Factor (outcome)
`menthlth_days`	Mentally unhealthy days (0-30)	Numeric
`physhlth_days`	Physically unhealthy days (0-30)	Numeric
`sleep_hrs`	Hours of sleep per night	Numeric
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income category (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)
library(gridExtra)
library(scales)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full <- read_xpt(
  "/Users/nataliasmall/Downloads/EPI 553/LLCP2020.XPT "
) |>
  clean_names()

brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                 ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic,
  "/Users/nataliasmall/Downloads/EPI 553/brfss_logistic_2020.rds")

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Overview
Metric	Value
Observations	5000
Variables	10
FMD Cases	757
FMD Prevalence	15.1%

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

Frequency_mentdis = cbind(freq = table(brfss_logistic$fmd), percent = prop.table(table(brfss_logistic$fmd)))

Frequency_mentdis |>
  kable(caption = "Frequency Table: Frequent Mental Distress") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Frequency Table: Frequent Mental Distress
	freq	percent
No	4243	0.8486
Yes	757	0.1514

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker),
    type = list(
      c(menthlth_days, physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      menthlth_days ~ "Mentally unhealthy days",
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹	p-value²
Mentally unhealthy days	5 (9)	1 (3)	24 (6)	<0.001
Physical unhealthy days	4 (9)	3 (8)	10 (13)	<0.001
Sleep hours	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)	<0.001
Age (years)	56 (16)	57 (16)	50 (16)	<0.001
Sex				<0.001
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)	0.001
Exercise in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)	<0.001
Income category (1-8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)	<0.001
Smoking status				<0.001
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

# Summary statistics by exercise status
mean_fmd_ex <- brfss_logistic %>%
  mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>% 
  group_by(exercise) %>%
  summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
  filter(!is.na(exercise))

# Exercise status bar chart
p1 <- ggplot(mean_fmd_ex, aes(x = exercise, y = Mean, fill = exercise)) +
  geom_col(alpha = 0.85) + 
  geom_text(aes(label = round(Mean, 2)), vjust = -0.3) + 
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Proportion of FMD by Exercise Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Exercised in past 30 days",
    y = "Proportion FMD"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Summary statistics by smoking status
mean_fmd_smoke <- brfss_logistic %>%
  mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>% 
  group_by(smoker) %>%
  summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
  filter(!is.na(smoker))

# Smoking status bar chart
p2 <- ggplot(mean_fmd_smoke, aes(x = smoker, y = Mean, fill = smoker)) +
  geom_col(alpha = 0.85) + 
  geom_text(aes(label = round(Mean, 2)), vjust = -0.3) + 
  scale_fill_brewer(palette = "Greens") +
  labs(
    title = "Proportion of FMD by Smoking Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Smoking Status",
    y = "Proportion with FMD"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

grid.arrange(p1, p2, ncol = 2)

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

# Fit a simple logistic regression model predicting FMD from exercise
mod_fmd_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
                    family = binomial(link = "logit"))

tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	-1.337	0.068	-19.769	0	-1.471	-1.206
exerciseYes	-0.555	0.083	-6.655	0	-0.718	-0.391

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.263	0.068	-19.769	0	0.230	0.299
exerciseYes	0.574	0.083	-6.655	0	0.488	0.676

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question. Compared to those who have not exercised in the past 30 days, those who have exercised have 0.574 times the odds of displaying frequent mental distress (FMD). This represents a 42.6% reduction in odds (1 - 0.574 = 0.426).

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))

ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
       x = "Age (years)", 
       y = "Predicted Probability of FMD") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_A  <- glm(fmd ~ age,  data = brfss_logistic, 
              family = binomial)
mod_B  <- glm(fmd ~ bmi,        data = brfss_logistic, 
              family = binomial)
mod_C  <- glm(fmd ~ income_cat, data = brfss_logistic, 
              family = binomial)

tidy(mod_A, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 1 - FMD ~ Age (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model 1 - FMD ~ Age (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
age	0.974	0.002	-10.703	0	0.97	0.979

tidy(mod_B, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 2 - FMD ~ BMI (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model 2 - FMD ~ BMI (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
bmi	1.023	0.006	3.745	0	1.011	1.034

tidy(mod_C, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |>
  kable(digits = 3, caption = "Model 3 - FMD ~ Household Income Category (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model 3 - FMD ~ Household Income Category (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
income_cat	0.821	0.018	-11.102	0	0.793	0.85

3b. (10 pts) Create a table comparing the odds ratios from all three models.

tbl_or <- function(model, predictor_label) {
  tidy(model, conf.int = TRUE, exponentiate = TRUE) |>
    filter(term != "(Intercept)") |>
    mutate(Predictor = predictor_label) |>
    dplyr::select(Predictor, OR = estimate, `95% CI Lower` = conf.low,
                  `95% CI Upper` = conf.high, `p-value` = p.value)
}

bind_rows(
  tbl_or(mod_A, "Age (per 1 year)"),
  tbl_or(mod_B, "BMI (per 1 unit)"),
  tbl_or(mod_C, "Income category (per 1 unit)")
) |>
  mutate(
    across(c(OR, `95% CI Lower`, `95% CI Upper`), ~ round(.x, 3)),
    `p-value` = ifelse(`p-value` < 0.001, "< 0.001", round(`p-value`, 3))
  ) |>
  kable(caption = "Comparison: Crude Odds Ratios Across Three Simple Logistic Regression Models") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Comparison: Crude Odds Ratios Across Three Simple Logistic Regression Models
Predictor	OR	95% CI Lower	95% CI Upper	p-value
Age (per 1 year)	0.974	0.970	0.979	< 0.001
BMI (per 1 unit)	1.023	1.011	1.034	< 0.001
Income category (per 1 unit)	0.821	0.793	0.850	< 0.001

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer. The predictor with the strongest crude association with FMD is the income category variable. Income category is the furthest away from 1.0 (the null value). For every 1-unit increase in income category, the odds of FMD decrease by 17.9% (1-0.821). For every 1-unit increase in BMI, the odds of FMD increase by 2.3% (1.023-1). For every 1-year increase in age, the odds of FMD decrease by 2.6% (1-0.974).

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.

mod_multi  <- glm(fmd ~ age + bmi + income_cat,  data = brfss_logistic, 
                family = binomial(link = "logit"))

tidy(mod_multi, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3,
        caption = "Multiple Logistic Regression: FMD ~ 3 Predictors (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Multiple Logistic Regression: FMD ~ 3 Predictors (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.412	0.249	1.654	0.098	-0.076	0.900
age	-0.027	0.002	-10.804	0.000	-0.031	-0.022
bmi	0.015	0.006	2.467	0.014	0.003	0.026
income_cat	-0.202	0.018	-11.257	0.000	-0.237	-0.167

4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      age        ~ "Age (per year)",
      bmi        ~ "Body mass index",
      income_cat ~ "Income category (per unit)"
    )
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
Age (per year)	0.97	0.97, 0.98	<0.001
Body mass index	1.01	1.00, 1.03	0.014
Income category (per unit)	0.82	0.79, 0.85	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

# Crude ORs from simple models
crude_age <- tidy(mod_A, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

# Adjusted ORs from multiple model
adj_age <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(crude_age, adj_age) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios for Age") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted Odds Ratios for Age
Predictor	OR	95% CI Lower	95% CI Upper	Type
age	0.974	0.970	0.979	Crude
age	0.974	0.969	0.978	Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean? Confounding is not present for the age predictor. The Crude OR and the Adjusted OR are identical (OR = 0.974) and the 95% CIs are relatively similar (95% CI: 0.970-0.979) vs. (95% CI:0.969-0.978). This indicates that adjusting for BMI and income category did not change the relationship between age and frequent mental distress (FMD).

Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.

End of Lab Activity

Logistic Regression — Part 1

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Natalia Small

April 9, 2026