library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Load data

brfss_full <- read_xpt("data/LLCP2020.XPT") |>
  clean_names()

Prepare data

brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                  ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic, "data/brfss_logistic_2020.rds")

Instructions

Complete the four tasks below using the BRFSS 2020 dataset (brfss_logistic_2020.rds). Submit a knitted HTML file via Brightspace. You may collaborate, but each student must submit their own work.

Data

Variable	Description	Type
`fmd`	Frequent mental distress (No/Yes)	Factor (outcome)
`menthlth_days`	Mentally unhealthy days (0-30)	Numeric
`physhlth_days`	Physically unhealthy days (0-30)	Numeric
`sleep_hrs`	Hours of sleep per night	Numeric
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income category (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_logistic <- readRDS("data/brfss_logistic_2020.rds")

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Overview
Metric	Value
Observations	5000
Variables	10
FMD Cases	757
FMD Prevalence	15.1%

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
                income_cat, smoker),
    type = list(
      c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹	p-value²
Physical unhealthy days	4 (9)	3 (8)	10 (13)	<0.001
Sleep hours	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)	<0.001
Age (years)	56 (16)	57 (16)	50 (16)	<0.001
Sex				<0.001
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)	0.001
Exercise in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)	<0.001
Income category (1-8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)	<0.001
Smoking status				<0.001
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

ggplot(brfss_logistic) +
  geom_bar(aes(x=exercise, y= fmd, fill = fmd), stat="identity") +
  theme_minimal() +
  labs(x = "Exercise", y = "FMD Status", title = "FMD Status by Exercise")

brfss_logistic |>
group_by(exercise) |> mutate(percent = mean(fmd == "Yes")) |>
mutate(percent = round(percent, digits = 2)) |>
ggplot(aes(x = exercise, y = percent, fill = percent)) +
  geom_bar(position="dodge", stat="identity") +
  geom_text(stat = "identity", aes(label = after_stat(y)), vjust = -0.3) +
  labs(
    title = "Distribution of Education Level",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Education Level",
    y = "Mean Days with Poor Mental Health"
  ) +
  theme_minimal() +
  scale_y_continuous(labels=scales::percent) +
  theme(legend.position = "none")

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

mod_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
                      family = binomial(link = "logit"))

tidy(mod_exercise, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	-1.337	0.068	-19.769	0	-1.471	-1.206
exerciseYes	-0.555	0.083	-6.655	0	-0.718	-0.391

The coefficient for those who exercise is -0.56.

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

tidy(mod_exercise, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.263	0.068	-19.769	0	0.230	0.299
exerciseYes	0.574	0.083	-6.655	0	0.488	0.676

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.

The odds ratio for exercise is 0.57. This indicates that exercise is associated with 0.57 lower odds of FMD.

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))
ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       x = "Age (years)", y = "Predicted Probability of FMD") +
  theme_minimal()

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_income <- glm(fmd ~ income_cat, data = brfss_logistic,
               family = binomial(link = "logit"))
tidy(mod_income, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Income (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Income (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.531	0.101	-6.258	0	0.435	0.647
income_cat	0.821	0.018	-11.102	0	0.793	0.850

mod_sleep <- glm(fmd ~ sleep_hrs, data = brfss_logistic,
               family = binomial(link = "logit"))
tidy(mod_sleep, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Sleep (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Sleep (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	1.101	0.184	0.523	0.601	0.767	1.580
sleep_hrs	0.765	0.027	-9.835	0.000	0.725	0.807

mod_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
                      family = binomial(link = "logit"))
tidy(mod_exercise, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.263	0.068	-19.769	0	0.230	0.299
exerciseYes	0.574	0.083	-6.655	0	0.488	0.676

3b. (10 pts) Create a table comparing the odds ratios from all three models.

mod_income_table <- tidy(mod_income, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "income_cat") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

mod_sleep_table <- tidy(mod_sleep, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "sleep_hrs") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

mod_exercise_table <- tidy(mod_exercise, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "exerciseYes") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

bind_rows(mod_income_table, mod_sleep_table, mod_exercise_table) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted Odds Ratios
Predictor	OR	95% CI Lower	95% CI Upper	Type
income_cat	0.821	0.793	0.850	Crude
sleep_hrs	0.765	0.725	0.807	Crude
exerciseYes	0.574	0.488	0.676	Crude

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer. Exercise has the strongest crude protective function against FMD. The odds ratio is the farthest from 1, meaning the association is the strongest which in this case is the most protective given all the predictors’ odd’s ratios fall below 1.

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.

mod_multi <- glm(fmd ~ exercise + sleep_hrs + income_cat,
                 data = brfss_logistic,
                 family = binomial(link = "logit"))

4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      exercise   ~ "Exercise (past 30 days)",
      sleep_hrs  ~ "Sleep hours (per hour)",
      income_cat ~ "Income category (per unit)"
    )
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
Exercise (past 30 days)
No	—	—
Yes	0.68	0.57, 0.81	<0.001
Sleep hours (per hour)	0.78	0.74, 0.82	<0.001
Income category (per unit)	0.84	0.81, 0.87	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

mod_exercise_table_crude <- tidy(mod_exercise, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "exerciseYes") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

mod_exercise_table_adjusted <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "exerciseYes") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(mod_exercise_table_crude, mod_exercise_table_adjusted) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted Odds Ratios
Predictor	OR	95% CI Lower	95% CI Upper	Type
exerciseYes	0.574	0.488	0.676	Crude
exerciseYes	0.681	0.574	0.808	Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean? The adjusted OR for exercise moves toward 1 (attenuates), therefore confounding was inflating the crude association. This means that confounders added to the model helped to show the true reduced association between exercise and mental health.

Logistic Regression — Part 1

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Matthew Goldman

April 9, 2026