EPI 553 — Logistic Regression Part 1 Lab Due: April 13, 2026
Complete the four tasks below using the BRFSS 2020 dataset
(brfss_logistic_2020.rds). Submit a knitted HTML file via
Brightspace.
We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.
| Variable | Description | Type |
|---|---|---|
fmd |
Frequent mental distress (No/Yes) | Factor (outcome) |
menthlth_days |
Mentally unhealthy days (0-30) | Numeric |
physhlth_days |
Physically unhealthy days (0-30) | Numeric |
sleep_hrs |
Hours of sleep per night | Numeric |
age |
Age in years | Numeric |
sex |
Male / Female | Factor |
bmi |
Body mass index | Numeric |
exercise |
Exercised in past 30 days (No/Yes) | Factor |
income_cat |
Household income category (1-8) | Numeric |
smoker |
Former/Never vs. Current | Factor |
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)
library(gridExtra)
library(scales)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))brfss_logistic <- brfss_full |>
mutate(
# Binary outcome: frequent mental distress (>= 14 days)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
fmd = factor(
ifelse(menthlth_days >= 14, 1, 0),
levels = c(0, 1),
labels = c("No", "Yes")
),
# Predictors
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
sleep_hrs = case_when(
sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
TRUE ~ NA_real_
),
age = age80,
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
smoker = factor(case_when(
smokday2 %in% c(1, 2) ~ "Current",
smokday2 == 3 ~ "Former/Never",
TRUE ~ NA_character_
), levels = c("Former/Never", "Current"))
) |>
filter(
!is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
!is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
!is.na(exercise), !is.na(income_cat), !is.na(smoker)
)
set.seed(1220)
brfss_logistic <- brfss_logistic |>
dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
bmi, exercise, income_cat, smoker) |>
slice_sample(n = 5000)
# Save for lab use
saveRDS(brfss_logistic,
"/Users/nataliasmall/Downloads/EPI 553/brfss_logistic_2020.rds")
tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
Value = c(nrow(brfss_logistic), ncol(brfss_logistic),
sum(brfss_logistic$fmd == "Yes"),
paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
kable(caption = "Analytic Dataset Overview") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 5000 |
| Variables | 10 |
| FMD Cases | 757 |
| FMD Prevalence | 15.1% |
1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.
Frequency_mentdis = cbind(freq = table(brfss_logistic$fmd), percent = prop.table(table(brfss_logistic$fmd)))
Frequency_mentdis |>
kable(caption = "Frequency Table: Frequent Mental Distress") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| freq | percent | |
|---|---|---|
| No | 4243 | 0.8486 |
| Yes | 757 | 0.1514 |
1b. (5 pts) Create a descriptive summary table of at
least 4 predictors, stratified by FMD status. Use
tbl_summary().
brfss_logistic |>
tbl_summary(
by = fmd,
include = c(menthlth_days, physhlth_days, sleep_hrs, age, sex,
bmi, exercise, income_cat, smoker),
type = list(
c(menthlth_days, physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
menthlth_days ~ "Mentally unhealthy days",
physhlth_days ~ "Physical unhealthy days",
sleep_hrs ~ "Sleep hours",
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
income_cat ~ "Income category (1-8)",
smoker ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
No N = 4,2431 |
Yes N = 7571 |
p-value2 |
|---|---|---|---|---|
| Mentally unhealthy days | 5 (9) | 1 (3) | 24 (6) | <0.001 |
| Physical unhealthy days | 4 (9) | 3 (8) | 10 (13) | <0.001 |
| Sleep hours | 7.00 (1.48) | 7.09 (1.40) | 6.51 (1.83) | <0.001 |
| Age (years) | 56 (16) | 57 (16) | 50 (16) | <0.001 |
| Sex | <0.001 | |||
| Male | 2,701 (54%) | 2,378 (56%) | 323 (43%) | |
| Female | 2,299 (46%) | 1,865 (44%) | 434 (57%) | |
| BMI | 28.5 (6.3) | 28.4 (6.2) | 29.3 (7.0) | 0.001 |
| Exercise in past 30 days | 3,673 (73%) | 3,192 (75%) | 481 (64%) | <0.001 |
| Income category (1-8) | 5.85 (2.11) | 6.00 (2.04) | 5.05 (2.29) | <0.001 |
| Smoking status | <0.001 | |||
| Former/Never | 3,280 (66%) | 2,886 (68%) | 394 (52%) | |
| Current | 1,720 (34%) | 1,357 (32%) | 363 (48%) | |
| 1 Mean (SD); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.
# Summary statistics by exercise status
mean_fmd_ex <- brfss_logistic %>%
mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>%
group_by(exercise) %>%
summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
filter(!is.na(exercise))
# Exercise status bar chart
p1 <- ggplot(mean_fmd_ex, aes(x = exercise, y = Mean, fill = exercise)) +
geom_col(alpha = 0.85) +
geom_text(aes(label = round(Mean, 2)), vjust = -0.3) +
scale_fill_brewer(palette = "Blues") +
labs(
title = "Proportion of FMD by Exercise Status",
subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
x = "Exercised in past 30 days",
y = "Proportion FMD"
) +
theme_minimal() +
theme(legend.position = "none")
# Summary statistics by smoking status
mean_fmd_smoke <- brfss_logistic %>%
mutate(fmd_binary = ifelse(fmd == "Yes", 1, 0)) %>%
group_by(smoker) %>%
summarise(Mean = mean(fmd_binary, na.rm = TRUE)) %>%
filter(!is.na(smoker))
# Smoking status bar chart
p2 <- ggplot(mean_fmd_smoke, aes(x = smoker, y = Mean, fill = smoker)) +
geom_col(alpha = 0.85) +
geom_text(aes(label = round(Mean, 2)), vjust = -0.3) +
scale_fill_brewer(palette = "Greens") +
labs(
title = "Proportion of FMD by Smoking Status",
subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
x = "Smoking Status",
y = "Proportion with FMD"
) +
theme_minimal() +
theme(legend.position = "none")
grid.arrange(p1, p2, ncol = 2)2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.
# Fit a simple logistic regression model predicting FMD from exercise
mod_fmd_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -1.337 | 0.068 | -19.769 | 0 | -1.471 | -1.206 |
| exerciseYes | -0.555 | 0.083 | -6.655 | 0 | -0.718 | -0.391 |
2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.
tidy(mod_fmd_exercise, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.263 | 0.068 | -19.769 | 0 | 0.230 | 0.299 |
| exerciseYes | 0.574 | 0.083 | -6.655 | 0 | 0.488 | 0.676 |
2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question. Compared to those who have not exercised in the past 30 days, those who have exercised have 0.574 times the odds of displaying frequent mental distress (FMD). This represents a 42.6% reduction in odds (1 - 0.574 = 0.426).
2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).
mod_age <- glm(fmd ~ age, data = brfss_logistic,
family = binomial(link = "logit"))
ggpredict(mod_age, terms = "age [18:80]") |>
plot() +
labs(title = "Predicted Probability of Frequent Mental Distress by Age",
subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
x = "Age (years)",
y = "Predicted Probability of FMD") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.
mod_A <- glm(fmd ~ age, data = brfss_logistic,
family = binomial)
mod_B <- glm(fmd ~ bmi, data = brfss_logistic,
family = binomial)
mod_C <- glm(fmd ~ income_cat, data = brfss_logistic,
family = binomial)
tidy(mod_A, conf.int = TRUE, exponentiate = TRUE) |>
filter(term != "(Intercept)") |>
kable(digits = 3, caption = "Model 1 - FMD ~ Age (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| age | 0.974 | 0.002 | -10.703 | 0 | 0.97 | 0.979 |
tidy(mod_B, conf.int = TRUE, exponentiate = TRUE) |>
filter(term != "(Intercept)") |>
kable(digits = 3, caption = "Model 2 - FMD ~ BMI (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| bmi | 1.023 | 0.006 | 3.745 | 0 | 1.011 | 1.034 |
tidy(mod_C, conf.int = TRUE, exponentiate = TRUE) |>
filter(term != "(Intercept)") |>
kable(digits = 3, caption = "Model 3 - FMD ~ Household Income Category (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| income_cat | 0.821 | 0.018 | -11.102 | 0 | 0.793 | 0.85 |
3b. (10 pts) Create a table comparing the odds ratios from all three models.
tbl_or <- function(model, predictor_label) {
tidy(model, conf.int = TRUE, exponentiate = TRUE) |>
filter(term != "(Intercept)") |>
mutate(Predictor = predictor_label) |>
dplyr::select(Predictor, OR = estimate, `95% CI Lower` = conf.low,
`95% CI Upper` = conf.high, `p-value` = p.value)
}
bind_rows(
tbl_or(mod_A, "Age (per 1 year)"),
tbl_or(mod_B, "BMI (per 1 unit)"),
tbl_or(mod_C, "Income category (per 1 unit)")
) |>
mutate(
across(c(OR, `95% CI Lower`, `95% CI Upper`), ~ round(.x, 3)),
`p-value` = ifelse(`p-value` < 0.001, "< 0.001", round(`p-value`, 3))
) |>
kable(caption = "Comparison: Crude Odds Ratios Across Three Simple Logistic Regression Models") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | 95% CI Lower | 95% CI Upper | p-value |
|---|---|---|---|---|
| Age (per 1 year) | 0.974 | 0.970 | 0.979 | < 0.001 |
| BMI (per 1 unit) | 1.023 | 1.011 | 1.034 | < 0.001 |
| Income category (per 1 unit) | 0.821 | 0.793 | 0.850 | < 0.001 |
3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer. The predictor with the strongest crude association with FMD is the income category variable. Income category is the furthest away from 1.0 (the null value). For every 1-unit increase in income category, the odds of FMD decrease by 17.9% (1-0.821). For every 1-unit increase in BMI, the odds of FMD increase by 2.3% (1.023-1). For every 1-year increase in age, the odds of FMD decrease by 2.6% (1-0.974).
4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.
mod_multi <- glm(fmd ~ age + bmi + income_cat, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_multi, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3,
caption = "Multiple Logistic Regression: FMD ~ 3 Predictors (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.412 | 0.249 | 1.654 | 0.098 | -0.076 | 0.900 |
| age | -0.027 | 0.002 | -10.804 | 0.000 | -0.031 | -0.022 |
| bmi | 0.015 | 0.006 | 2.467 | 0.014 | 0.003 | 0.026 |
| income_cat | -0.202 | 0.018 | -11.257 | 0.000 | -0.237 | -0.167 |
4b. (5 pts) Report the adjusted odds ratios using
tbl_regression().
mod_multi |>
tbl_regression(
exponentiate = TRUE,
label = list(
age ~ "Age (per year)",
bmi ~ "Body mass index",
income_cat ~ "Income category (per unit)"
)
) |>
bold_labels() |>
bold_p()| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Age (per year) | 0.97 | 0.97, 0.98 | <0.001 |
| Body mass index | 1.01 | 1.00, 1.03 | 0.014 |
| Income category (per unit) | 0.82 | 0.79, 0.85 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.
# Crude ORs from simple models
crude_age <- tidy(mod_A, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "age") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Crude")
# Adjusted ORs from multiple model
adj_age <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "age") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Adjusted")
bind_rows(crude_age, adj_age) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
caption = "Crude vs. Adjusted Odds Ratios for Age") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | 95% CI Lower | 95% CI Upper | Type |
|---|---|---|---|---|
| age | 0.974 | 0.970 | 0.979 | Crude |
| age | 0.974 | 0.969 | 0.978 | Adjusted |
4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean? Confounding is not present for the age predictor. The Crude OR and the Adjusted OR are identical (OR = 0.974) and the 95% CIs are relatively similar (95% CI: 0.970-0.979) vs. (95% CI:0.969-0.978). This indicates that adjusting for BMI and income category did not change the relationship between age and frequent mental distress (FMD).
Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.
End of Lab Activity