Part 7: In-Class Lab Activity
EPI 553 — Logistic Regression Part 1 Lab Due: April 13, 2026
Complete the four tasks below using the BRFSS 2020 dataset
(brfss_logistic_2020.rds). Submit a knitted HTML file via
Brightspace. You may collaborate, but each student must submit their own
work.
| Variable | Description | Type |
|---|---|---|
fmd |
Frequent mental distress (No/Yes) | Factor (outcome) |
menthlth_days |
Mentally unhealthy days (0-30) | Numeric |
physhlth_days |
Physically unhealthy days (0-30) | Numeric |
sleep_hrs |
Hours of sleep per night | Numeric |
age |
Age in years | Numeric |
sex |
Male / Female | Factor |
bmi |
Body mass index | Numeric |
exercise |
Exercised in past 30 days (No/Yes) | Factor |
income_cat |
Household income category (1-8) | Numeric |
smoker |
Former/Never vs. Current | Factor |
library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library (haven)
library (janitor)
library(ggeffects)
library (haven)
library (modelsummary)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))brfss_full <- janitor::clean_names (haven::read_xpt("C:/Users/suruc/OneDrive/Desktop/R/LLCP2020.XPT"))brfss_logistic <- brfss_full |>
mutate(
# Binary outcome: frequent mental distress (>= 14 days)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
fmd = factor(
ifelse(menthlth_days >= 14, 1, 0),
levels = c(0, 1),
labels = c("No", "Yes")
),
# Predictors
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
sleep_hrs = case_when(
sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
TRUE ~ NA_real_
),
age = age80,
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
smoker = factor(case_when(
smokday2 %in% c(1, 2) ~ "Current",
smokday2 == 3 ~ "Former/Never",
TRUE ~ NA_character_
), levels = c("Former/Never", "Current"))
) |>
dplyr::filter(
!is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
!is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
!is.na(exercise), !is.na(income_cat), !is.na(smoker)
)
set.seed(1220)
brfss_logistic <- brfss_logistic |>
dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
bmi, exercise, income_cat, smoker) |>
slice_sample(n = 5000)
# Save for lab use
saveRDS(brfss_logistic,
"C:/Users/suruc/OneDrive/Desktop/R/brfss_logistic_2020.rds")
tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
Value = c(nrow(brfss_logistic), ncol(brfss_logistic),
sum(brfss_logistic$fmd == "Yes"),
paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
kable(caption = "Analytic Dataset Overview") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 5000 |
| Variables | 10 |
| FMD Cases | 757 |
| FMD Prevalence | 15.1% |
brfss_logistic |>
tbl_summary(
by = fmd,
include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
income_cat, smoker),
type = list(
c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
physhlth_days ~ "Physical unhealthy days",
sleep_hrs ~ "Sleep hours",
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
income_cat ~ "Income category (1-8)",
smoker ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
No N = 4,2431 |
Yes N = 7571 |
p-value2 |
|---|---|---|---|---|
| Physical unhealthy days | 4 (9) | 3 (8) | 10 (13) | <0.001 |
| Sleep hours | 7.00 (1.48) | 7.09 (1.40) | 6.51 (1.83) | <0.001 |
| Age (years) | 56 (16) | 57 (16) | 50 (16) | <0.001 |
| Sex | <0.001 | |||
| Male | 2,701 (54%) | 2,378 (56%) | 323 (43%) | |
| Female | 2,299 (46%) | 1,865 (44%) | 434 (57%) | |
| BMI | 28.5 (6.3) | 28.4 (6.2) | 29.3 (7.0) | 0.001 |
| Exercise in past 30 days | 3,673 (73%) | 3,192 (75%) | 481 (64%) | <0.001 |
| Income category (1-8) | 5.85 (2.11) | 6.00 (2.04) | 5.05 (2.29) | <0.001 |
| Smoking status | <0.001 | |||
| Former/Never | 3,280 (66%) | 2,886 (68%) | 394 (52%) | |
| Current | 1,720 (34%) | 1,357 (32%) | 363 (48%) | |
| 1 Mean (SD); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.
brfss_logistic |>
count(fmd) |>
mutate(
Percentage = round(n / sum(n) * 100, 1),
fmd = as.character(fmd)
) |>
kable(
caption = "Table 1a. Frequency of Frequent Mental Distress (FMD) — BRFSS 2020 (n = 5,000)",
col.names = c("Frequent Mental Distress", "n", "%")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Frequent Mental Distress | n | % |
|---|---|---|
| No | 4243 | 84.9 |
| Yes | 757 | 15.1 |
1b. (5 pts) Create a descriptive summary table of at
least 4 predictors, stratified by FMD status. Use
tbl_summary().
brfss_logistic |>
tbl_summary(
by = fmd,
include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
income_cat, smoker),
type = list(
c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
label = list(
physhlth_days ~ "Physically unhealthy days",
sleep_hrs ~ "Sleep hours (per night)",
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercised in past 30 days",
income_cat ~ "Income category (1–8)",
smoker ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels() |>
modify_caption("**Table 1b. Predictor Distribution by FMD Status — BRFSS 2020 (n = 5,000)**")| Characteristic | Overall N = 5,0001 |
No N = 4,2431 |
Yes N = 7571 |
p-value2 |
|---|---|---|---|---|
| Physically unhealthy days | 4 (9) | 3 (8) | 10 (13) | <0.001 |
| Sleep hours (per night) | 7.00 (1.48) | 7.09 (1.40) | 6.51 (1.83) | <0.001 |
| Age (years) | 56 (16) | 57 (16) | 50 (16) | <0.001 |
| Sex | <0.001 | |||
| Male | 2,701 (54%) | 2,378 (56%) | 323 (43%) | |
| Female | 2,299 (46%) | 1,865 (44%) | 434 (57%) | |
| BMI | 28.5 (6.3) | 28.4 (6.2) | 29.3 (7.0) | 0.001 |
| Exercised in past 30 days | 3,673 (73%) | 3,192 (75%) | 481 (64%) | <0.001 |
| Income category (1–8) | 5.85 (2.11) | 6.00 (2.04) | 5.05 (2.29) | <0.001 |
| Smoking status | <0.001 | |||
| Former/Never | 3,280 (66%) | 2,886 (68%) | 394 (52%) | |
| Current | 1,720 (34%) | 1,357 (32%) | 363 (48%) | |
| 1 Mean (SD); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.
brfss_logistic |>
filter(!is.na(exercise), !is.na(fmd)) |>
group_by(exercise, fmd) |>
summarise(n = n(), .groups = "drop") |>
group_by(exercise) |>
mutate(pct = n / sum(n) * 100) |>
ungroup() |>
filter(fmd == "Yes") |>
ggplot(aes(x = exercise, y = pct, fill = exercise)) +
geom_col(width = 0.5, alpha = 0.85) +
geom_text(aes(label = paste0(round(pct, 1), "%")),
vjust = -0.5, size = 5) +
scale_fill_manual(values = c("No" = "#E74C3C", "Yes" = "#27AE60")) +
scale_y_continuous(
labels = scales::percent_format(scale = 1),
limits = c(0, 30)
) +
labs(
title = "Figure 1. Prevalence of Frequent Mental Distress by Exercise Status",
subtitle = "BRFSS 2020 (n = 5,000)",
x = "Exercised in Past 30 Days",
y = "Prevalence of FMD (%)",
caption = "FMD = ≥14 mentally unhealthy days in the past 30 days."
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.
mod_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_exercise, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -1.337 | 0.068 | -19.769 | 0 | -1.471 | -1.206 |
| exerciseYes | -0.555 | 0.083 | -6.655 | 0 | -0.718 | -0.391 |
In the simple logistic regression model predicting frequent mental distress (FMD) from exercise, the coefficient for exercise represents the difference in log-odds of FMD between adults who report exercising and those who do not. A negative coefficient (-0.555) indicates that exercise is associated with lower log-odds of experiencing FMD. The intercept represents the log-odds of FMD among adults who do not exercise. Exercisers have 0.555 lower log‑odds of FMD than non‑exercisers.
2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.
tidy(mod_exercise, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.263 | 0.068 | -19.769 | 0 | 0.230 | 0.299 |
| exerciseYes | 0.574 | 0.083 | -6.655 | 0 | 0.488 | 0.676 |
Interpretation: After exponentiating the coefficients, the odds ratio for exercise compares the odds of frequent mental distress among adults who exercise to those who do not. An OR below 1 (0.574, (95% CI: 0.488–0.676)) indicates that exercise is associated with reduced odds of FMD. The 95% confidence interval provides the range of plausible values for this association. Exercisers have 43% lower odds of FMD compared with non‑exercisers.
2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.
Interpretation: The odds ratio for exercise tells us how the odds of frequent mental distress compare between those who exercise and those who do not, without adjusting for any other variables. An OR less than 1 indicates that exercise is associated with lower odds of FMD. Adults who report exercising have substantially lower odds of experiencing frequent mental distress compared with those who do not exercise. This association is statistically significant and protective.
2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).
mod_age <- glm(fmd ~ age, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_age, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.725 | 0.131 | -2.451 | 0.014 | 0.559 | 0.936 |
| age | 0.974 | 0.002 | -10.703 | 0.000 | 0.970 | 0.979 |
OR per 1‑year increase = 0.974 (95% CI: 0.970–0.979) Each additional year of age is associated with a 2.6% decrease in the odds of FMD.
beta_age <- coef(mod_age)["age"]
or_10yr <- exp(10 * beta_age)
ci_10yr <- exp(10 * confint(mod_age)["age", ])
tibble(
`Age Difference` = "10 years",
`OR` = round(or_10yr, 3),
`95% CI Lower` = round(ci_10yr[1], 3),
`95% CI Upper` = round(ci_10yr[2], 3)
) |>
kable(caption = "Odds Ratio for 10-Year Increase in Age") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Age Difference | OR | 95% CI Lower | 95% CI Upper |
|---|---|---|---|
| 10 years | 0.771 | 0.736 | 0.809 |
Interpretation: The OR for a 10-year increase in age tells us how the odds of frequent mental distress change when comparing two individuals who differ by 10 years of age.A 10‑year age difference corresponds to a 23% lower odds of FMD (OR = 0.771 (95% CI: 0.736–0.809)).Older adults have lower odds of frequent mental distress. The decline is gradual but consistent, and the association is statistically significant.
ggpredict(mod_age, terms = "age [18:80]") |>
plot() +
labs(title = "Predicted Probability of Frequent Mental Distress by Age",
x = "Age (years)", y = "Predicted Probability of FMD") +
theme_minimal()Interpretation: The curve shows the predicted probability of frequent mental distress (FMD) across the age range of 18 to 80 years, holding all other factors constant. The S-shaped (sigmoidal) trajectory is the hallmark of logistic regression — probabilities are bounded between 0 and 1 and change most steeply around the midpoint of the curve. The shaded band represents the 95% confidence interval around the predicted probability at each age. A rising curve indicates that older individuals have a higher predicted probability of FMD; a falling curve would indicate the opposite. Pay attention to both the direction of the trend and how wide the confidence band is, especially at the extremes of age where data may be sparse.
The predicted probability of frequent mental distress decreases steadily with age, indicating that younger adults are more likely to experience FMD than older adults. This pattern aligns with the negative age coefficient and the OR < 1.
3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.
mod_smoker <- glm(fmd ~ smoker, data = brfss_logistic,
family = binomial(link = "logit"))
mod_sleep <- glm(fmd ~ sleep_hrs, data = brfss_logistic,
family = binomial(link = "logit"))
mod_income <- glm(fmd ~ income_cat, data = brfss_logistic,
family = binomial(link = "logit"))3b. (10 pts) Create a table comparing the odds ratios from all three models.
brfss_logistic |>
tbl_uvregression(
method = glm,
y = "fmd",
method.args = list(family = binomial),
exponentiate = TRUE,
include = c("smoker", "sleep_hrs", "income_cat"),
label = list(
smoker ~ "Smoking status (ref: Former/Never)",
sleep_hrs ~ "Sleep hours (per 1-hr increase)",
income_cat ~ "Income category (per 1-unit increase)"
)
) |>
bold_labels() |>
bold_p(t = 0.05) |>
modify_caption("**Table 3b. Crude Odds Ratios — Three Predictors of FMD (BRFSS 2020)**")| Characteristic | N | OR | 95% CI | p-value |
|---|---|---|---|---|
| Smoking status (ref: Former/Never) | 5,000 | |||
| Former/Never | — | — | ||
| Current | 1.96 | 1.68, 2.29 | <0.001 | |
| Sleep hours (per 1-hr increase) | 5,000 | 0.77 | 0.73, 0.81 | <0.001 |
| Income category (per 1-unit increase) | 5,000 | 0.82 | 0.79, 0.85 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | ||||
3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer.
Current smoking has the strongest crude association with FMD (OR = 1.96). This effect size is the largest from the null value of 1.0, indicating that current smokers have nearly twice the odds of FMD compared with former or never smokers. Although sleep hours and income also show significant protective associations, their ORs (0.77 and 0.82) represent smaller relative changes in odds compared with the magnitude of the smoking association.
4a. (5 pts) Fit a multiple logistic regression model
predicting FMD from at least 3 predictors. 4b. (5 pts)
Report the adjusted odds ratios using tbl_regression().
mod_multi <- glm(fmd ~ exercise + age + sex + smoker + sleep_hrs + income_cat,
data = brfss_logistic,
family = binomial(link = "logit"))
mod_multi |>
tbl_regression(
exponentiate = TRUE,
label = list(
exercise ~ "Exercise (past 30 days)",
age ~ "Age (per year)",
sex ~ "Sex",
smoker ~ "Smoking status",
sleep_hrs ~ "Sleep hours (per hour)",
income_cat ~ "Income category (per unit)"
)
) |>
bold_labels() |>
bold_p() |>
modify_caption(
"**Table 4a. Adjusted Odds Ratios for Frequent Mental Distress**
Multiple logistic regression — BRFSS 2020 (n = 5,000)"
)| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Exercise (past 30 days) | |||
| No | — | — | |
| Yes | 0.61 | 0.51, 0.73 | <0.001 |
| Age (per year) | 0.97 | 0.97, 0.98 | <0.001 |
| Sex | |||
| Male | — | — | |
| Female | 1.67 | 1.42, 1.96 | <0.001 |
| Smoking status | |||
| Former/Never | — | — | |
| Current | 1.25 | 1.05, 1.48 | 0.012 |
| Sleep hours (per hour) | 0.81 | 0.77, 0.86 | <0.001 |
| Income category (per unit) | 0.85 | 0.82, 0.89 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.
# Crude ORs from simple models
crude_age <- tidy(mod_age, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "age") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Crude")
# Adjusted ORs from multiple model
adj_age <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "age") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Adjusted")
bind_rows(crude_age, adj_age) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
caption = "Crude vs. Adjusted Odds Ratios") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | 95% CI Lower | 95% CI Upper | Type |
|---|---|---|---|---|
| age | 0.974 | 0.97 | 0.979 | Crude |
| age | 0.975 | 0.97 | 0.980 | Adjusted |
4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean?
The crude OR for age was 0.974, and the adjusted OR was 0.975, showing almost no change after controlling for other predictors. Because the OR changed by far less than 10% and the interpretation (older adults have slightly lower odds of FMD) remained the same, there is no evidence of meaningful confounding. The stability of the estimate indicates that age’s association with FMD is not explained by the other variables in the model.
Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.
End of Lab Activity