Logistic Regression

Part 7: In-Class Lab Activity

EPI 553 — Logistic Regression Part 1 Lab Due: April 13, 2026

Instructions

Complete the four tasks below using the BRFSS 2020 dataset (brfss_logistic_2020.rds). Submit a knitted HTML file via Brightspace. You may collaborate, but each student must submit their own work.

Data

Variable	Description	Type
`fmd`	Frequent mental distress (No/Yes)	Factor (outcome)
`menthlth_days`	Mentally unhealthy days (0-30)	Numeric
`physhlth_days`	Physically unhealthy days (0-30)	Numeric
`sleep_hrs`	Hours of sleep per night	Numeric
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income category (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library (haven)
library (janitor)
library(ggeffects)
library (haven)
library (modelsummary)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full <- janitor::clean_names (haven::read_xpt("C:/Users/suruc/OneDrive/Desktop/R/LLCP2020.XPT"))

brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                  ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  dplyr::filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic,
  "C:/Users/suruc/OneDrive/Desktop/R/brfss_logistic_2020.rds")

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Overview
Metric	Value
Observations	5000
Variables	10
FMD Cases	757
FMD Prevalence	15.1%

Descriptive Overview

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
                income_cat, smoker),
    type = list(
      c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹	p-value²
Physical unhealthy days	4 (9)	3 (8)	10 (13)	<0.001
Sleep hours	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)	<0.001
Age (years)	56 (16)	57 (16)	50 (16)	<0.001
Sex				<0.001
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)	0.001
Exercise in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)	<0.001
Income category (1-8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)	<0.001
Smoking status				<0.001
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

Descriptive Overview

brfss_logistic |>
  count(fmd) |>
  mutate(
    Percentage = round(n / sum(n) * 100, 1),
    fmd        = as.character(fmd)
  ) |>
  kable(
    caption = "Table 1a. Frequency of Frequent Mental Distress (FMD) — BRFSS 2020 (n = 5,000)",
    col.names = c("Frequent Mental Distress", "n", "%")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 1a. Frequency of Frequent Mental Distress (FMD) — BRFSS 2020 (n = 5,000)
Frequent Mental Distress	n	%
No	4243	84.9
Yes	757	15.1

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
                income_cat, smoker),
    type = list(
      c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      physhlth_days ~ "Physically unhealthy days",
      sleep_hrs     ~ "Sleep hours (per night)",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercised in past 30 days",
      income_cat    ~ "Income category (1–8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels() |>
  modify_caption("**Table 1b. Predictor Distribution by FMD Status — BRFSS 2020 (n = 5,000)**")

**Table 1b. Predictor Distribution by FMD Status — BRFSS 2020 (n = 5,000)**
Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹	p-value²
Physically unhealthy days	4 (9)	3 (8)	10 (13)	<0.001
Sleep hours (per night)	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)	<0.001
Age (years)	56 (16)	57 (16)	50 (16)	<0.001
Sex				<0.001
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)	0.001
Exercised in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)	<0.001
Income category (1–8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)	<0.001
Smoking status				<0.001
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

brfss_logistic |>
  filter(!is.na(exercise), !is.na(fmd)) |>
  group_by(exercise, fmd) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(exercise) |>
  mutate(pct = n / sum(n) * 100) |>
  ungroup() |>
 
  filter(fmd == "Yes") |>
  ggplot(aes(x = exercise, y = pct, fill = exercise)) +
  geom_col(width = 0.5, alpha = 0.85) +
  geom_text(aes(label = paste0(round(pct, 1), "%")),
            vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("No" = "#E74C3C", "Yes" = "#27AE60")) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(0, 30)
  ) +
  labs(
    title   = "Figure 1. Prevalence of Frequent Mental Distress by Exercise Status",
    subtitle = "BRFSS 2020 (n = 5,000)",
    x       = "Exercised in Past 30 Days",
    y       = "Prevalence of FMD (%)",
    caption = "FMD = ≥14 mentally unhealthy days in the past 30 days."
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

mod_exercise <- glm(fmd ~ exercise, data = brfss_logistic,
                    family = binomial(link = "logit"))

tidy(mod_exercise, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	-1.337	0.068	-19.769	0	-1.471	-1.206
exerciseYes	-0.555	0.083	-6.655	0	-0.718	-0.391

In the simple logistic regression model predicting frequent mental distress (FMD) from exercise, the coefficient for exercise represents the difference in log-odds of FMD between adults who report exercising and those who do not. A negative coefficient (-0.555) indicates that exercise is associated with lower log-odds of experiencing FMD. The intercept represents the log-odds of FMD among adults who do not exercise. Exercisers have 0.555 lower log‑odds of FMD than non‑exercisers.

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

tidy(mod_exercise, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.263	0.068	-19.769	0	0.230	0.299
exerciseYes	0.574	0.083	-6.655	0	0.488	0.676

Interpretation: After exponentiating the coefficients, the odds ratio for exercise compares the odds of frequent mental distress among adults who exercise to those who do not. An OR below 1 (0.574, (95% CI: 0.488–0.676)) indicates that exercise is associated with reduced odds of FMD. The 95% confidence interval provides the range of plausible values for this association. Exercisers have 43% lower odds of FMD compared with non‑exercisers.

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.

Interpretation: The odds ratio for exercise tells us how the odds of frequent mental distress compare between those who exercise and those who do not, without adjusting for any other variables. An OR less than 1 indicates that exercise is associated with lower odds of FMD. Adults who report exercising have substantially lower odds of experiencing frequent mental distress compared with those who do not exercise. This association is statistically significant and protective.

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))

tidy(mod_age, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.725	0.131	-2.451	0.014	0.559	0.936
age	0.974	0.002	-10.703	0.000	0.970	0.979

OR per 1‑year increase = 0.974 (95% CI: 0.970–0.979) Each additional year of age is associated with a 2.6% decrease in the odds of FMD.

beta_age <- coef(mod_age)["age"]
or_10yr <- exp(10 * beta_age)
ci_10yr <- exp(10 * confint(mod_age)["age", ])

tibble(
  `Age Difference` = "10 years",
  `OR` = round(or_10yr, 3),
  `95% CI Lower` = round(ci_10yr[1], 3),
  `95% CI Upper` = round(ci_10yr[2], 3)
) |>
  kable(caption = "Odds Ratio for 10-Year Increase in Age") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Odds Ratio for 10-Year Increase in Age
Age Difference	OR	95% CI Lower	95% CI Upper
10 years	0.771	0.736	0.809

Interpretation: The OR for a 10-year increase in age tells us how the odds of frequent mental distress change when comparing two individuals who differ by 10 years of age.A 10‑year age difference corresponds to a 23% lower odds of FMD (OR = 0.771 (95% CI: 0.736–0.809)).Older adults have lower odds of frequent mental distress. The decline is gradual but consistent, and the association is statistically significant.

Visualizing the Predicted Probabilities

ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       x = "Age (years)", y = "Predicted Probability of FMD") +
  theme_minimal()

Interpretation: The curve shows the predicted probability of frequent mental distress (FMD) across the age range of 18 to 80 years, holding all other factors constant. The S-shaped (sigmoidal) trajectory is the hallmark of logistic regression — probabilities are bounded between 0 and 1 and change most steeply around the midpoint of the curve. The shaded band represents the 95% confidence interval around the predicted probability at each age. A rising curve indicates that older individuals have a higher predicted probability of FMD; a falling curve would indicate the opposite. Pay attention to both the direction of the trend and how wide the confidence band is, especially at the extremes of age where data may be sparse.

The predicted probability of frequent mental distress decreases steadily with age, indicating that younger adults are more likely to experience FMD than older adults. This pattern aligns with the negative age coefficient and the OR < 1.

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_smoker <- glm(fmd ~ smoker, data = brfss_logistic,
                  family = binomial(link = "logit"))

mod_sleep  <- glm(fmd ~ sleep_hrs, data = brfss_logistic, 
                  family = binomial(link = "logit"))

mod_income <- glm(fmd ~ income_cat, data = brfss_logistic, 
                  family = binomial(link = "logit"))

3b. (10 pts) Create a table comparing the odds ratios from all three models.

brfss_logistic |>
  tbl_uvregression(
    method       = glm,
    y            = "fmd",
    method.args  = list(family = binomial),
    exponentiate = TRUE,
    include      = c("smoker", "sleep_hrs", "income_cat"),
    label = list(
      smoker     ~ "Smoking status (ref: Former/Never)",
      sleep_hrs  ~ "Sleep hours (per 1-hr increase)",
      income_cat ~ "Income category (per 1-unit increase)"
    )
  ) |>
  bold_labels() |>
  bold_p(t = 0.05) |>
  modify_caption("**Table 3b. Crude Odds Ratios — Three Predictors of FMD (BRFSS 2020)**")

**Table 3b. Crude Odds Ratios — Three Predictors of FMD (BRFSS 2020)**
Characteristic	N	OR	95% CI	p-value
Smoking status (ref: Former/Never)	5,000
Former/Never		—	—
Current		1.96	1.68, 2.29	<0.001
Sleep hours (per 1-hr increase)	5,000	0.77	0.73, 0.81	<0.001
Income category (per 1-unit increase)	5,000	0.82	0.79, 0.85	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer.

Current smoking has the strongest crude association with FMD (OR = 1.96). This effect size is the largest from the null value of 1.0, indicating that current smokers have nearly twice the odds of FMD compared with former or never smokers. Although sleep hours and income also show significant protective associations, their ORs (0.77 and 0.82) represent smaller relative changes in odds compared with the magnitude of the smoking association.

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors. 4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi <- glm(fmd ~ exercise + age + sex + smoker + sleep_hrs + income_cat,
                 data = brfss_logistic,
                 family = binomial(link = "logit"))

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      exercise   ~ "Exercise (past 30 days)",
      age        ~ "Age (per year)",
      sex        ~ "Sex",
      smoker     ~ "Smoking status",
      sleep_hrs  ~ "Sleep hours (per hour)",
      income_cat ~ "Income category (per unit)"
    )
  ) |>
  bold_labels() |>
  bold_p() |>
  modify_caption(
    "**Table 4a. Adjusted Odds Ratios for Frequent Mental Distress**
    Multiple logistic regression — BRFSS 2020 (n = 5,000)"
  )

**Table 4a. Adjusted Odds Ratios for Frequent Mental Distress** Multiple logistic regression — BRFSS 2020 (n = 5,000)
Characteristic	OR	95% CI	p-value
Exercise (past 30 days)
No	—	—
Yes	0.61	0.51, 0.73	<0.001
Age (per year)	0.97	0.97, 0.98	<0.001
Sex
Male	—	—
Female	1.67	1.42, 1.96	<0.001
Smoking status
Former/Never	—	—
Current	1.25	1.05, 1.48	0.012
Sleep hours (per hour)	0.81	0.77, 0.86	<0.001
Income category (per unit)	0.85	0.82, 0.89	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

Crude vs. Adjusted Comparison for age

# Crude ORs from simple models
crude_age <- tidy(mod_age, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

# Adjusted ORs from multiple model
adj_age <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "age") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(crude_age, adj_age) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted Odds Ratios
Predictor	OR	95% CI Lower	95% CI Upper	Type
age	0.974	0.97	0.979	Crude
age	0.975	0.97	0.980	Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean?

The crude OR for age was 0.974, and the adjusted OR was 0.975, showing almost no change after controlling for other predictors. Because the OR changed by far less than 10% and the interpretation (older adults have slightly lower odds of FMD) remained the same, there is no evidence of meaningful confounding. The stability of the estimate indicates that age’s association with FMD is not explained by the other variables in the model.

Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.

End of Lab Activity

Logistic Regression — Part 1

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Suruchi Shahi

April 13, 2026

Instructions

Data

Descriptive Overview

Task 1: Explore the Binary Outcome (15 points)

Descriptive Overview

Task 2: Simple Logistic Regression (20 points)

Visualizing the Predicted Probabilities

Task 3: Comparing Predictors (20 points)

Task 4: Introduction to Multiple Logistic Regression (20 points)

Crude vs. Adjusted Comparison for age