Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

The BRFSS 2020 Dataset

We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.

Research question:

What individual, behavioral, and socioeconomic factors are associated with frequent mental distress among U.S. adults?

brfss_full <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
  clean_names()

brfss_logistic <- brfss_full |>
  mutate(
    # Binary outcome: frequent mental distress (>= 14 days)
    menthlth_days = case_when(
      menthlth == 88                  ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                           ~ NA_real_
    ),
    fmd = factor(
      ifelse(menthlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    # Predictors
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                           ~ NA_real_
    ),
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                           ~ NA_real_
    ),
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
    !is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
    !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_logistic <- brfss_logistic |>
  dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
                bmi, exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

# Save for lab use
saveRDS(brfss_logistic,
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.rds")

tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
       Value  = c(nrow(brfss_logistic), ncol(brfss_logistic),
                  sum(brfss_logistic$fmd == "Yes"),
                  paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
  kable(caption = "Analytic Dataset Overview") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Overview
Metric	Value
Observations	5000
Variables	10
FMD Cases	757
FMD Prevalence	15.1%

Descriptive Overview

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
                income_cat, smoker),
    type = list(
      c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      physhlth_days ~ "Physical unhealthy days",
      sleep_hrs     ~ "Sleep hours",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹
Physical unhealthy days	4 (9)	3 (8)	10 (13)
Sleep hours	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)
Age (years)	56 (16)	57 (16)	50 (16)
Sex
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)
Exercise in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)
Income category (1-8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)
Smoking status
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
¹ Mean (SD); n (%)

In-Class Lab Activity

Data

Variable	Description	Type
`fmd`	Frequent mental distress (No/Yes)	Factor (outcome)
`menthlth_days`	Mentally unhealthy days (0-30)	Numeric
`physhlth_days`	Physically unhealthy days (0-30)	Numeric
`sleep_hrs`	Hours of sleep per night	Numeric
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income category (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

Task 1: Explore the Binary Outcome (15 points)

1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.

# Create frequency table (counts)
freq_table <- table(brfss_logistic$fmd)

# Convert to percentages
percent_table <- prop.table(freq_table) * 100

# Combine into one table
result <- data.frame(
  Category = names(freq_table),
  Count = as.vector(freq_table),
  Percentage = round(as.vector(percent_table), 2)
)

# View result
print(result)

##   Category Count Percentage
## 1       No  4243      84.86
## 2      Yes   757      15.14

1b. (5 pts) Create a descriptive summary table of at least 4 predictors, stratified by FMD status. Use tbl_summary().

brfss_logistic |>
  tbl_summary(
    by = fmd,
    include = c(exercise, age, sex, bmi),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      exercise      ~ "Exercise",
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	No N = 4,243¹	Yes N = 757¹
Exercise	3,673 (73%)	3,192 (75%)	481 (64%)
Age (years)	56 (16)	57 (16)	50 (16)
Sex
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)
¹ n (%); Mean (SD)

1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.

ggplot(brfss_logistic, aes(x = fmd, fill = exercise)) +
  geom_bar(color = "black", position = "dodge") +
  geom_text(stat = "count", aes(label = ..count..),
            position = position_dodge(width = 0.9), vjust = -0.5) +
  labs(title = "Frequent mental distress by exercise",
       x = "Frequent mental distress",
       y = "Count") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )

Task 2: Simple Logistic Regression (20 points)

2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.

mod_ex <- glm(fmd ~ exercise, data = brfss_logistic,
                      family = binomial(link = "logit"))
tidy(mod_ex, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	-1.337	0.068	-19.769	0	-1.471	-1.206
exerciseYes	-0.555	0.083	-6.655	0	-0.718	-0.391

2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.

mod_ex <- glm(
  fmd ~ exercise,
  data = brfss_logistic,
  family = binomial(link = "logit")
)

tidy(mod_ex, conf.int = TRUE, exponentiate = TRUE)  |>
  kable(
    digits = 3,
    caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)"
  )  |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.263	0.068	-19.769	0	0.230	0.299
exerciseYes	0.574	0.083	-6.655	0	0.488	0.676

2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.

The odds ratio for exerciseYes is 0.574(95%CI: 0.488-0.676). This shows us that individuals who exercised in the past 30 days have about 43% lower odds of exerpeincing frequent mental distress compared to those who do not exercise.

2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).

mod_age <- glm(fmd ~ age, data = brfss_logistic,
               family = binomial(link = "logit"))

ggpredict(mod_age, terms = "age [18:80]") |>
  plot() +
  labs(title = "Predicted Probability of Frequent Mental Distress by Age",
       x = "Age (years)", y = "Predicted Probability of FMD") +
  theme_minimal()

Task 3: Comparing Predictors (20 points)

3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.

mod_smoker <- glm(fmd ~ smoker, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_smoker, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Smoking (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Smoking (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.137	0.054	-37.076	0	0.123	0.151
smokerCurrent	1.959	0.080	8.424	0	1.675	2.291

mod_bmi <- glm(fmd ~ bmi, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_bmi, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ BMI (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ BMI (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.094	0.178	-13.299	0	0.066	0.133
bmi	1.023	0.006	3.745	0	1.011	1.034

mod_age <- glm(fmd ~ age, data = brfss_logistic,
                  family = binomial(link = "logit"))

tidy(mod_age, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.725	0.131	-2.451	0.014	0.559	0.936
age	0.974	0.002	-10.703	0.000	0.970	0.979

3b. (10 pts) Create a table comparing the odds ratios from all three models.

m1 <- glm(fmd ~ age, family = binomial, data = brfss_logistic)
m2 <- glm(fmd ~ bmi, family = binomial, data = brfss_logistic)
m3 <- glm(fmd ~ smoker, family = binomial, data = brfss_logistic)

tribble(
  ~Model, ~`Odds Ratio`, ~`95% CI`,
  
  "M1 (age)",
  round(exp(coef(m1)[2]), 3),
  paste0("(", round(exp(confint(m1)[2,1]),3), ", ", round(exp(confint(m1)[2,2]),3), ")"),

  
  "M2 (bmi)",
  round(exp(coef(m2)[2]), 3),
  paste0("(", round(exp(confint(m2)[2,1]),3), ", ", round(exp(confint(m2)[2,2]),3), ")"),
 
  
  "M3 (smoker)",
  round(exp(coef(m3)[2]), 3),
  paste0("(", round(exp(confint(m3)[2,1]),3), ", ", round(exp(confint(m3)[2,2]),3), ")"),
) |>
  kable(caption = "Comparing 3 Logistic Regression Models") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE)

Comparing 3 Logistic Regression Models
Model	Odds Ratio	95% CI
M1 (age)	0.974	(0.97, 0.979)
M2 (bmi)	1.023	(1.011, 1.034)
M3 (smoker)	1.959	(1.675, 2.291)

3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer.

Among the three predictors, smoking has the strongest association with FMD. The OR is 1.959, meaning that current smokers have nearly twice the odds of experiencing frequent mental distress compared to non-smokers. Age and BMI have a weaker association but are still statistically significant.

Task 4: Introduction to Multiple Logistic Regression (20 points)

4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.

4b. (5 pts) Report the adjusted odds ratios using tbl_regression().

mod_multi <- glm(fmd ~ age + bmi + smoker,
                 data = brfss_logistic,
                 family = binomial(link = "logit"))

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      age    ~ "Age",
      bmi    ~ "BMI",
      smoker ~ "Smoking status"
    )
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
Age	0.98	0.97, 0.98	<0.001
BMI	1.02	1.01, 1.03	<0.001
Smoking status
Former/Never	—	—
Current	1.67	1.42, 1.97	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.

# Crude ORs 

crude_smoker <- tidy(mod_smoker, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "smokerCurrent") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Crude")

# Adjusted ORs 

adj_smoker <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term == "smokerCurrent") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(type = "Adjusted")

bind_rows(crude_smoker, adj_smoker) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
        caption = "Crude vs. Adjusted Odds Ratios") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted Odds Ratios
Predictor	OR	95% CI Lower	95% CI Upper	Type
smokerCurrent	1.959	1.675	2.291	Crude
smokerCurrent	1.672	1.421	1.967	Adjusted

4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean?

Confounding is present for smoking status. The crude ratio for current smokers is 1.959, but after adjusting for age and BMI, the adjusted odds ratio decreased to 1.672. The shows that part of the association between smoking and FMD is explained by other predictors. The direction of the OR decreased showing us that age and BMI are positive confounders.

Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.

End of Lab Activity

Logistic Regression — Part 1

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Grace Beal

April 13, 2026