library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_log <- readRDS("~/R/site-library/Epi553/code/brfss_logistic_2020.rds")

Task 1: Explore the Binary Outcome

1a. Frequency Table of FMD

brfss_log |>
  count(fmd) |>
  mutate(Percentage = round(100 * n / sum(n), 1)) |>
  rename(`FMD Status` = fmd, Count = n) |>
  kable(caption = "Frequency Table: Frequent Mental Distress (FMD)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Frequency Table: Frequent Mental Distress (FMD)
FMD Status	Count	Percentage
No	4243	84.9
Yes	757	15.1

Interpretation: The table shows the number and percentage of participants with and without frequent mental distress (14+ mentally unhealthy days in the past 30 days).

1b. Descriptive Summary by FMD Status

brfss_log |>
  tbl_summary(
    by = fmd,
    include = c(sleep_hrs, age, sex, exercise, smoker, bmi, income_cat),
    type = list(c(sleep_hrs, age, bmi, income_cat) ~ "continuous"),
    statistic = list(all_continuous() ~ "{mean} ({sd})"),
    label = list(
      sleep_hrs  ~ "Sleep hours per night",
      age        ~ "Age (years)",
      sex        ~ "Sex",
      exercise   ~ "Exercise in past 30 days",
      smoker     ~ "Smoking status",
      bmi        ~ "BMI",
      income_cat ~ "Income category (1–8)"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels() |>
  modify_caption("**Descriptive Summary by FMD Status**")

**Descriptive Summary by FMD Status**
Characteristic	Overall, N = 5,000¹	No, N = 4,243¹	Yes, N = 757¹	p-value²
Sleep hours per night	7.00 (1.48)	7.09 (1.40)	6.51 (1.83)	<0.001
Age (years)	56 (16)	57 (16)	50 (16)	<0.001
Sex				<0.001
Male	2,701 (54%)	2,378 (56%)	323 (43%)
Female	2,299 (46%)	1,865 (44%)	434 (57%)
Exercise in past 30 days	3,673 (73%)	3,192 (75%)	481 (64%)	<0.001
Smoking status				<0.001
Former/Never	3,280 (66%)	2,886 (68%)	394 (52%)
Current	1,720 (34%)	1,357 (32%)	363 (48%)
BMI	28.5 (6.3)	28.4 (6.2)	29.3 (7.0)	0.001
Income category (1–8)	5.85 (2.11)	6.00 (2.04)	5.05 (2.29)	<0.001
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

Interpretation: Participants with frequent mental distress sleep fewer hours, are younger on average, are more likely to be female, less likely to exercise, and more likely to be current smokers compared to those without FMD. All differences are statistically significant (p < 0.05).

1c. Bar Chart: Proportion of FMD by Exercise Status

brfss_log |>
  group_by(exercise, fmd) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(exercise) |>
  mutate(prop = n / sum(n)) |>
  filter(fmd == "Yes") |>
  ggplot(aes(x = exercise, y = prop, fill = exercise)) +
  geom_bar(stat = "identity", width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(prop * 100, 1), "%")),
            vjust = -0.5, size = 4.5, fontface = "bold") +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.35)) +
  scale_fill_manual(values = c("No" = "#d45f5f", "Yes" = "#4e9abf")) +
  labs(
    title = "Proportion with Frequent Mental Distress by Exercise Status",
    x = "Exercised in Past 30 Days",
    y = "Proportion with FMD"
  ) +
  theme_minimal(base_size = 13)

Interpretation: A higher proportion of individuals who did not exercise reported frequent mental distress compared to those who did exercise, suggesting a protective association between physical activity and mental health.

Task 2: Simple Logistic Regression

2a. Log-Odds Coefficients: FMD ~ Exercise

mod_exercise <- glm(fmd ~ exercise, data = brfss_log, family = binomial(link = "logit"))

tidy(mod_exercise) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)
term	estimate	std.error	statistic	p.value
(Intercept)	-1.3371	0.0676	-19.7689	0
exerciseYes	-0.5554	0.0835	-6.6545	0

Interpretation: The coefficient for exerciseYes is the change in log-odds of FMD associated with exercising versus not exercising. A negative coefficient indicates that exercise is associated with lower log-odds of FMD.

2b. Odds Ratios with 95% Confidence Intervals

mod_exercise |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(exercise ~ "Exercise in past 30 days")
  ) |>
  bold_labels() |>
  bold_p() |>
  modify_caption("**Simple Logistic Regression: FMD ~ Exercise (Odds Ratios)**")

**Simple Logistic Regression: FMD ~ Exercise (Odds Ratios)**
Characteristic	OR¹	95% CI¹	p-value
Exercise in past 30 days
No	—	—
Yes	0.57	0.49, 0.68	<0.001
¹ OR = Odds Ratio, CI = Confidence Interval

2c. Interpretation of the Odds Ratio for Exercise

The odds ratio for exerciseYes represents the odds of frequent mental distress among those who exercised in the past 30 days compared to those who did not exercise. An OR < 1 indicates that individuals who exercised have lower odds of experiencing frequent mental distress, suggesting a protective association between regular physical activity and mental health. For example, if OR = 0.60, individuals who exercised have approximately 40% lower odds of FMD compared to non-exercisers, after accounting for sampling variability.

2d. Predicted Probability of FMD Across Sleep Hours

mod_sleep <- glm(fmd ~ sleep_hrs, data = brfss_log, family = binomial(link = "logit"))

ggpredict(mod_sleep, terms = "sleep_hrs") |>
  plot() +
  labs(
    title = "Predicted Probability of Frequent Mental Distress by Sleep Hours",
    x = "Sleep Hours per Night",
    y = "Predicted Probability of FMD"
  ) +
  theme_minimal(base_size = 13)

Interpretation: The predicted probability of frequent mental distress decreases as sleep hours increase, consistent with well-established evidence linking insufficient sleep to poor mental health outcomes.

Task 3: Comparing Predictors

3a. Three Simple Logistic Regression Models

The three predictors chosen are sleep hours, smoking status, and income category.

mod_sleep   <- glm(fmd ~ sleep_hrs,   data = brfss_log, family = binomial)
mod_smoker  <- glm(fmd ~ smoker,      data = brfss_log, family = binomial)
mod_income  <- glm(fmd ~ income_cat,  data = brfss_log, family = binomial)

3b. Comparison Table of Odds Ratios

get_or <- function(model, term_label, term_name) {
  tidy(model, exponentiate = TRUE, conf.int = TRUE) |>
    filter(term == term_name) |>
    mutate(
      Predictor = term_label,
      OR        = round(estimate, 3),
      `95% CI`  = paste0("(", round(conf.low, 3), ", ", round(conf.high, 3), ")"),
      `p-value` = ifelse(p.value < 0.001, "< 0.001", round(p.value, 3))
    ) |>
    dplyr::select(Predictor, OR, `95% CI`, `p-value`)
}

bind_rows(
  get_or(mod_sleep,  "Sleep hours (per hour)",           "sleep_hrs"),
  get_or(mod_smoker, "Smoking: Current vs. Former/Never","smokerCurrent"),
  get_or(mod_income, "Income category (per unit)",       "income_cat")
) |>
  kable(caption = "Crude Odds Ratios from Three Simple Logistic Regression Models") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude Odds Ratios from Three Simple Logistic Regression Models
Predictor	OR	95% CI	p-value
Sleep hours (per hour)	0.765	(0.725, 0.807)	< 0.001
Smoking: Current vs. Former/Never	1.959	(1.675, 2.291)	< 0.001
Income category (per unit)	0.821	(0.793, 0.85)	< 0.001

3c. Strongest Crude Association with FMD

Among the three predictors,smoking status (current vs. former/never) has the strongest crude association with FMD, as evidenced by the odds ratio furthest from 1.0 and the narrowest confidence interval relative to its estimate. Current smokers have substantially higher odds of frequent mental distress than former/never smokers. Sleep hours and income are also significantly associated with FMD, but smoking status shows the largest magnitude of effect in this unadjusted analysis.

Task 4: Multiple Logistic Regression

4a. Multiple Logistic Regression Model

mod_multi <- glm(fmd ~ exercise + smoker + sleep_hrs + age + sex + income_cat,
                 data   = brfss_log,
                 family = binomial(link = "logit"))

4b. Adjusted Odds Ratios

mod_multi |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      exercise   ~ "Exercise in past 30 days",
      smoker     ~ "Smoking status",
      sleep_hrs  ~ "Sleep hours (per hour)",
      age        ~ "Age (per year)",
      sex        ~ "Sex",
      income_cat ~ "Income category (per unit)"
    )
  ) |>
  bold_labels() |>
  bold_p() |>
  modify_caption("**Multiple Logistic Regression: Adjusted Odds Ratios for FMD**")

**Multiple Logistic Regression: Adjusted Odds Ratios for FMD**
Characteristic	OR¹	95% CI¹	p-value
Exercise in past 30 days
No	—	—
Yes	0.61	0.51, 0.73	<0.001
Smoking status
Former/Never	—	—
Current	1.25	1.05, 1.48	0.012
Sleep hours (per hour)	0.81	0.77, 0.86	<0.001
Age (per year)	0.97	0.97, 0.98	<0.001
Sex
Male	—	—
Female	1.67	1.42, 1.96	<0.001
Income category (per unit)	0.85	0.82, 0.89	<0.001
¹ OR = Odds Ratio, CI = Confidence Interval

4c. Crude vs. Adjusted OR for Smoking Status

extract_or <- function(model, term_name, model_label) {
  tidy(model, exponentiate = TRUE, conf.int = TRUE) |>
    filter(term == term_name) |>
    mutate(
      Model    = model_label,
      OR       = round(estimate, 3),
      `95% CI` = paste0("(", round(conf.low, 3), ", ", round(conf.high, 3), ")"),
      `p-value` = ifelse(p.value < 0.001, "< 0.001", as.character(round(p.value, 3)))
    ) |>
    dplyr::select(Model, OR, `95% CI`, `p-value`)
}

bind_rows(
  extract_or(mod_smoker, "smokerCurrent", "Crude"),
  extract_or(mod_multi,  "smokerCurrent", "Adjusted")
) |>
  kable(caption = "Crude vs. Adjusted OR for Smoking Status") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Crude vs. Adjusted OR for Smoking Status
Model	OR	95% CI	p-value
Crude	1.959	(1.675, 2.291)	< 0.001
Adjusted	1.247	(1.05, 1.48)	0.012

4d. Confounding Assessment for Smoking Status

After adjusting for exercise, sleep hours, age, sex, and income category, the odds ratio for current smoking status attenuates somewhat toward the null compared to the crude OR. This attenuation indicates the presence of positive confounding other variables in the model (such as exercise and income, which are lower in current smokers) were partly explaining the crude smoking–FMD association. However, the adjusted OR remains elevated and statistically significant, indicating that smoking status retains an independent association with frequent mental distress even after accounting for these covariates.

End of Lab Activity

EPI 553 — Logistic Regression Part 1 Lab

Karo Ekor

April 13, 2026