library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_poly <- readRDS("brfss_polytomous_2020.rds")

Task 1: Explore the Outcome

1a. Frequency Table of genhlth_ord

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(
    col.names = c("General Health Category", "N", "%"),
    caption   = "Table 1. Distribution of General Health (3-Level Ordinal Outcome)"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 1. Distribution of General Health (3-Level Ordinal Outcome)
General Health Category N %
Excellent/VG 2380 47.6
Good 1624 32.5
Fair/Poor 996 19.9

1b. Stacked Bar Chart by Smoker Status

brfss_poly |>
  filter(!is.na(smoker), !is.na(genhlth_ord)) |>
  count(smoker, genhlth_ord) |>
  group_by(smoker) |>
  mutate(pct = n / sum(n)) |>
  ggplot(aes(x = smoker, y = pct, fill = genhlth_ord)) +
  geom_col(width = 0.6) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(
    values = c("Excellent/VG" = "steelblue", "Good" = "darkgreen", "Fair/Poor" = "red2"),
    name   = "General Health"
  ) +
  labs(
    title   = "Distribution of General Health by Smoking Status",
    x       = "Smoking Status",
    y       = "Proportion"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "right")

Stacked bar chart of general health distribution by smoker status

Current smokers show a notably higher proportion of Fair/Poor health and a lower proportion of Excellent/Very Good health compared to former/never smokers.

1c. Descriptive Table by genhlth_ord (≥ 4 predictors)

brfss_poly |>
  dplyr::select(genhlth_ord, age, sex, bmi, exercise, income_cat, smoker) |>
  tbl_summary(
    by        = genhlth_ord,
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      age        ~ "Age (years)",
      sex        ~ "Sex",
      bmi        ~ "BMI",
      exercise   ~ "Exercised past 30 days",
      income_cat ~ "Income category (1–8)",
      smoker     ~ "Smoking status"
    ),
    missing = "no"
  ) |>
  add_overall() |>
  add_p() |>
  bold_p() |>
  modify_caption("**Table 2. Participant Characteristics by General Health Status**")
Table 2. Participant Characteristics by General Health Status
Characteristic Overall, N = 5,0001 Excellent/VG, N = 2,3801 Good, N = 1,6241 Fair/Poor, N = 9961 p-value2
Age (years) 57 (16) 55 (16) 58 (16) 61 (15) <0.001
Sex



0.005
    Male 2,717 (54%) 1,315 (55%) 906 (56%) 496 (50%)
    Female 2,283 (46%) 1,065 (45%) 718 (44%) 500 (50%)
BMI 28 (6) 27 (5) 29 (6) 30 (8) <0.001
Exercised past 30 days 3,630 (73%) 1,999 (84%) 1,161 (71%) 470 (47%) <0.001
Income category (1–8)



<0.001
    1 225 (4.5%) 39 (1.6%) 72 (4.4%) 114 (11%)
    2 271 (5.4%) 69 (2.9%) 92 (5.7%) 110 (11%)
    3 403 (8.1%) 124 (5.2%) 128 (7.9%) 151 (15%)
    4 512 (10%) 174 (7.3%) 188 (12%) 150 (15%)
    5 519 (10%) 198 (8.3%) 197 (12%) 124 (12%)
    6 714 (14%) 354 (15%) 249 (15%) 111 (11%)
    7 844 (17%) 443 (19%) 289 (18%) 112 (11%)
    8 1,512 (30%) 979 (41%) 409 (25%) 124 (12%)
Smoking status



<0.001
    Former/Never 3,304 (66%) 1,692 (71%) 1,015 (63%) 597 (60%)
    Current 1,696 (34%) 688 (29%) 609 (38%) 399 (40%)
1 Mean (SD); n (%)
2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test

Task 2: Multinomial Logistic Regression

2a. Fit Multinomial Model

mod_multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data  = brfss_poly,
  trace = FALSE
)

summary(mod_multi)
## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise + 
##     income_cat + smoker, data = brfss_poly, trace = FALSE)
## 
## Coefficients:
##           (Intercept)        age  sexFemale        bmi exerciseYes income_cat
## Good        -1.317306 0.01497067 -0.1417121 0.05087197   -0.500636 -0.1693992
## Fair/Poor   -1.283880 0.02579060 -0.1257209 0.06741312   -1.338926 -0.3932631
##           smokerCurrent
## Good          0.4117899
## Fair/Poor     0.3436415
## 
## Std. Errors:
##           (Intercept)         age  sexFemale         bmi exerciseYes income_cat
## Good        0.2689762 0.002189706 0.06782701 0.005768280  0.08178912 0.01748213
## Fair/Poor   0.3296881 0.002916356 0.08583048 0.006778269  0.09155273 0.02090747
##           smokerCurrent
## Good         0.07723392
## Fair/Poor    0.09720041
## 
## Residual Deviance: 9287.557 
## AIC: 9315.557

The model fits two simultaneous logit equations: “Good vs. Excellent/VG” and “Fair/Poor vs. Excellent/VG,” using Excellent/VG as the reference category.

2b. Relative Risk Ratios with 95% CIs

tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(
    across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
    p.value = format.pval(p.value, digits = 3, eps = 0.001)
  ) |>
  kable(
    col.names = c("Outcome (vs. Excellent/VG)", "Predictor", "RRR", "95% CI Lower", "95% CI Upper", "p-value"),
    caption   = "Table 3. Multinomial Logistic Regression: Relative Risk Ratios"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  pack_rows("Good vs. Excellent/VG", 1, 7) |>
  pack_rows("Fair/Poor vs. Excellent/VG", 8, 14)
Table 3. Multinomial Logistic Regression: Relative Risk Ratios
Outcome (vs. Excellent/VG) Predictor RRR 95% CI Lower 95% CI Upper p-value
Good vs. Excellent/VG
Good (Intercept) 0.268 0.158 0.454 <0.001
Good age 1.015 1.011 1.019 <0.001
Good sexFemale 0.868 0.760 0.991 0.0367
Good bmi 1.052 1.040 1.064 <0.001
Good exerciseYes 0.606 0.516 0.712 <0.001
Good income_cat 0.844 0.816 0.874 <0.001
Good smokerCurrent 1.510 1.297 1.756 <0.001
Fair/Poor vs. Excellent/VG
Fair/Poor (Intercept) 0.277 0.145 0.529 <0.001
Fair/Poor age 1.026 1.020 1.032 <0.001
Fair/Poor sexFemale 0.882 0.745 1.043 0.1430
Fair/Poor bmi 1.070 1.056 1.084 <0.001
Fair/Poor exerciseYes 0.262 0.219 0.314 <0.001
Fair/Poor income_cat 0.675 0.648 0.703 <0.001
Fair/Poor smokerCurrent 1.410 1.165 1.706 <0.001

2c. Interpretation: Smoker (Fair/Poor vs. Excellent/VG)

Interpretation: Holding age, sex, BMI, exercise, and income constant, current smokers have a relative risk ratio of 1.41 (95% CI: 1.17–1.71) for reporting Fair/Poor health compared to Excellent/Very Good health, relative to former/never smokers. In other words, the odds of being in the Fair/Poor category rather than the Excellent/VG category are approximately 41% higher among current smokers than among former/never smokers, after adjustment.


Task 3: Ordinal Logistic Regression

3a. Fit Ordinal Model

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
  data   = brfss_poly,
  Hess   = TRUE,
  method = "logistic"
)

summary(mod_ord)
## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat + 
##     smoker, data = brfss_poly, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                  Value Std. Error t value
## age            0.01797   0.001867   9.627
## sexFemale     -0.12885   0.056835  -2.267
## bmi            0.04973   0.004534  10.967
## exerciseYes   -0.92167   0.064426 -14.306
## income_cat    -0.26967   0.014113 -19.107
## smokerCurrent  0.29313   0.064304   4.558
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   0.0985   0.2200     0.4478
## Good|Fair/Poor      1.8728   0.2212     8.4650
## 
## Residual Deviance: 9318.274 
## AIC: 9334.274

3b. Cumulative Odds Ratios with 95% CIs

ci_ord <- confint(mod_ord)
ord_tbl <- tibble(
  term      = rownames(ci_ord),
  conf.low  = exp(ci_ord[, 1]),
  conf.high = exp(ci_ord[, 2])
) |>
  left_join(
    tidy(mod_ord) |> filter(!str_detect(term, "\\|")) |>
      dplyr::select(term, estimate),
    by = "term"
  ) |>
  mutate(estimate = exp(estimate))

bmi_or <- ord_tbl |> filter(term == "bmi")

3c. Interpretation of One Cumulative OR

Interpretation: Each one-unit increase in BMI is associated with a cumulative OR of 1.05 (95% CI: 1.04–1.06) for being in a worse health category. Because this is a proportional odds model, this OR applies at every cut-point simultaneously — that is, each additional BMI unit multiplies the odds of reporting “Good or worse” (vs. Excellent/VG) and the odds of reporting “Fair/Poor” (vs. Good or better) by the same factor of approximately 1.05, after adjusting for all other covariates.

3d. Predicted Probabilities Across BMI

pred_bmi <- ggpredict(mod_ord, terms = "bmi [10:55 by=1]")

plot(pred_bmi) +
  labs(
    title    = "Predicted Probabilities of General Health Category Across BMI",
    subtitle = "Ordinal logistic regression model; other covariates held at mean/reference",
    x        = "BMI",
    y        = "Predicted Probability",
    color    = "Health Category"
  ) +
  theme_minimal(base_size = 13)

Predicted probabilities of each health category across BMI values

As BMI increases, the predicted probability of Excellent/VG health declines steadily while the probabilities of Good and Fair/Poor health rise, consistent with a positive association between BMI and worse self-rated health.


Task 4: Check Assumptions and Compare Models

4a. Brant Test for Proportional Odds

brant_out <- brant(mod_ord)
## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      35.87   6   0
## age      0.06    1   0.81
## sexFemale    0.89    1   0.34
## bmi      7.1 1   0.01
## exerciseYes  10.58   1   0
## income_cat   10.54   1   0
## smokerCurrent    7.76    1   0.01
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
as.data.frame(brant_out) |>
  tibble::rownames_to_column("Predictor") |>
  mutate(
    X2          = round(X2, 2),
    probability = ifelse(probability < 0.001, "< 0.001",
                         sprintf("%.3f", probability)),
    `Assumption Violated?` = case_when(
      Predictor == "Omnibus" & (probability == "< 0.001" |
        as.numeric(gsub("< ", "", probability)) < 0.05) ~ "Yes (overall)",
      Predictor == "Omnibus" ~ "No (overall)",
      probability == "< 0.001" |
        as.numeric(gsub("< ", "", probability)) < 0.05  ~ "⚠ Yes",
      TRUE ~ ""
    )
  ) |>
  kable(
    col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
    caption   = "Table 5. Brant Test for Proportional Odds Assumption",
    align     = c("l", "r", "r", "r", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE) |>
  row_spec(1, bold = TRUE, background = "#f0f0f0")
Table 5. Brant Test for Proportional Odds Assumption
Predictor df p-value Assumption Violated?
Omnibus 35.87 6 < 0.001 Yes (overall)
age 0.06 1 0.814
sexFemale 0.89 1 0.345
bmi 7.10 1 0.008 ⚠ Yes
exerciseYes 10.58 1 0.001 ⚠ Yes
income_cat 10.54 1 0.001 ⚠ Yes
smokerCurrent 7.76 1 0.005 ⚠ Yes

Interpretation: Examine the omnibus p-value first. If it is < 0.05, the proportional odds assumption is violated for the model as a whole, and individual predictor rows identify which variables are driving the violation.

4b. AIC Comparison

tibble(
  Model = c("Multinomial logistic", "Ordinal logistic (prop. odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  `No. Parameters` = c(mod_multi$edf,
                        length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Table 6. AIC Comparison: Multinomial vs. Ordinal Model") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Table 6. AIC Comparison: Multinomial vs. Ordinal Model
Model AIC No. Parameters
Multinomial logistic 9315.6 14
Ordinal logistic (prop. odds) 9334.3 8

4c. Model Recommendation

Recommendation: If the Brant test omnibus p-value is ≥ 0.05, the ordinal logistic regression model should be preferred. It is more parsimonious (fewer parameters), yields a single, easily interpretable cumulative OR for each predictor, and, assuming proportional odds holds, will have a lower AIC. If the Brant test reveals a significant violation — particularly for key predictors such as smoker or sex the multinomial model is more appropriate because it does not impose the proportional odds constraint, at the cost of requiring separate interpretation for each outcome contrast. In that case, the flexibility of the multinomial model outweighs its added complexity.


End of Lab — EPI 553, April 16, 2026