library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_poly <- readRDS("brfss_polytomous_2020.rds")genhlth_ordbrfss_poly |>
count(genhlth_ord) |>
mutate(pct = round(100 * n / sum(n), 1)) |>
kable(
col.names = c("General Health Category", "N", "%"),
caption = "Table 1. Distribution of General Health (3-Level Ordinal Outcome)"
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| General Health Category | N | % |
|---|---|---|
| Excellent/VG | 2380 | 47.6 |
| Good | 1624 | 32.5 |
| Fair/Poor | 996 | 19.9 |
brfss_poly |>
filter(!is.na(smoker), !is.na(genhlth_ord)) |>
count(smoker, genhlth_ord) |>
group_by(smoker) |>
mutate(pct = n / sum(n)) |>
ggplot(aes(x = smoker, y = pct, fill = genhlth_ord)) +
geom_col(width = 0.6) +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(
values = c("Excellent/VG" = "steelblue", "Good" = "darkgreen", "Fair/Poor" = "red2"),
name = "General Health"
) +
labs(
title = "Distribution of General Health by Smoking Status",
x = "Smoking Status",
y = "Proportion"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "right")Current smokers show a notably higher proportion of Fair/Poor health and a lower proportion of Excellent/Very Good health compared to former/never smokers.
genhlth_ord (≥ 4
predictors)brfss_poly |>
dplyr::select(genhlth_ord, age, sex, bmi, exercise, income_cat, smoker) |>
tbl_summary(
by = genhlth_ord,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
label = list(
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercised past 30 days",
income_cat ~ "Income category (1–8)",
smoker ~ "Smoking status"
),
missing = "no"
) |>
add_overall() |>
add_p() |>
bold_p() |>
modify_caption("**Table 2. Participant Characteristics by General Health Status**")| Characteristic | Overall, N = 5,0001 | Excellent/VG, N = 2,3801 | Good, N = 1,6241 | Fair/Poor, N = 9961 | p-value2 |
|---|---|---|---|---|---|
| Age (years) | 57 (16) | 55 (16) | 58 (16) | 61 (15) | <0.001 |
| Sex | 0.005 | ||||
| Male | 2,717 (54%) | 1,315 (55%) | 906 (56%) | 496 (50%) | |
| Female | 2,283 (46%) | 1,065 (45%) | 718 (44%) | 500 (50%) | |
| BMI | 28 (6) | 27 (5) | 29 (6) | 30 (8) | <0.001 |
| Exercised past 30 days | 3,630 (73%) | 1,999 (84%) | 1,161 (71%) | 470 (47%) | <0.001 |
| Income category (1–8) | <0.001 | ||||
| 1 | 225 (4.5%) | 39 (1.6%) | 72 (4.4%) | 114 (11%) | |
| 2 | 271 (5.4%) | 69 (2.9%) | 92 (5.7%) | 110 (11%) | |
| 3 | 403 (8.1%) | 124 (5.2%) | 128 (7.9%) | 151 (15%) | |
| 4 | 512 (10%) | 174 (7.3%) | 188 (12%) | 150 (15%) | |
| 5 | 519 (10%) | 198 (8.3%) | 197 (12%) | 124 (12%) | |
| 6 | 714 (14%) | 354 (15%) | 249 (15%) | 111 (11%) | |
| 7 | 844 (17%) | 443 (19%) | 289 (18%) | 112 (11%) | |
| 8 | 1,512 (30%) | 979 (41%) | 409 (25%) | 124 (12%) | |
| Smoking status | <0.001 | ||||
| Former/Never | 3,304 (66%) | 1,692 (71%) | 1,015 (63%) | 597 (60%) | |
| Current | 1,696 (34%) | 688 (29%) | 609 (38%) | 399 (40%) | |
| 1 Mean (SD); n (%) | |||||
| 2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test | |||||
mod_multi <- multinom(
genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
data = brfss_poly,
trace = FALSE
)
summary(mod_multi)## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise +
## income_cat + smoker, data = brfss_poly, trace = FALSE)
##
## Coefficients:
## (Intercept) age sexFemale bmi exerciseYes income_cat
## Good -1.317306 0.01497067 -0.1417121 0.05087197 -0.500636 -0.1693992
## Fair/Poor -1.283880 0.02579060 -0.1257209 0.06741312 -1.338926 -0.3932631
## smokerCurrent
## Good 0.4117899
## Fair/Poor 0.3436415
##
## Std. Errors:
## (Intercept) age sexFemale bmi exerciseYes income_cat
## Good 0.2689762 0.002189706 0.06782701 0.005768280 0.08178912 0.01748213
## Fair/Poor 0.3296881 0.002916356 0.08583048 0.006778269 0.09155273 0.02090747
## smokerCurrent
## Good 0.07723392
## Fair/Poor 0.09720041
##
## Residual Deviance: 9287.557
## AIC: 9315.557
The model fits two simultaneous logit equations: “Good vs. Excellent/VG” and “Fair/Poor vs. Excellent/VG,” using Excellent/VG as the reference category.
tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
mutate(
across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3, eps = 0.001)
) |>
kable(
col.names = c("Outcome (vs. Excellent/VG)", "Predictor", "RRR", "95% CI Lower", "95% CI Upper", "p-value"),
caption = "Table 3. Multinomial Logistic Regression: Relative Risk Ratios"
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
pack_rows("Good vs. Excellent/VG", 1, 7) |>
pack_rows("Fair/Poor vs. Excellent/VG", 8, 14)| Outcome (vs. Excellent/VG) | Predictor | RRR | 95% CI Lower | 95% CI Upper | p-value |
|---|---|---|---|---|---|
| Good vs. Excellent/VG | |||||
| Good | (Intercept) | 0.268 | 0.158 | 0.454 | <0.001 |
| Good | age | 1.015 | 1.011 | 1.019 | <0.001 |
| Good | sexFemale | 0.868 | 0.760 | 0.991 | 0.0367 |
| Good | bmi | 1.052 | 1.040 | 1.064 | <0.001 |
| Good | exerciseYes | 0.606 | 0.516 | 0.712 | <0.001 |
| Good | income_cat | 0.844 | 0.816 | 0.874 | <0.001 |
| Good | smokerCurrent | 1.510 | 1.297 | 1.756 | <0.001 |
| Fair/Poor vs. Excellent/VG | |||||
| Fair/Poor | (Intercept) | 0.277 | 0.145 | 0.529 | <0.001 |
| Fair/Poor | age | 1.026 | 1.020 | 1.032 | <0.001 |
| Fair/Poor | sexFemale | 0.882 | 0.745 | 1.043 | 0.1430 |
| Fair/Poor | bmi | 1.070 | 1.056 | 1.084 | <0.001 |
| Fair/Poor | exerciseYes | 0.262 | 0.219 | 0.314 | <0.001 |
| Fair/Poor | income_cat | 0.675 | 0.648 | 0.703 | <0.001 |
| Fair/Poor | smokerCurrent | 1.410 | 1.165 | 1.706 | <0.001 |
Interpretation: Holding age, sex, BMI, exercise, and income constant, current smokers have a relative risk ratio of 1.41 (95% CI: 1.17–1.71) for reporting Fair/Poor health compared to Excellent/Very Good health, relative to former/never smokers. In other words, the odds of being in the Fair/Poor category rather than the Excellent/VG category are approximately 41% higher among current smokers than among former/never smokers, after adjustment.
mod_ord <- polr(
genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
data = brfss_poly,
Hess = TRUE,
method = "logistic"
)
summary(mod_ord)## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat +
## smoker, data = brfss_poly, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## age 0.01797 0.001867 9.627
## sexFemale -0.12885 0.056835 -2.267
## bmi 0.04973 0.004534 10.967
## exerciseYes -0.92167 0.064426 -14.306
## income_cat -0.26967 0.014113 -19.107
## smokerCurrent 0.29313 0.064304 4.558
##
## Intercepts:
## Value Std. Error t value
## Excellent/VG|Good 0.0985 0.2200 0.4478
## Good|Fair/Poor 1.8728 0.2212 8.4650
##
## Residual Deviance: 9318.274
## AIC: 9334.274
ci_ord <- confint(mod_ord)
ord_tbl <- tibble(
term = rownames(ci_ord),
conf.low = exp(ci_ord[, 1]),
conf.high = exp(ci_ord[, 2])
) |>
left_join(
tidy(mod_ord) |> filter(!str_detect(term, "\\|")) |>
dplyr::select(term, estimate),
by = "term"
) |>
mutate(estimate = exp(estimate))
bmi_or <- ord_tbl |> filter(term == "bmi")Interpretation: Each one-unit increase in BMI is associated with a cumulative OR of 1.05 (95% CI: 1.04–1.06) for being in a worse health category. Because this is a proportional odds model, this OR applies at every cut-point simultaneously — that is, each additional BMI unit multiplies the odds of reporting “Good or worse” (vs. Excellent/VG) and the odds of reporting “Fair/Poor” (vs. Good or better) by the same factor of approximately 1.05, after adjusting for all other covariates.
pred_bmi <- ggpredict(mod_ord, terms = "bmi [10:55 by=1]")
plot(pred_bmi) +
labs(
title = "Predicted Probabilities of General Health Category Across BMI",
subtitle = "Ordinal logistic regression model; other covariates held at mean/reference",
x = "BMI",
y = "Predicted Probability",
color = "Health Category"
) +
theme_minimal(base_size = 13)As BMI increases, the predicted probability of Excellent/VG health declines steadily while the probabilities of Good and Fair/Poor health rise, consistent with a positive association between BMI and worse self-rated health.
## --------------------------------------------
## Test for X2 df probability
## --------------------------------------------
## Omnibus 35.87 6 0
## age 0.06 1 0.81
## sexFemale 0.89 1 0.34
## bmi 7.1 1 0.01
## exerciseYes 10.58 1 0
## income_cat 10.54 1 0
## smokerCurrent 7.76 1 0.01
## --------------------------------------------
##
## H0: Parallel Regression Assumption holds
as.data.frame(brant_out) |>
tibble::rownames_to_column("Predictor") |>
mutate(
X2 = round(X2, 2),
probability = ifelse(probability < 0.001, "< 0.001",
sprintf("%.3f", probability)),
`Assumption Violated?` = case_when(
Predictor == "Omnibus" & (probability == "< 0.001" |
as.numeric(gsub("< ", "", probability)) < 0.05) ~ "Yes (overall)",
Predictor == "Omnibus" ~ "No (overall)",
probability == "< 0.001" |
as.numeric(gsub("< ", "", probability)) < 0.05 ~ "⚠ Yes",
TRUE ~ ""
)
) |>
kable(
col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
caption = "Table 5. Brant Test for Proportional Odds Assumption",
align = c("l", "r", "r", "r", "c")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
row_spec(0, bold = TRUE) |>
row_spec(1, bold = TRUE, background = "#f0f0f0")| Predictor | X² | df | p-value | Assumption Violated? |
|---|---|---|---|---|
| Omnibus | 35.87 | 6 | < 0.001 | Yes (overall) |
| age | 0.06 | 1 | 0.814 | |
| sexFemale | 0.89 | 1 | 0.345 | |
| bmi | 7.10 | 1 | 0.008 | ⚠ Yes |
| exerciseYes | 10.58 | 1 | 0.001 | ⚠ Yes |
| income_cat | 10.54 | 1 | 0.001 | ⚠ Yes |
| smokerCurrent | 7.76 | 1 | 0.005 | ⚠ Yes |
Interpretation: Examine the omnibus p-value first. If it is < 0.05, the proportional odds assumption is violated for the model as a whole, and individual predictor rows identify which variables are driving the violation.
tibble(
Model = c("Multinomial logistic", "Ordinal logistic (prop. odds)"),
AIC = c(AIC(mod_multi), AIC(mod_ord)),
`No. Parameters` = c(mod_multi$edf,
length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
mutate(AIC = round(AIC, 1)) |>
kable(caption = "Table 6. AIC Comparison: Multinomial vs. Ordinal Model") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Model | AIC | No. Parameters |
|---|---|---|
| Multinomial logistic | 9315.6 | 14 |
| Ordinal logistic (prop. odds) | 9334.3 | 8 |
Recommendation: If the Brant test omnibus p-value is ≥ 0.05, the ordinal logistic regression model should be preferred. It is more parsimonious (fewer parameters), yields a single, easily interpretable cumulative OR for each predictor, and, assuming proportional odds holds, will have a lower AIC. If the Brant test reveals a significant violation — particularly for key predictors such as
smokerorsexthe multinomial model is more appropriate because it does not impose the proportional odds constraint, at the cost of requiring separate interpretation for each outcome contrast. In that case, the flexibility of the multinomial model outweighs its added complexity.
End of Lab — EPI 553, April 16, 2026