EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026
Complete the four tasks below using
brfss_polytomous_2020.rds. Submit a knitted HTML file via
Brightspace.
| Variable | Description | Type |
|---|---|---|
genhlth_ord |
General health (Excellent/VG < Good < Fair/Poor) | Ordered factor (outcome) |
genhlth_nom |
General health, unordered | Factor (outcome) |
age |
Age in years | Numeric |
sex |
Male / Female | Factor |
bmi |
Body mass index | Numeric |
exercise |
Exercised in past 30 days (No/Yes) | Factor |
income_cat |
Household income (1-8) | Numeric |
smoker |
Former/Never vs. Current | Factor |
library(tidyverse)
library(broom)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_full <- read_xpt("data/LLCP2020.XPT") |>
clean_names()#Clean data
brfss_poly <- brfss_full |>
mutate(
# 3-level ordinal outcome
genhlth_3 = case_when(
genhlth %in% c(1, 2) ~ "Excellent/VG",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor",
TRUE ~ NA_character_
),
# Ordered factor for ordinal regression
genhlth_ord = factor(genhlth_3,
levels = c("Excellent/VG", "Good", "Fair/Poor"),
ordered = TRUE),
# Unordered factor for polytomous regression
genhlth_nom = factor(genhlth_3,
levels = c("Excellent/VG", "Good", "Fair/Poor")),
# Predictors
age = age80,
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
smoker = factor(case_when(
smokday2 %in% c(1, 2) ~ "Current",
smokday2 == 3 ~ "Former/Never",
TRUE ~ NA_character_
), levels = c("Former/Never", "Current"))
) |>
filter(
!is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
!is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
)
set.seed(1220)
brfss_poly <- brfss_poly |>
dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
exercise, income_cat, smoker) |>
slice_sample(n = 5000)
saveRDS(brfss_poly,
"data/brfss_polytomous_2020.rds")
brfss_poly <- readRDS("data/brfss_polytomous_2020.rds")1a. (5 pts) Create a frequency table of
genhlth_ord showing N and percentage for each category.
brfss_poly |>
count(genhlth_ord) |>
mutate(pct = round(100 * n / sum(n), 1)) |>
kable(col.names = c("General Health", "N", "%"),
caption = "Outcome Distribution") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| General Health | N | % |
|---|---|---|
| Excellent/VG | 2380 | 47.6 |
| Good | 1624 | 32.5 |
| Fair/Poor | 996 | 19.9 |
1b. (5 pts) Create a stacked bar chart showing the
distribution of general health by smoker status.
ggplot(brfss_poly) +
geom_bar(aes(x=genhlth_ord, y= smoker, fill = smoker), stat="identity") +
theme_minimal() +
labs(x = "General Health", y = "Smoker Status", title = "General Health by Smoker Status")brfss_poly |>
group_by(genhlth_ord) |> mutate(percent = mean(smoker == "Current")) |>
mutate(percent = round(percent, digits = 2)) |>
ggplot(aes(x = genhlth_ord, y = percent, fill = percent)) +
geom_bar(position="dodge", stat="identity") +
geom_text(stat = "identity", aes(label = after_stat(y)), vjust = -0.3) +
labs(
title = "Distribution of General Health by Smoker Status",
subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
x = "General Health",
y = "Percentage as Current Smoker"
) +
theme_minimal() +
scale_y_continuous(labels=scales::percent) +
theme(legend.position = "none")1c. (5 pts) Use tbl_summary() to create
a descriptive table stratified by genhlth_ord with at least
4 predictors.
brfss_poly |>
tbl_summary(
by = genhlth_ord,
include = c(sex, bmi, exercise, smoker, income_cat),
type = list(
c(bmi, income_cat) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
income_cat ~ "Income category (1-8)",
smoker ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
Excellent/VG N = 2,3801 |
Good N = 1,6241 |
Fair/Poor N = 9961 |
p-value2 |
|---|---|---|---|---|---|
| Sex | 0.005 | ||||
| Male | 2,717 (54%) | 1,315 (55%) | 906 (56%) | 496 (50%) | |
| Female | 2,283 (46%) | 1,065 (45%) | 718 (44%) | 500 (50%) | |
| BMI | 28 (6) | 27 (5) | 29 (6) | 30 (8) | <0.001 |
| Exercise in past 30 days | 3,630 (73%) | 1,999 (84%) | 1,161 (71%) | 470 (47%) | <0.001 |
| Smoking status | <0.001 | ||||
| Former/Never | 3,304 (66%) | 1,692 (71%) | 1,015 (63%) | 597 (60%) | |
| Current | 1,696 (34%) | 688 (29%) | 609 (38%) | 399 (40%) | |
| Income category (1-8) | 5.78 (2.14) | 6.43 (1.84) | 5.64 (2.09) | 4.47 (2.23) | <0.001 |
| 1 n (%); Mean (SD) | |||||
| 2 Pearson’s Chi-squared test; Kruskal-Wallis rank sum test | |||||
2a. (5 pts) Fit a multinomial logistic regression
model predicting genhlth_nom from at least 4 predictors
using multinom().
mod_multi <- multinom(
genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
data = brfss_poly,
trace = FALSE
)
summary(mod_multi)## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise +
## income_cat + smoker, data = brfss_poly, trace = FALSE)
##
## Coefficients:
## (Intercept) age sexFemale bmi exerciseYes income_cat
## Good -1.317306 0.01497067 -0.1417121 0.05087197 -0.500636 -0.1693992
## Fair/Poor -1.283880 0.02579060 -0.1257209 0.06741312 -1.338926 -0.3932631
## smokerCurrent
## Good 0.4117899
## Fair/Poor 0.3436415
##
## Std. Errors:
## (Intercept) age sexFemale bmi exerciseYes income_cat
## Good 0.2689762 0.002189706 0.06782701 0.005768280 0.08178912 0.01748213
## Fair/Poor 0.3296881 0.002916356 0.08583048 0.006778269 0.09155273 0.02090747
## smokerCurrent
## Good 0.07723392
## Fair/Poor 0.09720041
##
## Residual Deviance: 9287.557
## AIC: 9315.557
2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.
tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3)) |>
kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Outcome | Predictor | RRR | Lower | Upper | p |
|---|---|---|---|---|---|
| Good | (Intercept) | 0.268 | 0.158 | 0.454 | 9.71e-07 |
| Good | age | 1.015 | 1.011 | 1.019 | 8.10e-12 |
| Good | sexFemale | 0.868 | 0.760 | 0.991 | 0.036679 |
| Good | bmi | 1.052 | 1.040 | 1.064 | < 2e-16 |
| Good | exerciseYes | 0.606 | 0.516 | 0.712 | 9.30e-10 |
| Good | income_cat | 0.844 | 0.816 | 0.874 | < 2e-16 |
| Good | smokerCurrent | 1.510 | 1.297 | 1.756 | 9.73e-08 |
| Fair/Poor | (Intercept) | 0.277 | 0.145 | 0.529 | 9.85e-05 |
| Fair/Poor | age | 1.026 | 1.020 | 1.032 | < 2e-16 |
| Fair/Poor | sexFemale | 0.882 | 0.745 | 1.043 | 0.142987 |
| Fair/Poor | bmi | 1.070 | 1.056 | 1.084 | < 2e-16 |
| Fair/Poor | exerciseYes | 0.262 | 0.219 | 0.314 | < 2e-16 |
| Fair/Poor | income_cat | 0.675 | 0.648 | 0.703 | < 2e-16 |
| Fair/Poor | smokerCurrent | 1.410 | 1.165 | 1.706 | 0.000407 |
2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison. Current smokers have an odds ratio of 1.41 for having poor health as compared to excellent or very good. The relative risk of being in the “Fair/Poor” category, rather than “Excellent/VG”, is about 41% higher among current smokers.
3a. (5 pts) Fit an ordinal logistic regression model
with the same predictors using polr().
mod_ord <- polr(
genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
data = brfss_poly,
Hess = TRUE
)
summary(mod_ord)## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat +
## smoker, data = brfss_poly, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## age 0.01797 0.001867 9.627
## sexFemale -0.12885 0.056835 -2.267
## bmi 0.04973 0.004534 10.967
## exerciseYes -0.92167 0.064426 -14.306
## income_cat -0.26967 0.014113 -19.107
## smokerCurrent 0.29313 0.064304 4.558
##
## Intercepts:
## Value Std. Error t value
## Excellent/VG|Good 0.0985 0.2200 0.4478
## Good|Fair/Poor 1.8728 0.2212 8.4650
##
## Residual Deviance: 9318.274
## AIC: 9334.274
3b. (5 pts) Report the cumulative ORs with 95% CIs.
tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
filter(coef.type == "coefficient") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
caption = "Ordinal Logistic Regression: Cumulative ORs") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | Lower | Upper |
|---|---|---|---|
| age | 1.018 | 1.014 | 1.022 |
| sexFemale | 0.879 | 0.786 | 0.983 |
| bmi | 1.051 | 1.042 | 1.060 |
| exerciseYes | 0.398 | 0.351 | 0.451 |
| income_cat | 0.764 | 0.743 | 0.785 |
| smokerCurrent | 1.341 | 1.182 | 1.521 |
3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property. Smoking has OR = 1.34, therefore current smokers have 1.34 times the odds of reporting Good or worse health (vs. Excellent/VG), AND 1.6 times the odds of reporting Fair/Poor (vs. Excellent/VG or Good).
3d. (10 pts) Use ggpredict() to plot
predicted probabilities of each health category across a continuous
predictor of your choice.
ggpredict(mod_ord, terms = "income_cat [1:8]") |>
plot() +
labs(title = "Ordinal Model: Predicted Probability of Each Health Category",
x = "Income Category", y = "Predicted Probability") +
theme_minimal()4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?
## --------------------------------------------
## Test for X2 df probability
## --------------------------------------------
## Omnibus 35.87 6 0
## age 0.06 1 0.81
## sexFemale 0.89 1 0.34
## bmi 7.1 1 0.01
## exerciseYes 10.58 1 0
## income_cat 10.54 1 0
## smokerCurrent 7.76 1 0.01
## --------------------------------------------
##
## H0: Parallel Regression Assumption holds
as.data.frame(brant_out) |>
tibble::rownames_to_column("Predictor") |>
mutate(
X2 = round(X2, 2),
probability = ifelse(probability < 0.001, "< 0.001", sprintf("%.3f", probability)),
Fails = ifelse(Predictor == "Omnibus",
ifelse(as.numeric(gsub("< ", "", probability)) < 0.05 | probability == "< 0.001", "Yes (overall)", "No (overall)"),
ifelse(probability == "< 0.001" | as.numeric(gsub("< ", "", probability)) < 0.05, "⚠ Yes", ""))
) |>
kable(
col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
caption = "Brant Test for Proportional Odds Assumption",
align = c("l", "r", "r", "r", "c")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
row_spec(0, bold = TRUE) |>
row_spec(1, bold = TRUE, background = "#f0f0f0")| Predictor | X² | df | p-value | Assumption Violated? |
|---|---|---|---|---|
| Omnibus | 35.87 | 6 | < 0.001 | Yes (overall) |
| age | 0.06 | 1 | 0.814 | |
| sexFemale | 0.89 | 1 | 0.345 | |
| bmi | 7.10 | 1 | 0.008 | ⚠ Yes |
| exerciseYes | 10.58 | 1 | 0.001 | ⚠ Yes |
| income_cat | 10.54 | 1 | 0.001 | ⚠ Yes |
| smokerCurrent | 7.76 | 1 | 0.005 | ⚠ Yes |
The assumptions are violated for the omnibus test. In addtition, bmi, exercise, income, and smoking status also violate the assumptions based on p<0.05.
4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?
tibble(
Model = c("Multinomial", "Ordinal (proportional odds)"),
AIC = c(AIC(mod_multi), AIC(mod_ord)),
df = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
mutate(AIC = round(AIC, 1)) |>
kable(caption = "Model Comparison") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Model | AIC | df |
|---|---|---|
| Multinomial | 9315.6 | 14 |
| Ordinal (proportional odds) | 9334.3 | 8 |
The multinomial models has a better fit based on the smaller AIC.
4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences. I would recommend reporting the multinomial model. This ordinal model has multiple violations and has a worsened AIC, suggesting that the multinomial model is a better fit.