Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)        # multinom() for polytomous regression
library(MASS)        # polr() for ordinal regression
library(brant)       # Brant test for proportional odds
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Part 5: In-Class Lab Activity

EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026

Instructions

Complete the four tasks below using brfss_polytomous_2020.rds. Submit a knitted HTML file via Brightspace.

Data

Variable	Description	Type
`genhlth_ord`	General health (Excellent/VG < Good < Fair/Poor)	Ordered factor (outcome)
`genhlth_nom`	General health, unordered	Factor (outcome)
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

brfss_poly <- readRDS("C:/Users/joshm/Documents/UAlbany/Spring 2026/EPI 553/Labs/brfss_poly.rds")

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Outcome Distribution
General Health	N	%
Excellent/VG	2380	47.6
Good	1624	32.5
Fair/Poor	996	19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
  geom_bar(alpha = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "General health by smoking status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Smoker",
    y = "Genhlth"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none")

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(age, sex, bmi, exercise),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	Excellent/VG N = 2,380¹	Good N = 1,624¹	Fair/Poor N = 996¹	p-value²
Age (years)	57 (16)	55 (16)	58 (16)	61 (15)	<0.001
Sex					0.005
Male	2,717 (54%)	1,315 (55%)	906 (56%)	496 (50%)
Female	2,283 (46%)	1,065 (45%)	718 (44%)	500 (50%)
BMI	28 (6)	27 (5)	29 (6)	30 (8)	<0.001
Exercise	3,630 (73%)	1,999 (84%)	1,161 (71%)	470 (47%)	<0.001
¹ Mean (SD); n (%)
² Kruskal-Wallis rank sum test; Pearson’s Chi-squared test

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise,
  data = brfss_poly,
  trace = FALSE
)

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

tidy(multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome	Predictor	RRR	Lower	Upper	p
Good	(Intercept)	0.180	0.117	0.278	9.49e-15
Good	age	1.011	1.007	1.015	3.95e-08
Good	sexFemale	0.941	0.826	1.071	0.356
Good	bmi	1.045	1.034	1.057	1.45e-15
Good	exerciseYes	0.515	0.440	0.602	< 2e-16
Fair/Poor	(Intercept)	0.065	0.038	0.111	< 2e-16
Fair/Poor	age	1.021	1.016	1.027	6.86e-16
Fair/Poor	sexFemale	1.082	0.923	1.268	0.332
Fair/Poor	bmi	1.061	1.048	1.074	< 2e-16
Fair/Poor	exerciseYes	0.196	0.166	0.233	< 2e-16

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.

Holding age, sex, and BMI constant, those who exercise have 0.196 times the risk of being in Fair/Poor health vs. Excellent/VG compared to those who do not exercise.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise,
  data = brfss_poly,
  Hess = TRUE
)

3b. (5 pts) Report the cumulative ORs with 95% CIs.

tidy(ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Ordinal Logistic Regression: Cumulative ORs
Predictor	OR	Lower	Upper
age	1.015	1.012	1.018
sexFemale	1.015	0.911	1.131
bmi	1.046	1.037	1.055
exerciseYes	0.312	0.276	0.353

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.

Those who exercise have 0.312 times the risk of reporting a worse health category compared to those who do not exercise at every cut point.

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

ggpredict(ord, terms = "age") |>
  plot() +
  labs(title = "Ordinal Model: Predicted Probability of Each Health Category",
       x = "Age", y = "Predicted Probability") +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?

brant_out <- brant(ord)

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      20.67   4   0
## age      0.6 1   0.44
## sexFemale    2.5 1   0.11
## bmi      4.9 1   0.03
## exerciseYes  11.33   1   0
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

No, the assumption is violated as the omnibus p < 0.05.

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(multi), AIC(ord)),
  df    = c(multi$edf, length(coef(ord)) + length(ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model Comparison
Model	AIC	df
Multinomial	9784.6	10
Ordinal (proportional odds)	9796.4	6

The multinomial model fits better because it has a lower AIC.

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.

I would recommend the original multinomial model, because it has a lower AIC and the Brant Test was violated for the ordinal model. This suggests that the single coefficient is not sufficient to describe the predictor’s effect across all categories of the outcome. Also, the multinomial model’s interpretation is a bit easier to understand and less confusing.

Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.

End of Lab Activity

Polytomous and Ordinal Logistic Regression

EPI 553 — Principles of Statistical Inference II

Josh Macera

April 16, 2026