Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)        # multinom() for polytomous regression
library(MASS)        # polr() for ordinal regression
library(brant)       # Brant test for proportional odds
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Part 5: In-Class Lab Activity

EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026

Instructions

Complete the four tasks below using brfss_polytomous_2020.rds. Submit a knitted HTML file via Brightspace.

Data

Variable Description Type
genhlth_ord General health (Excellent/VG < Good < Fair/Poor) Ordered factor (outcome)
genhlth_nom General health, unordered Factor (outcome)
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income (1-8) Numeric
smoker Former/Never vs. Current Factor
brfss_poly <- readRDS("C:/Users/joshm/Documents/UAlbany/Spring 2026/EPI 553/Labs/brfss_poly.rds")

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Outcome Distribution
General Health N %
Excellent/VG 2380 47.6
Good 1624 32.5
Fair/Poor 996 19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
  geom_bar(alpha = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "General health by smoking status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "Smoker",
    y = "Genhlth"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none")

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(age, sex, bmi, exercise),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      age           ~ "Age (years)",
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
Excellent/VG
N = 2,380
1
Good
N = 1,624
1
Fair/Poor
N = 996
1
p-value2
Age (years) 57 (16) 55 (16) 58 (16) 61 (15) <0.001
Sex



0.005
    Male 2,717 (54%) 1,315 (55%) 906 (56%) 496 (50%)
    Female 2,283 (46%) 1,065 (45%) 718 (44%) 500 (50%)
BMI 28 (6) 27 (5) 29 (6) 30 (8) <0.001
Exercise 3,630 (73%) 1,999 (84%) 1,161 (71%) 470 (47%) <0.001
1 Mean (SD); n (%)
2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise,
  data = brfss_poly,
  trace = FALSE
)

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

tidy(multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome Predictor RRR Lower Upper p
Good (Intercept) 0.180 0.117 0.278 9.49e-15
Good age 1.011 1.007 1.015 3.95e-08
Good sexFemale 0.941 0.826 1.071 0.356
Good bmi 1.045 1.034 1.057 1.45e-15
Good exerciseYes 0.515 0.440 0.602 < 2e-16
Fair/Poor (Intercept) 0.065 0.038 0.111 < 2e-16
Fair/Poor age 1.021 1.016 1.027 6.86e-16
Fair/Poor sexFemale 1.082 0.923 1.268 0.332
Fair/Poor bmi 1.061 1.048 1.074 < 2e-16
Fair/Poor exerciseYes 0.196 0.166 0.233 < 2e-16

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.

Holding age, sex, and BMI constant, those who exercise have 0.196 times the risk of being in Fair/Poor health vs. Excellent/VG compared to those who do not exercise.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise,
  data = brfss_poly,
  Hess = TRUE
)

3b. (5 pts) Report the cumulative ORs with 95% CIs.

tidy(ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Ordinal Logistic Regression: Cumulative ORs
Predictor OR Lower Upper
age 1.015 1.012 1.018
sexFemale 1.015 0.911 1.131
bmi 1.046 1.037 1.055
exerciseYes 0.312 0.276 0.353

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.

Those who exercise have 0.312 times the risk of reporting a worse health category compared to those who do not exercise at every cut point.

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

ggpredict(ord, terms = "age") |>
  plot() +
  labs(title = "Ordinal Model: Predicted Probability of Each Health Category",
       x = "Age", y = "Predicted Probability") +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?

brant_out <- brant(ord)
## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      20.67   4   0
## age      0.6 1   0.44
## sexFemale    2.5 1   0.11
## bmi      4.9 1   0.03
## exerciseYes  11.33   1   0
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

No, the assumption is violated as the omnibus p < 0.05.

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(multi), AIC(ord)),
  df    = c(multi$edf, length(coef(ord)) + length(ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model Comparison
Model AIC df
Multinomial 9784.6 10
Ordinal (proportional odds) 9796.4 6

The multinomial model fits better because it has a lower AIC.

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.

I would recommend the original multinomial model, because it has a lower AIC and the Brant Test was violated for the ordinal model. This suggests that the single coefficient is not sufficient to describe the predictor’s effect across all categories of the outcome. Also, the multinomial model’s interpretation is a bit easier to understand and less confusing.

Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.

End of Lab Activity