Part 5: In-Class Lab Activity

EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026

Instructions

Complete the four tasks below using brfss_polytomous_2020.rds. Submit a knitted HTML file via Brightspace.

Data

Variable	Description	Type
`genhlth_ord`	General health (Excellent/VG < Good < Fair/Poor)	Ordered factor (outcome)
`genhlth_nom`	General health, unordered	Factor (outcome)
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

library(tidyverse)
library(broom)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full <- read_xpt("data/LLCP2020.XPT") |>
  clean_names()

#Clean data

brfss_poly <- brfss_full |>
  mutate(
    # 3-level ordinal outcome
    genhlth_3 = case_when(
      genhlth %in% c(1, 2) ~ "Excellent/VG",
      genhlth == 3         ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor",
      TRUE                 ~ NA_character_
    ),
    # Ordered factor for ordinal regression
    genhlth_ord = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor"),
      ordered = TRUE),
    # Unordered factor for polytomous regression
    genhlth_nom = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor")),
    # Predictors
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
    !is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_poly <- brfss_poly |>
  dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
                exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

saveRDS(brfss_poly,
  "data/brfss_polytomous_2020.rds")

brfss_poly <- readRDS("data/brfss_polytomous_2020.rds")

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Outcome Distribution
General Health	N	%
Excellent/VG	2380	47.6
Good	1624	32.5
Fair/Poor	996	19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly) +
  geom_bar(aes(x=genhlth_ord, y= smoker, fill = smoker), stat="identity") +
  theme_minimal() +
  labs(x = "General Health", y = "Smoker Status", title = "General Health by Smoker Status")

brfss_poly |>
group_by(genhlth_ord) |> mutate(percent = mean(smoker == "Current")) |>
mutate(percent = round(percent, digits = 2)) |>
ggplot(aes(x = genhlth_ord, y = percent, fill = percent)) +
  geom_bar(position="dodge", stat="identity") +
  geom_text(stat = "identity", aes(label = after_stat(y)), vjust = -0.3) +
  labs(
    title = "Distribution of General Health by Smoker Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "General Health",
    y = "Percentage as Current Smoker"
  ) +
  theme_minimal() +
  scale_y_continuous(labels=scales::percent) +
  theme(legend.position = "none")

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(sex, bmi, exercise, smoker, income_cat),
    type = list(
      c(bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	Excellent/VG N = 2,380¹	Good N = 1,624¹	Fair/Poor N = 996¹	p-value²
Sex					0.005
Male	2,717 (54%)	1,315 (55%)	906 (56%)	496 (50%)
Female	2,283 (46%)	1,065 (45%)	718 (44%)	500 (50%)
BMI	28 (6)	27 (5)	29 (6)	30 (8)	<0.001
Exercise in past 30 days	3,630 (73%)	1,999 (84%)	1,161 (71%)	470 (47%)	<0.001
Smoking status					<0.001
Former/Never	3,304 (66%)	1,692 (71%)	1,015 (63%)	597 (60%)
Current	1,696 (34%)	688 (29%)	609 (38%)	399 (40%)
Income category (1-8)	5.78 (2.14)	6.43 (1.84)	5.64 (2.09)	4.47 (2.23)	<0.001
¹ n (%); Mean (SD)
² Pearson’s Chi-squared test; Kruskal-Wallis rank sum test

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

mod_multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  trace = FALSE
)

summary(mod_multi)

## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise + 
##     income_cat + smoker, data = brfss_poly, trace = FALSE)
## 
## Coefficients:
##           (Intercept)        age  sexFemale        bmi exerciseYes income_cat
## Good        -1.317306 0.01497067 -0.1417121 0.05087197   -0.500636 -0.1693992
## Fair/Poor   -1.283880 0.02579060 -0.1257209 0.06741312   -1.338926 -0.3932631
##           smokerCurrent
## Good          0.4117899
## Fair/Poor     0.3436415
## 
## Std. Errors:
##           (Intercept)         age  sexFemale         bmi exerciseYes income_cat
## Good        0.2689762 0.002189706 0.06782701 0.005768280  0.08178912 0.01748213
## Fair/Poor   0.3296881 0.002916356 0.08583048 0.006778269  0.09155273 0.02090747
##           smokerCurrent
## Good         0.07723392
## Fair/Poor    0.09720041
## 
## Residual Deviance: 9287.557 
## AIC: 9315.557

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome	Predictor	RRR	Lower	Upper	p
Good	(Intercept)	0.268	0.158	0.454	9.71e-07
Good	age	1.015	1.011	1.019	8.10e-12
Good	sexFemale	0.868	0.760	0.991	0.036679
Good	bmi	1.052	1.040	1.064	< 2e-16
Good	exerciseYes	0.606	0.516	0.712	9.30e-10
Good	income_cat	0.844	0.816	0.874	< 2e-16
Good	smokerCurrent	1.510	1.297	1.756	9.73e-08
Fair/Poor	(Intercept)	0.277	0.145	0.529	9.85e-05
Fair/Poor	age	1.026	1.020	1.032	< 2e-16
Fair/Poor	sexFemale	0.882	0.745	1.043	0.142987
Fair/Poor	bmi	1.070	1.056	1.084	< 2e-16
Fair/Poor	exerciseYes	0.262	0.219	0.314	< 2e-16
Fair/Poor	income_cat	0.675	0.648	0.703	< 2e-16
Fair/Poor	smokerCurrent	1.410	1.165	1.706	0.000407

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison. Current smokers have an odds ratio of 1.41 for having poor health as compared to excellent or very good. The relative risk of being in the “Fair/Poor” category, rather than “Excellent/VG”, is about 41% higher among current smokers.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  Hess = TRUE
)

summary(mod_ord)

## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat + 
##     smoker, data = brfss_poly, Hess = TRUE)
## 
## Coefficients:
##                  Value Std. Error t value
## age            0.01797   0.001867   9.627
## sexFemale     -0.12885   0.056835  -2.267
## bmi            0.04973   0.004534  10.967
## exerciseYes   -0.92167   0.064426 -14.306
## income_cat    -0.26967   0.014113 -19.107
## smokerCurrent  0.29313   0.064304   4.558
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   0.0985   0.2200     0.4478
## Good|Fair/Poor      1.8728   0.2212     8.4650
## 
## Residual Deviance: 9318.274 
## AIC: 9334.274

3b. (5 pts) Report the cumulative ORs with 95% CIs.

tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Ordinal Logistic Regression: Cumulative ORs
Predictor	OR	Lower	Upper
age	1.018	1.014	1.022
sexFemale	0.879	0.786	0.983
bmi	1.051	1.042	1.060
exerciseYes	0.398	0.351	0.451
income_cat	0.764	0.743	0.785
smokerCurrent	1.341	1.182	1.521

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property. Smoking has OR = 1.34, therefore current smokers have 1.34 times the odds of reporting Good or worse health (vs. Excellent/VG), AND 1.6 times the odds of reporting Fair/Poor (vs. Excellent/VG or Good).

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

ggpredict(mod_ord, terms = "income_cat [1:8]") |>
  plot() +
  labs(title = "Ordinal Model: Predicted Probability of Each Health Category",
       x = "Income Category", y = "Predicted Probability") +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?

brant_out <- brant(mod_ord)

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      35.87   6   0
## age      0.06    1   0.81
## sexFemale    0.89    1   0.34
## bmi      7.1 1   0.01
## exerciseYes  10.58   1   0
## income_cat   10.54   1   0
## smokerCurrent    7.76    1   0.01
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

as.data.frame(brant_out) |>
  tibble::rownames_to_column("Predictor") |>
  mutate(
    X2          = round(X2, 2),
    probability = ifelse(probability < 0.001, "< 0.001", sprintf("%.3f", probability)),
    Fails       = ifelse(Predictor == "Omnibus",
                         ifelse(as.numeric(gsub("< ", "", probability)) < 0.05 | probability == "< 0.001", "Yes (overall)", "No (overall)"),
                         ifelse(probability == "< 0.001" | as.numeric(gsub("< ", "", probability)) < 0.05, "⚠ Yes", ""))
  ) |>
  kable(
    col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
    caption   = "Brant Test for Proportional Odds Assumption",
    align     = c("l", "r", "r", "r", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE) |>
  row_spec(1, bold = TRUE, background = "#f0f0f0")

Brant Test for Proportional Odds Assumption
Predictor	X²	df	p-value	Assumption Violated?
Omnibus	35.87	6	< 0.001	Yes (overall)
age	0.06	1	0.814
sexFemale	0.89	1	0.345
bmi	7.10	1	0.008	⚠ Yes
exerciseYes	10.58	1	0.001	⚠ Yes
income_cat	10.54	1	0.001	⚠ Yes
smokerCurrent	7.76	1	0.005	⚠ Yes

The assumptions are violated for the omnibus test. In addtition, bmi, exercise, income, and smoking status also violate the assumptions based on p<0.05.

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  df    = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model Comparison
Model	AIC	df
Multinomial	9315.6	14
Ordinal (proportional odds)	9334.3	8

The multinomial models has a better fit based on the smaller AIC.

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences. I would recommend reporting the multinomial model. This ordinal model has multiple violations and has a worsened AIC, suggesting that the multinomial model is a better fit.

Polytomous and Ordinal Logistic Regression

EPI 553 — Principles of Statistical Inference II

Matthew Goldman

April 16, 2026