Part 5: In-Class Lab Activity

EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026

Instructions

Complete the four tasks below using brfss_polytomous_2020.rds. Submit a knitted HTML file via Brightspace.

Data

Variable Description Type
genhlth_ord General health (Excellent/VG < Good < Fair/Poor) Ordered factor (outcome)
genhlth_nom General health, unordered Factor (outcome)
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income (1-8) Numeric
smoker Former/Never vs. Current Factor
library(tidyverse)
library(broom)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full <- read_xpt("data/LLCP2020.XPT") |>
  clean_names()

#Clean data

brfss_poly <- brfss_full |>
  mutate(
    # 3-level ordinal outcome
    genhlth_3 = case_when(
      genhlth %in% c(1, 2) ~ "Excellent/VG",
      genhlth == 3         ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor",
      TRUE                 ~ NA_character_
    ),
    # Ordered factor for ordinal regression
    genhlth_ord = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor"),
      ordered = TRUE),
    # Unordered factor for polytomous regression
    genhlth_nom = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor")),
    # Predictors
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
    !is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_poly <- brfss_poly |>
  dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
                exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

saveRDS(brfss_poly,
  "data/brfss_polytomous_2020.rds")

brfss_poly <- readRDS("data/brfss_polytomous_2020.rds")

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Outcome Distribution
General Health N %
Excellent/VG 2380 47.6
Good 1624 32.5
Fair/Poor 996 19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly) +
  geom_bar(aes(x=genhlth_ord, y= smoker, fill = smoker), stat="identity") +
  theme_minimal() +
  labs(x = "General Health", y = "Smoker Status", title = "General Health by Smoker Status")

brfss_poly |>
group_by(genhlth_ord) |> mutate(percent = mean(smoker == "Current")) |>
mutate(percent = round(percent, digits = 2)) |>
ggplot(aes(x = genhlth_ord, y = percent, fill = percent)) +
  geom_bar(position="dodge", stat="identity") +
  geom_text(stat = "identity", aes(label = after_stat(y)), vjust = -0.3) +
  labs(
    title = "Distribution of General Health by Smoker Status",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x = "General Health",
    y = "Percentage as Current Smoker"
  ) +
  theme_minimal() +
  scale_y_continuous(labels=scales::percent) +
  theme(legend.position = "none")

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(sex, bmi, exercise, smoker, income_cat),
    type = list(
      c(bmi, income_cat) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      sex           ~ "Sex",
      bmi           ~ "BMI",
      exercise      ~ "Exercise in past 30 days",
      income_cat    ~ "Income category (1-8)",
      smoker        ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
Excellent/VG
N = 2,380
1
Good
N = 1,624
1
Fair/Poor
N = 996
1
p-value2
Sex



0.005
    Male 2,717 (54%) 1,315 (55%) 906 (56%) 496 (50%)
    Female 2,283 (46%) 1,065 (45%) 718 (44%) 500 (50%)
BMI 28 (6) 27 (5) 29 (6) 30 (8) <0.001
Exercise in past 30 days 3,630 (73%) 1,999 (84%) 1,161 (71%) 470 (47%) <0.001
Smoking status



<0.001
    Former/Never 3,304 (66%) 1,692 (71%) 1,015 (63%) 597 (60%)
    Current 1,696 (34%) 688 (29%) 609 (38%) 399 (40%)
Income category (1-8) 5.78 (2.14) 6.43 (1.84) 5.64 (2.09) 4.47 (2.23) <0.001
1 n (%); Mean (SD)
2 Pearson’s Chi-squared test; Kruskal-Wallis rank sum test

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

mod_multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  trace = FALSE
)

summary(mod_multi)
## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise + 
##     income_cat + smoker, data = brfss_poly, trace = FALSE)
## 
## Coefficients:
##           (Intercept)        age  sexFemale        bmi exerciseYes income_cat
## Good        -1.317306 0.01497067 -0.1417121 0.05087197   -0.500636 -0.1693992
## Fair/Poor   -1.283880 0.02579060 -0.1257209 0.06741312   -1.338926 -0.3932631
##           smokerCurrent
## Good          0.4117899
## Fair/Poor     0.3436415
## 
## Std. Errors:
##           (Intercept)         age  sexFemale         bmi exerciseYes income_cat
## Good        0.2689762 0.002189706 0.06782701 0.005768280  0.08178912 0.01748213
## Fair/Poor   0.3296881 0.002916356 0.08583048 0.006778269  0.09155273 0.02090747
##           smokerCurrent
## Good         0.07723392
## Fair/Poor    0.09720041
## 
## Residual Deviance: 9287.557 
## AIC: 9315.557

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome Predictor RRR Lower Upper p
Good (Intercept) 0.268 0.158 0.454 9.71e-07
Good age 1.015 1.011 1.019 8.10e-12
Good sexFemale 0.868 0.760 0.991 0.036679
Good bmi 1.052 1.040 1.064 < 2e-16
Good exerciseYes 0.606 0.516 0.712 9.30e-10
Good income_cat 0.844 0.816 0.874 < 2e-16
Good smokerCurrent 1.510 1.297 1.756 9.73e-08
Fair/Poor (Intercept) 0.277 0.145 0.529 9.85e-05
Fair/Poor age 1.026 1.020 1.032 < 2e-16
Fair/Poor sexFemale 0.882 0.745 1.043 0.142987
Fair/Poor bmi 1.070 1.056 1.084 < 2e-16
Fair/Poor exerciseYes 0.262 0.219 0.314 < 2e-16
Fair/Poor income_cat 0.675 0.648 0.703 < 2e-16
Fair/Poor smokerCurrent 1.410 1.165 1.706 0.000407

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison. Current smokers have an odds ratio of 1.41 for having poor health as compared to excellent or very good. The relative risk of being in the “Fair/Poor” category, rather than “Excellent/VG”, is about 41% higher among current smokers.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  Hess = TRUE
)

summary(mod_ord)
## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat + 
##     smoker, data = brfss_poly, Hess = TRUE)
## 
## Coefficients:
##                  Value Std. Error t value
## age            0.01797   0.001867   9.627
## sexFemale     -0.12885   0.056835  -2.267
## bmi            0.04973   0.004534  10.967
## exerciseYes   -0.92167   0.064426 -14.306
## income_cat    -0.26967   0.014113 -19.107
## smokerCurrent  0.29313   0.064304   4.558
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   0.0985   0.2200     0.4478
## Good|Fair/Poor      1.8728   0.2212     8.4650
## 
## Residual Deviance: 9318.274 
## AIC: 9334.274

3b. (5 pts) Report the cumulative ORs with 95% CIs.

tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Ordinal Logistic Regression: Cumulative ORs
Predictor OR Lower Upper
age 1.018 1.014 1.022
sexFemale 0.879 0.786 0.983
bmi 1.051 1.042 1.060
exerciseYes 0.398 0.351 0.451
income_cat 0.764 0.743 0.785
smokerCurrent 1.341 1.182 1.521

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property. Smoking has OR = 1.34, therefore current smokers have 1.34 times the odds of reporting Good or worse health (vs. Excellent/VG), AND 1.6 times the odds of reporting Fair/Poor (vs. Excellent/VG or Good).

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

ggpredict(mod_ord, terms = "income_cat [1:8]") |>
  plot() +
  labs(title = "Ordinal Model: Predicted Probability of Each Health Category",
       x = "Income Category", y = "Predicted Probability") +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?

brant_out <- brant(mod_ord)
## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      35.87   6   0
## age      0.06    1   0.81
## sexFemale    0.89    1   0.34
## bmi      7.1 1   0.01
## exerciseYes  10.58   1   0
## income_cat   10.54   1   0
## smokerCurrent    7.76    1   0.01
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
as.data.frame(brant_out) |>
  tibble::rownames_to_column("Predictor") |>
  mutate(
    X2          = round(X2, 2),
    probability = ifelse(probability < 0.001, "< 0.001", sprintf("%.3f", probability)),
    Fails       = ifelse(Predictor == "Omnibus",
                         ifelse(as.numeric(gsub("< ", "", probability)) < 0.05 | probability == "< 0.001", "Yes (overall)", "No (overall)"),
                         ifelse(probability == "< 0.001" | as.numeric(gsub("< ", "", probability)) < 0.05, "⚠ Yes", ""))
  ) |>
  kable(
    col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
    caption   = "Brant Test for Proportional Odds Assumption",
    align     = c("l", "r", "r", "r", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE) |>
  row_spec(1, bold = TRUE, background = "#f0f0f0")
Brant Test for Proportional Odds Assumption
Predictor df p-value Assumption Violated?
Omnibus 35.87 6 < 0.001 Yes (overall)
age 0.06 1 0.814
sexFemale 0.89 1 0.345
bmi 7.10 1 0.008 ⚠ Yes
exerciseYes 10.58 1 0.001 ⚠ Yes
income_cat 10.54 1 0.001 ⚠ Yes
smokerCurrent 7.76 1 0.005 ⚠ Yes

The assumptions are violated for the omnibus test. In addtition, bmi, exercise, income, and smoking status also violate the assumptions based on p<0.05.

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  df    = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model Comparison
Model AIC df
Multinomial 9315.6 14
Ordinal (proportional odds) 9334.3 8

The multinomial models has a better fit based on the smaller AIC.

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences. I would recommend reporting the multinomial model. This ordinal model has multiple violations and has a worsened AIC, suggesting that the multinomial model is a better fit.