knitr::opts_chunk$set(echo = TRUE)

Part 5: In-Class Lab Activity

EPI 553 — Polytomous and Ordinal Regression Lab Due: End of class, April 16, 2026

Instructions

Complete the four tasks below using brfss_polytomous_2020.rds. Submit a knitted HTML file via Brightspace.

Data

Variable Description Type
genhlth_ord General health (Excellent/VG < Good < Fair/Poor) Ordered factor (outcome)
genhlth_nom General health, unordered Factor (outcome)
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income (1-8) Numeric
smoker Former/Never vs. Current Factor
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(nnet)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:gtsummary':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(brant)
## Warning: package 'brant' was built under R version 4.4.3
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.4.3
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
## Setting theme "Compact"
## Setting theme "Compact"
brfss_poly <- readRDS( "/Users/sarah/OneDrive/Documents/EPI 553/data/brfss_poly_2020.rds")

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Outcome Distribution
General Health N %
Excellent/VG 2380 47.6
Good 1624 32.5
Fair/Poor 996 19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Figure 1. Distribution of General Health by Smoking Status",
    x = "Smoking Status",
    y = "Percent",
    fill = "General Health"
  ) +
  theme_minimal(base_size = 13)

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(age, sex, bmi, exercise, income_cat),
    label = list(
      age        ~ "Age (years)",
      sex        ~ "Sex",
      bmi        ~ "Body Mass Index",
      exercise   ~ "Exercise (past 30 days)",
      income_cat ~ "Household Income (1–8)"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1
  ) |>
  add_overall() |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Characteristics Stratified by Ordered General Health**") 
Table 1. Descriptive Characteristics Stratified by Ordered General Health
Characteristic Overall
N = 5,000
1
Excellent/VG
N = 2,380
1
Good
N = 1,624
1
Fair/Poor
N = 996
1
Age (years) 57.0 (16.2) 54.7 (16.5) 57.9 (16.3) 60.8 (14.6)
Sex



    Male 2,717 (54%) 1,315 (55%) 906 (56%) 496 (50%)
    Female 2,283 (46%) 1,065 (45%) 718 (44%) 500 (50%)
Body Mass Index 28.5 (6.3) 27.4 (5.2) 29.1 (6.4) 30.0 (8.0)
Exercise (past 30 days) 3,630 (73%) 1,999 (84%) 1,161 (71%) 470 (47%)
Household Income (1–8)



    1 225 (4.5%) 39 (1.6%) 72 (4.4%) 114 (11%)
    2 271 (5.4%) 69 (2.9%) 92 (5.7%) 110 (11%)
    3 403 (8.1%) 124 (5.2%) 128 (7.9%) 151 (15%)
    4 512 (10%) 174 (7.3%) 188 (12%) 150 (15%)
    5 519 (10%) 198 (8.3%) 197 (12%) 124 (12%)
    6 714 (14%) 354 (15%) 249 (15%) 111 (11%)
    7 844 (17%) 443 (19%) 289 (18%) 112 (11%)
    8 1,512 (30%) 979 (41%) 409 (25%) 124 (12%)
1 Mean (SD); n (%)

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

multi_mod_2a <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  trace = FALSE
)

summary(multi_mod_2a)
## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise + 
##     income_cat + smoker, data = brfss_poly, trace = FALSE)
## 
## Coefficients:
##           (Intercept)        age  sexFemale        bmi exerciseYes income_cat
## Good        -1.317306 0.01497067 -0.1417121 0.05087197   -0.500636 -0.1693992
## Fair/Poor   -1.283880 0.02579060 -0.1257209 0.06741312   -1.338926 -0.3932631
##           smokerCurrent
## Good          0.4117899
## Fair/Poor     0.3436415
## 
## Std. Errors:
##           (Intercept)         age  sexFemale         bmi exerciseYes income_cat
## Good        0.2689762 0.002189706 0.06782701 0.005768280  0.08178912 0.01748213
## Fair/Poor   0.3296881 0.002916356 0.08583048 0.006778269  0.09155273 0.02090747
##           smokerCurrent
## Good         0.07723392
## Fair/Poor    0.09720041
## 
## Residual Deviance: 9287.557 
## AIC: 9315.557

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

tidy(multi_mod_2a, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome Predictor RRR Lower Upper p
Good (Intercept) 0.268 0.158 0.454 9.71e-07
Good age 1.015 1.011 1.019 8.10e-12
Good sexFemale 0.868 0.760 0.991 0.036679
Good bmi 1.052 1.040 1.064 < 2e-16
Good exerciseYes 0.606 0.516 0.712 9.30e-10
Good income_cat 0.844 0.816 0.874 < 2e-16
Good smokerCurrent 1.510 1.297 1.756 9.73e-08
Fair/Poor (Intercept) 0.277 0.145 0.529 9.85e-05
Fair/Poor age 1.026 1.020 1.032 < 2e-16
Fair/Poor sexFemale 0.882 0.745 1.043 0.142987
Fair/Poor bmi 1.070 1.056 1.084 < 2e-16
Fair/Poor exerciseYes 0.262 0.219 0.314 < 2e-16
Fair/Poor income_cat 0.675 0.648 0.703 < 2e-16
Fair/Poor smokerCurrent 1.410 1.165 1.706 0.000407

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.

Individuals who reported exercising in the past 30 days had a 74% lower relative risk of reporting Fair/Poor health compared to Excellent/Very Good health, compared to those who didn’t exercise (RRR = 0.262).

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

mod_3a <- polr(genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data = brfss_poly,
  Hess = TRUE
)
summary(mod_3a)
## Call:
## polr(formula = genhlth_nom ~ age + sex + bmi + exercise + income_cat + 
##     smoker, data = brfss_poly, Hess = TRUE)
## 
## Coefficients:
##                  Value Std. Error t value
## age            0.01797   0.001867   9.627
## sexFemale     -0.12885   0.056835  -2.267
## bmi            0.04973   0.004534  10.967
## exerciseYes   -0.92167   0.064426 -14.306
## income_cat    -0.26967   0.014113 -19.107
## smokerCurrent  0.29313   0.064304   4.558
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   0.0985   0.2200     0.4478
## Good|Fair/Poor      1.8728   0.2212     8.4650
## 
## Residual Deviance: 9318.274 
## AIC: 9334.274

3b. (5 pts) Report the cumulative ORs with 95% CIs.

tidy(mod_3a, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Ordinal Logistic Regression: Cumulative ORs
Predictor OR Lower Upper
age 1.018 1.014 1.022
sexFemale 0.879 0.786 0.983
bmi 1.051 1.042 1.060
exerciseYes 0.398 0.351 0.451
income_cat 0.764 0.743 0.785
smokerCurrent 1.341 1.182 1.521

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.

Individuals who exercised in the past 30 days had lower cumulative odds of reporting worse general health at every cut point of the scale compared to those who didn’t exercise (OR = 0.398).

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

ggpredict(mod_3a, terms = "age") |>
  plot() +
  labs(title = "Predicted Probabiltiy of Each Health Category Across Age", x = "Age", y = "Predicted Probability") +
  theme_minimal()
## Data were 'prettified'. Consider using `terms="age [all]"` to get smooth
##   plots.

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold?

brant_4a <- brant(mod_3a)
## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      35.87   6   0
## age      0.06    1   0.81
## sexFemale    0.89    1   0.34
## bmi      7.1 1   0.01
## exerciseYes  10.58   1   0
## income_cat   10.54   1   0
## smokerCurrent    7.76    1   0.01
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
as.data.frame(brant_4a) |>
  tibble::rownames_to_column("Predictor") |>
  mutate(
    X2          = round(X2, 2),
    probability = ifelse(probability < 0.001, "< 0.001", sprintf("%.3f", probability)),
    Fails       = ifelse(Predictor == "Omnibus",
                         ifelse(as.numeric(gsub("< ", "", probability)) < 0.05 | probability == "< 0.001", "Yes (overall)", "No (overall)"),
                         ifelse(probability == "< 0.001" | as.numeric(gsub("< ", "", probability)) < 0.05, "⚠ Yes", ""))
  ) |>
  kable(
    col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
    caption   = "Brant Test for Proportional Odds Assumption",
    align     = c("l", "r", "r", "r", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE) |>
  row_spec(1, bold = TRUE, background = "#f0f0f0")
Brant Test for Proportional Odds Assumption
Predictor df p-value Assumption Violated?
Omnibus 35.87 6 < 0.001 Yes (overall)
age 0.06 1 0.814
sexFemale 0.89 1 0.345
bmi 7.10 1 0.008 ⚠ Yes
exerciseYes 10.58 1 0.001 ⚠ Yes
income_cat 10.54 1 0.001 ⚠ Yes
smokerCurrent 7.76 1 0.005 ⚠ Yes

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

The multinomial logistic regression model had a lower AIC (9315.557) compared with the ordinal logistic regression model (9334.274), indicating that the multinomial model provides a better fit to the data.

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.

The Brant test showed a significant violation of the proportional odds assumption overall (p < 0.001), with several predictors—BMI, exercise, income, and smoking—failing the assumption individually. Because the ordinal logistic regression model relies on this assumption, it is not appropriate for these data. Although the multinomial model is less parsimonious, it fits the data better (lower AIC) and does not require the proportional odds assumption, so it is the preferred model to report.

Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.

End of Lab Activity