knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 10,
  fig.height = 6
)

Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)        # multinom() for polytomous regression
library(MASS)        # polr() for ordinal regression
library(brant)       # Brant test for proportional odds
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
  clean_names()

Building the Outcome: General Health

The BRFSS variable genhlth asks: “Would you say that in general your health is…”

1 = Excellent, 2 = Very Good, 3 = Good, 4 = Fair, 5 = Poor

We collapse this into a 3-level ordinal outcome to keep models tractable for class:

brfss_poly <- brfss_full |>
  mutate(
    # 3-level ordinal outcome
    genhlth_3 = case_when(
      genhlth %in% c(1, 2) ~ "Excellent/VG",
      genhlth == 3         ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor",
      TRUE                 ~ NA_character_
    ),
    # Ordered factor for ordinal regression
    genhlth_ord = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor"),
      ordered = TRUE),
    # Unordered factor for polytomous regression
    genhlth_nom = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor")),
    # Predictors
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
    !is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_poly <- brfss_poly |>
  dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
                exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

saveRDS(brfss_poly,
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_polytomous_2020.rds")

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Outcome Distribution
General Health	N	%
Excellent/VG	2380	47.6
Good	1624	32.5
Fair/Poor	996	19.9

In-Class Lab Activity

Data

Variable	Description	Type
`genhlth_ord`	General health (Excellent/VG < Good < Fair/Poor)	Ordered factor (outcome)
`genhlth_nom`	General health, unordered	Factor (outcome)
`age`	Age in years	Numeric
`sex`	Male / Female	Factor
`bmi`	Body mass index	Numeric
`exercise`	Exercised in past 30 days (No/Yes)	Factor
`income_cat`	Household income (1-8)	Numeric
`smoker`	Former/Never vs. Current	Factor

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(
    percent = n / sum(n) * 100
  )

## # A tibble: 3 × 3
##   genhlth_ord      n percent
##   <ord>        <int>   <dbl>
## 1 Excellent/VG  2380    47.6
## 2 Good          1624    32.5
## 3 Fair/Poor      996    19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
  geom_bar() +
  labs(
    title = "Distribution of General Health by Smoking Status",
    x = "Smoking Status",
    y = "Count",
    fill = "General Health"
  ) +
  theme_minimal()

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(age, sex, bmi, exercise),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      age      ~ "Age (years)",
      sex      ~ "Sex",
      bmi      ~ "BMI",
      exercise ~ "Exercise in past 30 days"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()

Characteristic	Overall N = 5,000¹	Excellent/VG N = 2,380¹	Good N = 1,624¹	Fair/Poor N = 996¹
Age (years)	57 (16)	55 (16)	58 (16)	61 (15)
Sex
Male	2,717 (54%)	1,315 (55%)	906 (56%)	496 (50%)
Female	2,283 (46%)	1,065 (45%)	718 (44%)	500 (50%)
BMI	28 (6)	27 (5)	29 (6)	30 (8)
Exercise in past 30 days	3,630 (73%)	1,999 (84%)	1,161 (71%)	470 (47%)
¹ Mean (SD); n (%)

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

mod_multi <- multinom(genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly)

## # weights:  21 (12 variable)
## initial  value 5493.061443 
## iter  10 value 5056.270643
## final  value 4834.786418 
## converged

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

mod_multi <- multinom(genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly)

## # weights:  21 (12 variable)
## initial  value 5493.061443 
## iter  10 value 5056.270643
## final  value 4834.786418 
## converged

tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome	Predictor	RRR	Lower	Upper	p
Good	(Intercept)	0.090	0.056	0.144	< 2e-16
Good	age	1.016	1.012	1.021	4.91e-14
Good	sexFemale	0.920	0.807	1.048	0.209
Good	bmi	1.051	1.040	1.063	< 2e-16
Good	exerciseYes	0.553	0.472	0.648	2.17e-13
Good	smokerCurrent	1.798	1.554	2.081	3.49e-15
Fair/Poor	(Intercept)	0.025	0.014	0.045	< 2e-16
Fair/Poor	age	1.029	1.023	1.035	< 2e-16
Fair/Poor	sexFemale	1.045	0.890	1.226	0.592
Fair/Poor	bmi	1.069	1.056	1.083	< 2e-16
Fair/Poor	exerciseYes	0.215	0.181	0.256	< 2e-16
Fair/Poor	smokerCurrent	2.131	1.782	2.548	< 2e-16

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.

The relative risk ratio for current smoking in fair/poor vs. excellent/very good comparison is 2.13(95% CI: 1.78,2.55). This means that, holding all other variables constant, current smokers have 2.13 times the relative risk of reporting fair/poor health rather than excellent/very good health compared to former/never smokers.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

3b. (5 pts) Report the cumulative ORs with 95% CIs.

mod_ord <- polr(
  genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)
summary(mod_ord)

## Call:
## polr(formula = genhlth_nom ~ age + sex + bmi + exercise + smoker, 
##     data = brfss_poly, Hess = TRUE)
## 
## Coefficients:
##                   Value Std. Error  t value
## age            0.020324   0.001841  11.0393
## sexFemale     -0.008869   0.055394  -0.1601
## bmi            0.050074   0.004452  11.2467
## exerciseYes   -1.086855   0.063205 -17.1958
## smokerCurrent  0.579879   0.061570   9.4183
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   1.8581   0.1991     9.3341
## Good|Fair/Poor      3.5276   0.2035    17.3353
## 
## Residual Deviance: 9695.134 
## AIC: 9709.134

tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Ordinal Logistic Regression: Cumulative ORs
Predictor	OR	Lower	Upper
age	1.021	1.017	1.024
sexFemale	0.991	0.889	1.105
bmi	1.051	1.042	1.061
exerciseYes	0.337	0.298	0.382
smokerCurrent	1.786	1.583	2.015

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.

The odds ratio for exercise is 0.34(95% CI: 0.30,0.38). This means that individuals who exercised have 66% lower odds of being in a worse general health category compared to those who did not exercise, and this is true at every cut-point.

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)

ggpredict(mod_ord, terms = "age") |>
  plot() +
  labs(
    title = "Ordinal Model: Predicted Probability of Each Health Category by Age",
    x = "Age",
    y = "Predicted Probability"
  ) +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold? The assumption does not hold.

brant(mod_ord)

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      25.28   5   0
## age      0.14    1   0.71
## sexFemale    2.51    1   0.11
## bmi      6.26    1   0.01
## exerciseYes  12.6    1   0
## smokerCurrent    3.95    1   0.05
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

The multinomial fits better.

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  df    = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model Comparison
Model	AIC	df
Multinomial	9693.6	12
Ordinal (proportional odds)	9709.1	7

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.

The the multinomial model is preferred because it has a lower AIC, which shows an overall better fit. The ordinal model is simpler but the mutlinomial model does a better job at capturing differences between health categories.

Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.

End of Lab Activity

Polytomous and Ordinal Logistic Regression

EPI 553 — Principles of Statistical Inference II

Grace Beal

April 16, 2026