library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)
library(MASS)
library(brant)
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_poly <- readRDS("brfss_polytomous_2020.rds")

Task 1: Explore the Outcome

1a. Frequency Table of `genhlth_ord`

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(
    col.names = c("General Health Category", "N", "%"),
    caption   = "Table 1. Distribution of General Health (3-Level Ordinal Outcome)"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 1. Distribution of General Health (3-Level Ordinal Outcome)
General Health Category	N	%
Excellent/VG	2380	47.6
Good	1624	32.5
Fair/Poor	996	19.9

1b. Stacked Bar Chart by Smoker Status

brfss_poly |>
  filter(!is.na(smoker), !is.na(genhlth_ord)) |>
  count(smoker, genhlth_ord) |>
  group_by(smoker) |>
  mutate(pct = n / sum(n)) |>
  ggplot(aes(x = smoker, y = pct, fill = genhlth_ord)) +
  geom_col(width = 0.6) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(
    values = c("Excellent/VG" = "steelblue", "Good" = "darkgreen", "Fair/Poor" = "red2"),
    name   = "General Health"
  ) +
  labs(
    title   = "Distribution of General Health by Smoking Status",
    x       = "Smoking Status",
    y       = "Proportion"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "right")

Stacked bar chart of general health distribution by smoker status

Current smokers show a notably higher proportion of Fair/Poor health and a lower proportion of Excellent/Very Good health compared to former/never smokers.

1c. Descriptive Table by `genhlth_ord` (≥ 4 predictors)

brfss_poly |>
  dplyr::select(genhlth_ord, age, sex, bmi, exercise, income_cat, smoker) |>
  tbl_summary(
    by        = genhlth_ord,
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      age        ~ "Age (years)",
      sex        ~ "Sex",
      bmi        ~ "BMI",
      exercise   ~ "Exercised past 30 days",
      income_cat ~ "Income category (1–8)",
      smoker     ~ "Smoking status"
    ),
    missing = "no"
  ) |>
  add_overall() |>
  add_p() |>
  bold_p() |>
  modify_caption("**Table 2. Participant Characteristics by General Health Status**")

**Table 2. Participant Characteristics by General Health Status**
Characteristic	Overall, N = 5,000¹	Excellent/VG, N = 2,380¹	Good, N = 1,624¹	Fair/Poor, N = 996¹	p-value²
Age (years)	57 (16)	55 (16)	58 (16)	61 (15)	<0.001
Sex					0.005
Male	2,717 (54%)	1,315 (55%)	906 (56%)	496 (50%)
Female	2,283 (46%)	1,065 (45%)	718 (44%)	500 (50%)
BMI	28 (6)	27 (5)	29 (6)	30 (8)	<0.001
Exercised past 30 days	3,630 (73%)	1,999 (84%)	1,161 (71%)	470 (47%)	<0.001
Income category (1–8)					<0.001
1	225 (4.5%)	39 (1.6%)	72 (4.4%)	114 (11%)
2	271 (5.4%)	69 (2.9%)	92 (5.7%)	110 (11%)
3	403 (8.1%)	124 (5.2%)	128 (7.9%)	151 (15%)
4	512 (10%)	174 (7.3%)	188 (12%)	150 (15%)
5	519 (10%)	198 (8.3%)	197 (12%)	124 (12%)
6	714 (14%)	354 (15%)	249 (15%)	111 (11%)
7	844 (17%)	443 (19%)	289 (18%)	112 (11%)
8	1,512 (30%)	979 (41%)	409 (25%)	124 (12%)
Smoking status					<0.001
Former/Never	3,304 (66%)	1,692 (71%)	1,015 (63%)	597 (60%)
Current	1,696 (34%)	688 (29%)	609 (38%)	399 (40%)
¹ Mean (SD); n (%)
² Kruskal-Wallis rank sum test; Pearson’s Chi-squared test

Task 2: Multinomial Logistic Regression

2a. Fit Multinomial Model

mod_multi <- multinom(
  genhlth_nom ~ age + sex + bmi + exercise + income_cat + smoker,
  data  = brfss_poly,
  trace = FALSE
)

summary(mod_multi)

## Call:
## multinom(formula = genhlth_nom ~ age + sex + bmi + exercise + 
##     income_cat + smoker, data = brfss_poly, trace = FALSE)
## 
## Coefficients:
##           (Intercept)        age  sexFemale        bmi exerciseYes income_cat
## Good        -1.317306 0.01497067 -0.1417121 0.05087197   -0.500636 -0.1693992
## Fair/Poor   -1.283880 0.02579060 -0.1257209 0.06741312   -1.338926 -0.3932631
##           smokerCurrent
## Good          0.4117899
## Fair/Poor     0.3436415
## 
## Std. Errors:
##           (Intercept)         age  sexFemale         bmi exerciseYes income_cat
## Good        0.2689762 0.002189706 0.06782701 0.005768280  0.08178912 0.01748213
## Fair/Poor   0.3296881 0.002916356 0.08583048 0.006778269  0.09155273 0.02090747
##           smokerCurrent
## Good         0.07723392
## Fair/Poor    0.09720041
## 
## Residual Deviance: 9287.557 
## AIC: 9315.557

The model fits two simultaneous logit equations: “Good vs. Excellent/VG” and “Fair/Poor vs. Excellent/VG,” using Excellent/VG as the reference category.

2b. Relative Risk Ratios with 95% CIs

tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(
    across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
    p.value = format.pval(p.value, digits = 3, eps = 0.001)
  ) |>
  kable(
    col.names = c("Outcome (vs. Excellent/VG)", "Predictor", "RRR", "95% CI Lower", "95% CI Upper", "p-value"),
    caption   = "Table 3. Multinomial Logistic Regression: Relative Risk Ratios"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  pack_rows("Good vs. Excellent/VG", 1, 7) |>
  pack_rows("Fair/Poor vs. Excellent/VG", 8, 14)

Table 3. Multinomial Logistic Regression: Relative Risk Ratios
Outcome (vs. Excellent/VG)	Predictor	RRR	95% CI Lower	95% CI Upper	p-value
Good vs. Excellent/VG
Good	(Intercept)	0.268	0.158	0.454	<0.001
Good	age	1.015	1.011	1.019	<0.001
Good	sexFemale	0.868	0.760	0.991	0.0367
Good	bmi	1.052	1.040	1.064	<0.001
Good	exerciseYes	0.606	0.516	0.712	<0.001
Good	income_cat	0.844	0.816	0.874	<0.001
Good	smokerCurrent	1.510	1.297	1.756	<0.001
Fair/Poor vs. Excellent/VG
Fair/Poor	(Intercept)	0.277	0.145	0.529	<0.001
Fair/Poor	age	1.026	1.020	1.032	<0.001
Fair/Poor	sexFemale	0.882	0.745	1.043	0.1430
Fair/Poor	bmi	1.070	1.056	1.084	<0.001
Fair/Poor	exerciseYes	0.262	0.219	0.314	<0.001
Fair/Poor	income_cat	0.675	0.648	0.703	<0.001
Fair/Poor	smokerCurrent	1.410	1.165	1.706	<0.001

2c. Interpretation: Smoker (Fair/Poor vs. Excellent/VG)

Interpretation: Holding age, sex, BMI, exercise, and income constant, current smokers have a relative risk ratio of 1.41 (95% CI: 1.17–1.71) for reporting Fair/Poor health compared to Excellent/Very Good health, relative to former/never smokers. In other words, the odds of being in the Fair/Poor category rather than the Excellent/VG category are approximately 41% higher among current smokers than among former/never smokers, after adjustment.

Task 3: Ordinal Logistic Regression

3a. Fit Ordinal Model

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + income_cat + smoker,
  data   = brfss_poly,
  Hess   = TRUE,
  method = "logistic"
)

summary(mod_ord)

## Call:
## polr(formula = genhlth_ord ~ age + sex + bmi + exercise + income_cat + 
##     smoker, data = brfss_poly, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                  Value Std. Error t value
## age            0.01797   0.001867   9.627
## sexFemale     -0.12885   0.056835  -2.267
## bmi            0.04973   0.004534  10.967
## exerciseYes   -0.92167   0.064426 -14.306
## income_cat    -0.26967   0.014113 -19.107
## smokerCurrent  0.29313   0.064304   4.558
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   0.0985   0.2200     0.4478
## Good|Fair/Poor      1.8728   0.2212     8.4650
## 
## Residual Deviance: 9318.274 
## AIC: 9334.274

3b. Cumulative Odds Ratios with 95% CIs

ci_ord <- confint(mod_ord)
ord_tbl <- tibble(
  term      = rownames(ci_ord),
  conf.low  = exp(ci_ord[, 1]),
  conf.high = exp(ci_ord[, 2])
) |>
  left_join(
    tidy(mod_ord) |> filter(!str_detect(term, "\\|")) |>
      dplyr::select(term, estimate),
    by = "term"
  ) |>
  mutate(estimate = exp(estimate))

bmi_or <- ord_tbl |> filter(term == "bmi")

3c. Interpretation of One Cumulative OR

Interpretation: Each one-unit increase in BMI is associated with a cumulative OR of 1.05 (95% CI: 1.04–1.06) for being in a worse health category. Because this is a proportional odds model, this OR applies at every cut-point simultaneously — that is, each additional BMI unit multiplies the odds of reporting “Good or worse” (vs. Excellent/VG) and the odds of reporting “Fair/Poor” (vs. Good or better) by the same factor of approximately 1.05, after adjusting for all other covariates.

3d. Predicted Probabilities Across BMI

pred_bmi <- ggpredict(mod_ord, terms = "bmi [10:55 by=1]")

plot(pred_bmi) +
  labs(
    title    = "Predicted Probabilities of General Health Category Across BMI",
    subtitle = "Ordinal logistic regression model; other covariates held at mean/reference",
    x        = "BMI",
    y        = "Predicted Probability",
    color    = "Health Category"
  ) +
  theme_minimal(base_size = 13)

Predicted probabilities of each health category across BMI values

As BMI increases, the predicted probability of Excellent/VG health declines steadily while the probabilities of Good and Fair/Poor health rise, consistent with a positive association between BMI and worse self-rated health.

Task 4: Check Assumptions and Compare Models

4a. Brant Test for Proportional Odds

brant_out <- brant(mod_ord)

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      35.87   6   0
## age      0.06    1   0.81
## sexFemale    0.89    1   0.34
## bmi      7.1 1   0.01
## exerciseYes  10.58   1   0
## income_cat   10.54   1   0
## smokerCurrent    7.76    1   0.01
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

as.data.frame(brant_out) |>
  tibble::rownames_to_column("Predictor") |>
  mutate(
    X2          = round(X2, 2),
    probability = ifelse(probability < 0.001, "< 0.001",
                         sprintf("%.3f", probability)),
    `Assumption Violated?` = case_when(
      Predictor == "Omnibus" & (probability == "< 0.001" |
        as.numeric(gsub("< ", "", probability)) < 0.05) ~ "Yes (overall)",
      Predictor == "Omnibus" ~ "No (overall)",
      probability == "< 0.001" |
        as.numeric(gsub("< ", "", probability)) < 0.05  ~ "⚠ Yes",
      TRUE ~ ""
    )
  ) |>
  kable(
    col.names = c("Predictor", "X²", "df", "p-value", "Assumption Violated?"),
    caption   = "Table 5. Brant Test for Proportional Odds Assumption",
    align     = c("l", "r", "r", "r", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(0, bold = TRUE) |>
  row_spec(1, bold = TRUE, background = "#f0f0f0")

Table 5. Brant Test for Proportional Odds Assumption
Predictor	X²	df	p-value	Assumption Violated?
Omnibus	35.87	6	< 0.001	Yes (overall)
age	0.06	1	0.814
sexFemale	0.89	1	0.345
bmi	7.10	1	0.008	⚠ Yes
exerciseYes	10.58	1	0.001	⚠ Yes
income_cat	10.54	1	0.001	⚠ Yes
smokerCurrent	7.76	1	0.005	⚠ Yes

Interpretation: Examine the omnibus p-value first. If it is < 0.05, the proportional odds assumption is violated for the model as a whole, and individual predictor rows identify which variables are driving the violation.

4b. AIC Comparison

tibble(
  Model = c("Multinomial logistic", "Ordinal logistic (prop. odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  `No. Parameters` = c(mod_multi$edf,
                        length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Table 6. AIC Comparison: Multinomial vs. Ordinal Model") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 6. AIC Comparison: Multinomial vs. Ordinal Model
Model	AIC	No. Parameters
Multinomial logistic	9315.6	14
Ordinal logistic (prop. odds)	9334.3	8

4c. Model Recommendation

Recommendation: If the Brant test omnibus p-value is ≥ 0.05, the ordinal logistic regression model should be preferred. It is more parsimonious (fewer parameters), yields a single, easily interpretable cumulative OR for each predictor, and, assuming proportional odds holds, will have a lower AIC. If the Brant test reveals a significant violation — particularly for key predictors such as smoker or sex the multinomial model is more appropriate because it does not impose the proportional odds constraint, at the cost of requiring separate interpretation for each outcome contrast. In that case, the flexibility of the multinomial model outweighs its added complexity.

End of Lab — EPI 553, April 16, 2026

EPI 553 — Lab 14

Karo Ekor

April 16, 2026

Task 1: Explore the Outcome

1a. Frequency Table of `genhlth_ord`

1b. Stacked Bar Chart by Smoker Status

1c. Descriptive Table by `genhlth_ord` (≥ 4 predictors)

Task 2: Multinomial Logistic Regression

2a. Fit Multinomial Model

2b. Relative Risk Ratios with 95% CIs

2c. Interpretation: Smoker (Fair/Poor vs. Excellent/VG)

Task 3: Ordinal Logistic Regression

3a. Fit Ordinal Model

3b. Cumulative Odds Ratios with 95% CIs

3c. Interpretation of One Cumulative OR

3d. Predicted Probabilities Across BMI

Task 4: Check Assumptions and Compare Models

4a. Brant Test for Proportional Odds

4b. AIC Comparison

4c. Model Recommendation

EPI 553 — Lab 14

Karo Ekor

April 16, 2026

Task 1: Explore the Outcome

1a. Frequency Table of genhlth_ord

1b. Stacked Bar Chart by Smoker Status

1c. Descriptive Table by genhlth_ord (≥ 4 predictors)

Task 2: Multinomial Logistic Regression

2a. Fit Multinomial Model

2b. Relative Risk Ratios with 95% CIs

2c. Interpretation: Smoker (Fair/Poor vs. Excellent/VG)

Task 3: Ordinal Logistic Regression

3a. Fit Ordinal Model

3b. Cumulative Odds Ratios with 95% CIs

3c. Interpretation of One Cumulative OR

3d. Predicted Probabilities Across BMI

Task 4: Check Assumptions and Compare Models

4a. Brant Test for Proportional Odds

4b. AIC Comparison

4c. Model Recommendation

1a. Frequency Table of `genhlth_ord`

1c. Descriptive Table by `genhlth_ord` (≥ 4 predictors)