knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 10,
  fig.height = 6
)

Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet)        # multinom() for polytomous regression
library(MASS)        # polr() for ordinal regression
library(brant)       # Brant test for proportional odds
library(ggeffects)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_full <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
  clean_names()

Building the Outcome: General Health

The BRFSS variable genhlth asks: “Would you say that in general your health is…”

1 = Excellent, 2 = Very Good, 3 = Good, 4 = Fair, 5 = Poor

We collapse this into a 3-level ordinal outcome to keep models tractable for class:

brfss_poly <- brfss_full |>
  mutate(
    # 3-level ordinal outcome
    genhlth_3 = case_when(
      genhlth %in% c(1, 2) ~ "Excellent/VG",
      genhlth == 3         ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor",
      TRUE                 ~ NA_character_
    ),
    # Ordered factor for ordinal regression
    genhlth_ord = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor"),
      ordered = TRUE),
    # Unordered factor for polytomous regression
    genhlth_nom = factor(genhlth_3,
      levels = c("Excellent/VG", "Good", "Fair/Poor")),
    # Predictors
    age = age80,
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    smoker = factor(case_when(
      smokday2 %in% c(1, 2) ~ "Current",
      smokday2 == 3          ~ "Former/Never",
      TRUE                   ~ NA_character_
    ), levels = c("Former/Never", "Current"))
  ) |>
  filter(
    !is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
    !is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
  )

set.seed(1220)
brfss_poly <- brfss_poly |>
  dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
                exercise, income_cat, smoker) |>
  slice_sample(n = 5000)

saveRDS(brfss_poly,
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_polytomous_2020.rds")

brfss_poly |>
  count(genhlth_ord) |>
  mutate(pct = round(100 * n / sum(n), 1)) |>
  kable(col.names = c("General Health", "N", "%"),
        caption = "Outcome Distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Outcome Distribution
General Health N %
Excellent/VG 2380 47.6
Good 1624 32.5
Fair/Poor 996 19.9

In-Class Lab Activity

Data

Variable Description Type
genhlth_ord General health (Excellent/VG < Good < Fair/Poor) Ordered factor (outcome)
genhlth_nom General health, unordered Factor (outcome)
age Age in years Numeric
sex Male / Female Factor
bmi Body mass index Numeric
exercise Exercised in past 30 days (No/Yes) Factor
income_cat Household income (1-8) Numeric
smoker Former/Never vs. Current Factor

Task 1: Explore the Outcome (15 points)

1a. (5 pts) Create a frequency table of genhlth_ord showing N and percentage for each category.

brfss_poly |>
  count(genhlth_ord) |>
  mutate(
    percent = n / sum(n) * 100
  )
## # A tibble: 3 × 3
##   genhlth_ord      n percent
##   <ord>        <int>   <dbl>
## 1 Excellent/VG  2380    47.6
## 2 Good          1624    32.5
## 3 Fair/Poor      996    19.9

1b. (5 pts) Create a stacked bar chart showing the distribution of general health by smoker status.

ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
  geom_bar() +
  labs(
    title = "Distribution of General Health by Smoking Status",
    x = "Smoking Status",
    y = "Count",
    fill = "General Health"
  ) +
  theme_minimal()

1c. (5 pts) Use tbl_summary() to create a descriptive table stratified by genhlth_ord with at least 4 predictors.

brfss_poly |>
  tbl_summary(
    by = genhlth_ord,
    include = c(age, sex, bmi, exercise),
    type = list(
      c(age, bmi) ~ "continuous"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})"
    ),
    label = list(
      age      ~ "Age (years)",
      sex      ~ "Sex",
      bmi      ~ "BMI",
      exercise ~ "Exercise in past 30 days"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels()
Characteristic Overall
N = 5,000
1
Excellent/VG
N = 2,380
1
Good
N = 1,624
1
Fair/Poor
N = 996
1
p-value
Age (years) 57 (16) 55 (16) 58 (16) 61 (15)
Sex




    Male 2,717 (54%) 1,315 (55%) 906 (56%) 496 (50%)
    Female 2,283 (46%) 1,065 (45%) 718 (44%) 500 (50%)
BMI 28 (6) 27 (5) 29 (6) 30 (8)
Exercise in past 30 days 3,630 (73%) 1,999 (84%) 1,161 (71%) 470 (47%)
1 Mean (SD); n (%)

Task 2: Multinomial Logistic Regression (20 points)

2a. (5 pts) Fit a multinomial logistic regression model predicting genhlth_nom from at least 4 predictors using multinom().

mod_multi <- multinom(genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly)
## # weights:  21 (12 variable)
## initial  value 5493.061443 
## iter  10 value 5056.270643
## final  value 4834.786418 
## converged

2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.

mod_multi <- multinom(genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly)
## # weights:  21 (12 variable)
## initial  value 5493.061443 
## iter  10 value 5056.270643
## final  value 4834.786418 
## converged
tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
         p.value = format.pval(p.value, digits = 3)) |>
  kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
        caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Multinomial Logistic Regression: RRRs vs. Excellent/VG
Outcome Predictor RRR Lower Upper p
Good (Intercept) 0.090 0.056 0.144 < 2e-16
Good age 1.016 1.012 1.021 4.91e-14
Good sexFemale 0.920 0.807 1.048 0.209
Good bmi 1.051 1.040 1.063 < 2e-16
Good exerciseYes 0.553 0.472 0.648 2.17e-13
Good smokerCurrent 1.798 1.554 2.081 3.49e-15
Fair/Poor (Intercept) 0.025 0.014 0.045 < 2e-16
Fair/Poor age 1.029 1.023 1.035 < 2e-16
Fair/Poor sexFemale 1.045 0.890 1.226 0.592
Fair/Poor bmi 1.069 1.056 1.083 < 2e-16
Fair/Poor exerciseYes 0.215 0.181 0.256 < 2e-16
Fair/Poor smokerCurrent 2.131 1.782 2.548 < 2e-16

2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.

The relative risk ratio for current smoking in fair/poor vs. excellent/very good comparison is 2.13(95% CI: 1.78,2.55). This means that, holding all other variables constant, current smokers have 2.13 times the relative risk of reporting fair/poor health rather than excellent/very good health compared to former/never smokers.

Task 3: Ordinal Logistic Regression (25 points)

3a. (5 pts) Fit an ordinal logistic regression model with the same predictors using polr().

3b. (5 pts) Report the cumulative ORs with 95% CIs.

mod_ord <- polr(
  genhlth_nom ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)
summary(mod_ord)
## Call:
## polr(formula = genhlth_nom ~ age + sex + bmi + exercise + smoker, 
##     data = brfss_poly, Hess = TRUE)
## 
## Coefficients:
##                   Value Std. Error  t value
## age            0.020324   0.001841  11.0393
## sexFemale     -0.008869   0.055394  -0.1601
## bmi            0.050074   0.004452  11.2467
## exerciseYes   -1.086855   0.063205 -17.1958
## smokerCurrent  0.579879   0.061570   9.4183
## 
## Intercepts:
##                   Value    Std. Error t value 
## Excellent/VG|Good   1.8581   0.1991     9.3341
## Good|Fair/Poor      3.5276   0.2035    17.3353
## 
## Residual Deviance: 9695.134 
## AIC: 9709.134
tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
  filter(coef.type == "coefficient") |>
  dplyr::select(term, estimate, conf.low, conf.high) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
  kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
        caption = "Ordinal Logistic Regression: Cumulative ORs") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Ordinal Logistic Regression: Cumulative ORs
Predictor OR Lower Upper
age 1.021 1.017 1.024
sexFemale 0.991 0.889 1.105
bmi 1.051 1.042 1.061
exerciseYes 0.337 0.298 0.382
smokerCurrent 1.786 1.583 2.015

3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.

The odds ratio for exercise is 0.34(95% CI: 0.30,0.38). This means that individuals who exercised have 66% lower odds of being in a worse general health category compared to those who did not exercise, and this is true at every cut-point.

3d. (10 pts) Use ggpredict() to plot predicted probabilities of each health category across a continuous predictor of your choice.

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)

ggpredict(mod_ord, terms = "age") |>
  plot() +
  labs(
    title = "Ordinal Model: Predicted Probability of Each Health Category by Age",
    x = "Age",
    y = "Predicted Probability"
  ) +
  theme_minimal()

Task 4: Check Assumptions and Compare (15 points)

4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold? The assumption does not hold.

brant(mod_ord)
## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      25.28   5   0
## age      0.14    1   0.71
## sexFemale    2.51    1   0.11
## bmi      6.26    1   0.01
## exerciseYes  12.6    1   0
## smokerCurrent    3.95    1   0.05
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?

The multinomial fits better.

mod_ord <- polr(
  genhlth_ord ~ age + sex + bmi + exercise + smoker,
  data = brfss_poly,
  Hess = TRUE
)

tibble(
  Model = c("Multinomial", "Ordinal (proportional odds)"),
  AIC   = c(AIC(mod_multi), AIC(mod_ord)),
  df    = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
  mutate(AIC = round(AIC, 1)) |>
  kable(caption = "Model Comparison") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model Comparison
Model AIC df
Multinomial 9693.6 12
Ordinal (proportional odds) 9709.1 7

4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.

The the multinomial model is preferred because it has a lower AIC, which shows an overall better fit. The ordinal model is simpler but the mutlinomial model does a better job at capturing differences between health categories.

Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.

End of Lab Activity