knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 10,
fig.height = 6
)library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(nnet) # multinom() for polytomous regression
library(MASS) # polr() for ordinal regression
library(brant) # Brant test for proportional odds
library(ggeffects)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))brfss_full <- read_xpt(
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
clean_names()The BRFSS variable genhlth asks: “Would you say that
in general your health is…”
1 = Excellent, 2 = Very Good, 3 = Good, 4 = Fair, 5 = Poor
We collapse this into a 3-level ordinal outcome to keep models tractable for class:
brfss_poly <- brfss_full |>
mutate(
# 3-level ordinal outcome
genhlth_3 = case_when(
genhlth %in% c(1, 2) ~ "Excellent/VG",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor",
TRUE ~ NA_character_
),
# Ordered factor for ordinal regression
genhlth_ord = factor(genhlth_3,
levels = c("Excellent/VG", "Good", "Fair/Poor"),
ordered = TRUE),
# Unordered factor for polytomous regression
genhlth_nom = factor(genhlth_3,
levels = c("Excellent/VG", "Good", "Fair/Poor")),
# Predictors
age = age80,
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
smoker = factor(case_when(
smokday2 %in% c(1, 2) ~ "Current",
smokday2 == 3 ~ "Former/Never",
TRUE ~ NA_character_
), levels = c("Former/Never", "Current"))
) |>
filter(
!is.na(genhlth_ord), !is.na(age), age >= 18, !is.na(sex),
!is.na(bmi), !is.na(exercise), !is.na(income_cat), !is.na(smoker)
)
set.seed(1220)
brfss_poly <- brfss_poly |>
dplyr::select(genhlth_ord, genhlth_nom, age, sex, bmi,
exercise, income_cat, smoker) |>
slice_sample(n = 5000)
saveRDS(brfss_poly,
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_polytomous_2020.rds")
brfss_poly |>
count(genhlth_ord) |>
mutate(pct = round(100 * n / sum(n), 1)) |>
kable(col.names = c("General Health", "N", "%"),
caption = "Outcome Distribution") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| General Health | N | % |
|---|---|---|
| Excellent/VG | 2380 | 47.6 |
| Good | 1624 | 32.5 |
| Fair/Poor | 996 | 19.9 |
| Variable | Description | Type |
|---|---|---|
genhlth_ord |
General health (Excellent/VG < Good < Fair/Poor) | Ordered factor (outcome) |
genhlth_nom |
General health, unordered | Factor (outcome) |
age |
Age in years | Numeric |
sex |
Male / Female | Factor |
bmi |
Body mass index | Numeric |
exercise |
Exercised in past 30 days (No/Yes) | Factor |
income_cat |
Household income (1-8) | Numeric |
smoker |
Former/Never vs. Current | Factor |
1a. (5 pts) Create a frequency table of
genhlth_ord showing N and percentage for each category.
## # A tibble: 3 × 3
## genhlth_ord n percent
## <ord> <int> <dbl>
## 1 Excellent/VG 2380 47.6
## 2 Good 1624 32.5
## 3 Fair/Poor 996 19.9
1b. (5 pts) Create a stacked bar chart showing the
distribution of general health by smoker status.
ggplot(brfss_poly, aes(x = smoker, fill = genhlth_ord)) +
geom_bar() +
labs(
title = "Distribution of General Health by Smoking Status",
x = "Smoking Status",
y = "Count",
fill = "General Health"
) +
theme_minimal()1c. (5 pts) Use tbl_summary() to create
a descriptive table stratified by genhlth_ord with at least
4 predictors.
brfss_poly |>
tbl_summary(
by = genhlth_ord,
include = c(age, sex, bmi, exercise),
type = list(
c(age, bmi) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
Excellent/VG N = 2,3801 |
Good N = 1,6241 |
Fair/Poor N = 9961 |
p-value |
|---|---|---|---|---|---|
| Age (years) | 57 (16) | 55 (16) | 58 (16) | 61 (15) | |
| Sex | |||||
| Male | 2,717 (54%) | 1,315 (55%) | 906 (56%) | 496 (50%) | |
| Female | 2,283 (46%) | 1,065 (45%) | 718 (44%) | 500 (50%) | |
| BMI | 28 (6) | 27 (5) | 29 (6) | 30 (8) | |
| Exercise in past 30 days | 3,630 (73%) | 1,999 (84%) | 1,161 (71%) | 470 (47%) | |
| 1 Mean (SD); n (%) | |||||
2a. (5 pts) Fit a multinomial logistic regression
model predicting genhlth_nom from at least 4 predictors
using multinom().
## # weights: 21 (12 variable)
## initial value 5493.061443
## iter 10 value 5056.270643
## final value 4834.786418
## converged
2b. (10 pts) Report the relative risk ratios (exponentiated coefficients) with 95% CIs in a clean table.
## # weights: 21 (12 variable)
## initial value 5493.061443
## iter 10 value 5056.270643
## final value 4834.786418
## converged
tidy(mod_multi, conf.int = TRUE, exponentiate = TRUE) |>
dplyr::select(y.level, term, estimate, conf.low, conf.high, p.value) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3)) |>
kable(col.names = c("Outcome", "Predictor", "RRR", "Lower", "Upper", "p"),
caption = "Multinomial Logistic Regression: RRRs vs. Excellent/VG") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Outcome | Predictor | RRR | Lower | Upper | p |
|---|---|---|---|---|---|
| Good | (Intercept) | 0.090 | 0.056 | 0.144 | < 2e-16 |
| Good | age | 1.016 | 1.012 | 1.021 | 4.91e-14 |
| Good | sexFemale | 0.920 | 0.807 | 1.048 | 0.209 |
| Good | bmi | 1.051 | 1.040 | 1.063 | < 2e-16 |
| Good | exerciseYes | 0.553 | 0.472 | 0.648 | 2.17e-13 |
| Good | smokerCurrent | 1.798 | 1.554 | 2.081 | 3.49e-15 |
| Fair/Poor | (Intercept) | 0.025 | 0.014 | 0.045 | < 2e-16 |
| Fair/Poor | age | 1.029 | 1.023 | 1.035 | < 2e-16 |
| Fair/Poor | sexFemale | 1.045 | 0.890 | 1.226 | 0.592 |
| Fair/Poor | bmi | 1.069 | 1.056 | 1.083 | < 2e-16 |
| Fair/Poor | exerciseYes | 0.215 | 0.181 | 0.256 | < 2e-16 |
| Fair/Poor | smokerCurrent | 2.131 | 1.782 | 2.548 | < 2e-16 |
2c. (5 pts) Interpret the RRR for one predictor in the “Fair/Poor vs. Excellent/VG” comparison.
The relative risk ratio for current smoking in fair/poor vs. excellent/very good comparison is 2.13(95% CI: 1.78,2.55). This means that, holding all other variables constant, current smokers have 2.13 times the relative risk of reporting fair/poor health rather than excellent/very good health compared to former/never smokers.
3a. (5 pts) Fit an ordinal logistic regression model
with the same predictors using polr().
3b. (5 pts) Report the cumulative ORs with 95% CIs.
mod_ord <- polr(
genhlth_nom ~ age + sex + bmi + exercise + smoker,
data = brfss_poly,
Hess = TRUE
)
summary(mod_ord)## Call:
## polr(formula = genhlth_nom ~ age + sex + bmi + exercise + smoker,
## data = brfss_poly, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## age 0.020324 0.001841 11.0393
## sexFemale -0.008869 0.055394 -0.1601
## bmi 0.050074 0.004452 11.2467
## exerciseYes -1.086855 0.063205 -17.1958
## smokerCurrent 0.579879 0.061570 9.4183
##
## Intercepts:
## Value Std. Error t value
## Excellent/VG|Good 1.8581 0.1991 9.3341
## Good|Fair/Poor 3.5276 0.2035 17.3353
##
## Residual Deviance: 9695.134
## AIC: 9709.134
tidy(mod_ord, conf.int = TRUE, exponentiate = TRUE) |>
filter(coef.type == "coefficient") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
kable(col.names = c("Predictor", "OR", "Lower", "Upper"),
caption = "Ordinal Logistic Regression: Cumulative ORs") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | Lower | Upper |
|---|---|---|---|
| age | 1.021 | 1.017 | 1.024 |
| sexFemale | 0.991 | 0.889 | 1.105 |
| bmi | 1.051 | 1.042 | 1.061 |
| exerciseYes | 0.337 | 0.298 | 0.382 |
| smokerCurrent | 1.786 | 1.583 | 2.015 |
3c. (5 pts) Interpret one OR in plain language, making sure to mention the “at every cut-point” property.
The odds ratio for exercise is 0.34(95% CI: 0.30,0.38). This means that individuals who exercised have 66% lower odds of being in a worse general health category compared to those who did not exercise, and this is true at every cut-point.
3d. (10 pts) Use ggpredict() to plot
predicted probabilities of each health category across a continuous
predictor of your choice.
mod_ord <- polr(
genhlth_ord ~ age + sex + bmi + exercise + smoker,
data = brfss_poly,
Hess = TRUE
)
ggpredict(mod_ord, terms = "age") |>
plot() +
labs(
title = "Ordinal Model: Predicted Probability of Each Health Category by Age",
x = "Age",
y = "Predicted Probability"
) +
theme_minimal()4a. (5 pts) Run the Brant test for proportional odds. Does the assumption hold? The assumption does not hold.
## --------------------------------------------
## Test for X2 df probability
## --------------------------------------------
## Omnibus 25.28 5 0
## age 0.14 1 0.71
## sexFemale 2.51 1 0.11
## bmi 6.26 1 0.01
## exerciseYes 12.6 1 0
## smokerCurrent 3.95 1 0.05
## --------------------------------------------
##
## H0: Parallel Regression Assumption holds
4b. (5 pts) Compare the AIC of the multinomial and ordinal models. Which fits better?
The multinomial fits better.
mod_ord <- polr(
genhlth_ord ~ age + sex + bmi + exercise + smoker,
data = brfss_poly,
Hess = TRUE
)
tibble(
Model = c("Multinomial", "Ordinal (proportional odds)"),
AIC = c(AIC(mod_multi), AIC(mod_ord)),
df = c(mod_multi$edf, length(coef(mod_ord)) + length(mod_ord$zeta))
) |>
mutate(AIC = round(AIC, 1)) |>
kable(caption = "Model Comparison") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Model | AIC | df |
|---|---|---|
| Multinomial | 9693.6 | 12 |
| Ordinal (proportional odds) | 9709.1 | 7 |
4c. (5 pts) Based on your results, which model would you recommend reporting? Justify in 2-3 sentences.
The the multinomial model is preferred because it has a lower AIC, which shows an overall better fit. The ordinal model is simpler but the mutlinomial model does a better job at capturing differences between health categories.
Completion credit (25 points): Awarded for a complete, good-faith attempt. Total: 75 + 25 = 100 points.
End of Lab Activity