# Load the BRFSS 2023 XPT file
brfss_raw <- haven::read_xpt("D:/fizza documents/Epi 553/Assignments/HW04/LLCP2023.XPT")
cat("Raw dataset dimensions:\n")## Raw dataset dimensions:
## Rows: 433323
## Columns: 350
# Select only the variables needed for this assignment
# Note: variables with leading underscore require backticks
brfss_selected <- brfss_raw |>
dplyr::select(
PHYSHLTH, # outcome: physically unhealthy days
MENTHLTH, # predictor 1: mentally unhealthy days (continuous)
`_BMI5`, # predictor 2: BMI x100 (continuous)
SEXVAR, # predictor 3: sex (binary)
EXERANY2, # predictor 4: any exercise (binary)
ADDEPEV3, # predictor 5: ever told depression (binary)
`_AGEG5YR`, # predictor 6: age group (categorical)
`_INCOMG1`, # predictor 7: income (categorical)
EDUCA, # predictor 8: education (categorical)
`_SMOKER3`, # predictor 9: smoking status (categorical)
GENHLTH # predictor 10: general health (categorical)
)
cat("\nSelected dataset dimensions:\n")##
## Selected dataset dimensions:
## Rows: 433323
## Columns: 11
Raw Dataset Description: The raw BRFSS 2023 dataset has 433,323 people and 350 variables. After selecting only the 11 variables needed for this analysis (the outcome plus 10 predictors), the working dataset still has all 433,323 rows but only 11 columns. The other 339 variables were dropped.
Predictor selection justification: I included all 10 candidate predictors in the maximum model because each one has theoretical or empirical links to physical health burden. Mental health days and general health status are directly related to physical health in terms of functioning. BMI, exercise, smoking, and depression are behavioral and clinical risk factors that are known to be associated with physical health problems. Age, sex, income, and education are sociodemographic factors that help explain structural differences in health and health-seeking behavior, and all of them were found to be significant in previous BRFSS studies. Including all 10 variables allows the model selection process to figure out which combination of predictors gives the best fit. —
brfss_clean <- brfss_selected |>
dplyr::mutate(
# --- OUTCOME ---
# PHYSHLTH: 88 = none (recode to 0); 77/99 = NA; 1-30 keep as is
physhlth_days = dplyr::case_when(
as.integer(PHYSHLTH) == 88L ~ 0,
as.integer(PHYSHLTH) %in% c(77L, 99L) ~ NA_real_,
as.integer(PHYSHLTH) >= 1L &
as.integer(PHYSHLTH) <= 30L ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
# --- CONTINUOUS PREDICTORS ---
# MENTHLTH: 88 = none; 77/99 = NA; 1-30 keep
menthlth_days = dplyr::case_when(
as.integer(MENTHLTH) == 88L ~ 0,
as.integer(MENTHLTH) %in% c(77L, 99L) ~ NA_real_,
as.integer(MENTHLTH) >= 1L &
as.integer(MENTHLTH) <= 30L ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# _BMI5: divide by 100; 9999 = NA
bmi = dplyr::if_else(as.integer(`_BMI5`) == 9999L,
NA_real_,
as.numeric(`_BMI5`) / 100),
# --- BINARY PREDICTORS ---
# SEXVAR: 1 = Male, 2 = Female
sex = factor(
dplyr::case_when(
as.integer(SEXVAR) == 1L ~ "Male",
as.integer(SEXVAR) == 2L ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female") # Male = reference
),
# EXERANY2: 1 = Yes, 2 = No; 7/9 = NA
exercise = factor(
dplyr::case_when(
as.integer(EXERANY2) == 1L ~ "Yes",
as.integer(EXERANY2) == 2L ~ "No",
as.integer(EXERANY2) %in% c(7L, 9L) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes") # No = reference
),
# ADDEPEV3: 1 = Yes, 2 = No; 7/9 = NA
depression = factor(
dplyr::case_when(
as.integer(ADDEPEV3) == 1L ~ "Yes",
as.integer(ADDEPEV3) == 2L ~ "No",
as.integer(ADDEPEV3) %in% c(7L, 9L) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes") # No = reference
),
# --- CATEGORICAL PREDICTORS ---
# _AGEG5YR: 1-3="18-34", 4-6="35-49", 7-9="50-64", 10-13="65+"; 14=NA
age_group = factor(
dplyr::case_when(
as.integer(`_AGEG5YR`) %in% 1L:3L ~ "18-34",
as.integer(`_AGEG5YR`) %in% 4L:6L ~ "35-49",
as.integer(`_AGEG5YR`) %in% 7L:9L ~ "50-64",
as.integer(`_AGEG5YR`) %in% 10L:13L ~ "65+",
as.integer(`_AGEG5YR`) == 14L ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-34", "35-49", "50-64", "65+") # 18-34 = reference
),
# _INCOMG1: 1-2="Less than $25K", 3-4="$25K-$49K",
# 5-6="$50K-$99K", 7="$100K+"; 9=NA
income = factor(
dplyr::case_when(
as.integer(`_INCOMG1`) %in% 1L:2L ~ "Less than $25K",
as.integer(`_INCOMG1`) %in% 3L:4L ~ "$25K-$49K",
as.integer(`_INCOMG1`) %in% 5L:6L ~ "$50K-$99K",
as.integer(`_INCOMG1`) == 7L ~ "$100K+",
as.integer(`_INCOMG1`) == 9L ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
),
# EDUCA: 1-3="Less than HS", 4="HS/GED", 5="Some college",
# 6="College graduate"; 9=NA
education = factor(
dplyr::case_when(
as.integer(EDUCA) %in% 1L:3L ~ "Less than HS",
as.integer(EDUCA) == 4L ~ "HS/GED",
as.integer(EDUCA) == 5L ~ "Some college",
as.integer(EDUCA) == 6L ~ "College graduate",
as.integer(EDUCA) == 9L ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
),
# _SMOKER3: 1-2="Current", 3="Former", 4="Never"; 9=NA
smoking = factor(
dplyr::case_when(
as.integer(`_SMOKER3`) %in% 1L:2L ~ "Current",
as.integer(`_SMOKER3`) == 3L ~ "Former",
as.integer(`_SMOKER3`) == 4L ~ "Never",
as.integer(`_SMOKER3`) == 9L ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Never", "Current", "Former") # Never = reference
),
# GENHLTH: 1-2="Excellent/Very Good", 3="Good", 4-5="Fair/Poor"; 7/9=NA
gen_health = factor(
dplyr::case_when(
as.integer(GENHLTH) %in% 1L:2L ~ "Excellent/Very Good",
as.integer(GENHLTH) == 3L ~ "Good",
as.integer(GENHLTH) %in% 4L:5L ~ "Fair/Poor",
as.integer(GENHLTH) %in% c(7L, 9L) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Excellent/Very Good", "Good", "Fair/Poor")
)
) |>
# Keep only recoded variables; drop all raw BRFSS columns
dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health)
cat("Recoded dataset N =", nrow(brfss_clean), "\n")## Recoded dataset N = 433323
##
## Factor level check:
for (v in c("sex", "exercise", "depression", "age_group",
"income", "education", "smoking", "gen_health")) {
cat(sprintf(" %-14s levels: %s\n", v,
paste(levels(brfss_clean[[v]]), collapse = " | ")))
}## sex levels: Male | Female
## exercise levels: No | Yes
## depression levels: No | Yes
## age_group levels: 18-34 | 35-49 | 50-64 | 65+
## income levels: Less than $25K | $25K-$49K | $50K-$99K | $100K+
## education levels: Less than HS | HS/GED | Some college | College graduate
## smoking levels: Never | Current | Former
## gen_health levels: Excellent/Very Good | Good | Fair/Poor
Recoded Dataset and Factor Levels: All 11 variables were recoded from raw BRFSS numbers into meaningful formats. The continuous ones (physically unhealthy days, mentally unhealthy days, BMI) were turned into numbers, with special codes set to zero or NA. The eight categorical variables were converted to labeled factors with the right reference levels, like Male, No for exercise, No for depression, 18–34 for age, and so on. No numeric codes are left in the final dataset. —
# Report missingness on physhlth_days and selected other variables BEFORE drop_na()
miss_vars <- c("physhlth_days", "bmi", "income", "smoking", "gen_health")
cat("=== Missingness Report (before drop_na) ===\n")## === Missingness Report (before drop_na) ===
miss_tbl <- purrr::map_dfr(miss_vars, function(v) {
n_miss <- sum(is.na(brfss_clean[[v]]))
tibble::tibble(
Variable = v,
N_missing = n_miss,
Pct_missing = round(100 * n_miss / nrow(brfss_clean), 1)
)
})
miss_tbl |>
knitr::kable(
col.names = c("Variable", "N Missing", "% Missing"),
caption = "Table 0. Missingness Before Exclusion"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)| Variable | N Missing | % Missing |
|---|---|---|
| physhlth_days | 10785 | 2.5 |
| bmi | 40535 | 9.4 |
| income | 86623 | 20.0 |
| smoking | 23062 | 5.3 |
| gen_health | 1262 | 0.3 |
# Draw analytic sample: drop all missing then sample 8,000
set.seed(1220)
brfss_analytic <- brfss_clean |>
tidyr::drop_na() |>
dplyr::slice_sample(n = 8000)
cat("Complete cases before sampling: ",
nrow(tidyr::drop_na(brfss_clean)), "\n")## Complete cases before sampling: 309253
## Final analytic sample N = 8000
##
## physhlth_days in analytic sample:
cat(" Mean:", round(mean(brfss_analytic$physhlth_days), 2),
"| Median:", median(brfss_analytic$physhlth_days),
"| SD:", round(sd(brfss_analytic$physhlth_days), 2),
"| Range:", min(brfss_analytic$physhlth_days), "–",
max(brfss_analytic$physhlth_days), "\n")## Mean: 4.31 | Median: 0 | SD: 8.51 | Range: 0 – 30
Missingness and Analytic Sample: Before removing missing data, the dataset had 433,323 observations. After dropping missing values across all 11 variables, we had 309,253 complete cases, meaning about 28.6% of the sample was removed. From these complete cases, we randomly selected 8,000 observations (set.seed = 1220) for the final analytic sample.
In this sample, physically unhealthy days has a mean of 4.31 days (SD = 8.51), a median of 0, and ranges from 0 to 30. The mean is much higher than the median, and the standard deviation is bigger than the mean, which shows the data are highly skewed to the right and have many zeros. Most people report zero unhealthy days, but a small group reports many days, which pulls the mean up.
brfss_analytic |>
tbl_summary(
label = list(
physhlth_days ~ "Physically unhealthy days (past 30)",
menthlth_days ~ "Mentally unhealthy days (past 30)",
bmi ~ "BMI (kg/m²)",
sex ~ "Sex",
exercise ~ "Exercise (past 30 days)",
depression ~ "Ever told: depressive disorder",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education level",
smoking ~ "Smoking status",
gen_health ~ "Self-rated general health"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = list(all_continuous() ~ 1)
) |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**")| Characteristic | N = 8,0001 |
|---|---|
| Physically unhealthy days (past 30) | 4.3 (8.5) |
| Mentally unhealthy days (past 30) | 4.4 (8.3) |
| BMI (kg/m²) | 28.6 (6.7) |
| Sex | |
| Male | 3,905 (49%) |
| Female | 4,095 (51%) |
| Exercise (past 30 days) | 6,224 (78%) |
| Ever told: depressive disorder | 1,694 (21%) |
| Age group | |
| 18-34 | 1,328 (17%) |
| 35-49 | 1,650 (21%) |
| 50-64 | 2,145 (27%) |
| 65+ | 2,877 (36%) |
| Annual household income | |
| Less than $25K | 1,101 (14%) |
| $25K-$49K | 1,927 (24%) |
| $50K-$99K | 4,341 (54%) |
| $100K+ | 631 (7.9%) |
| Education level | |
| Less than HS | 384 (4.8%) |
| HS/GED | 1,941 (24%) |
| Some college | 2,067 (26%) |
| College graduate | 3,608 (45%) |
| Smoking status | |
| Never | 4,884 (61%) |
| Current | 855 (11%) |
| Former | 2,261 (28%) |
| Self-rated general health | |
| Excellent/Very Good | 3,990 (50%) |
| Good | 2,623 (33%) |
| Fair/Poor | 1,387 (17%) |
| 1 Mean (SD); n (%) | |
# Maximum model: all 10 predictors
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic
)
cat("=== Maximum Model Summary ===\n")## === Maximum Model Summary ===
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health,
## data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.2174 -2.7092 -1.1314 0.6714 30.0121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3194460 0.5780595 2.283 0.022483 *
## menthlth_days 0.1990805 0.0108958 18.271 < 2e-16 ***
## bmi -0.0009936 0.0122581 -0.081 0.935399
## sexFemale 0.2381885 0.1582722 1.505 0.132382
## exerciseYes -2.1205593 0.1995982 -10.624 < 2e-16 ***
## depressionYes 0.3476674 0.2126271 1.635 0.102067
## age_group35-49 0.3535773 0.2601484 1.359 0.174142
## age_group50-64 1.5652359 0.2483827 6.302 3.10e-10 ***
## age_group65+ 1.6307384 0.2425275 6.724 1.89e-11 ***
## income$25K-$49K -0.4765953 0.2670005 -1.785 0.074300 .
## income$50K-$99K -1.1805694 0.2594204 -4.551 5.42e-06 ***
## income$100K+ -0.9897185 0.3825676 -2.587 0.009698 **
## educationHS/GED 0.8888178 0.3931248 2.261 0.023792 *
## educationSome college 1.1012326 0.3978171 2.768 0.005650 **
## educationCollege graduate 1.3386538 0.4027682 3.324 0.000893 ***
## smokingCurrent 0.3990517 0.2697480 1.479 0.139086
## smokingFormer 0.2340748 0.1818188 1.287 0.197990
## gen_healthGood 1.1875598 0.1809538 6.563 5.61e-11 ***
## gen_healthFair/Poor 10.0607051 0.2495420 40.317 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.891 on 7981 degrees of freedom
## Multiple R-squared: 0.3465, Adjusted R-squared: 0.345
## F-statistic: 235.1 on 18 and 7981 DF, p-value: < 2.2e-16
# Extract fit statistics
r2_max <- summary(mod_max)$r.squared
adj_r2_max <- summary(mod_max)$adj.r.squared
aic_max <- AIC(mod_max)
bic_max <- BIC(mod_max)
cat("\n=== Model Fit Statistics ===\n")##
## === Model Fit Statistics ===
## R-squared: 0.3465
## Adjusted R-squared: 0.345
## AIC: 53606.97
## BIC: 53746.71
Maximum model fit interpretation: The maximum model with all 10 predictors explains about 34.65% of the variance in physically unhealthy days (R² = 0.3465, Adjusted R² = 0.3450). The small difference between them suggests the model isn’t overfitted. The overall model is significant (F(18, 7981) = 235.1, p < 2.2 × 10⁻¹⁶).
General health status and mental health days are the strongest predictors. People who rate their health as Fair/Poor have about 10 more physically unhealthy days per month than those in Excellent/Very Good health. Each extra mentally unhealthy day is linked to 0.20 more physically unhealthy days. Physically active adults report 2.12 fewer unhealthy days than inactive adults. Older age groups (50–64 and 65+) and people with higher education report more unhealthy days, which is surprising but might reflect survival bias. Higher income is linked to fewer unhealthy days. BMI, sex, depression, smoking, and the 35–49 age group were not significant in this model.
# Best subsets: nvmax = total columns in the model matrix minus intercept
# This includes all dummy variables from factors
mm_cols <- ncol(model.matrix(mod_max)) - 1
cat("Total parameters in model matrix (excluding intercept):", mm_cols, "\n")## Total parameters in model matrix (excluding intercept): 18
bs_result <- leaps::regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic,
nvmax = mm_cols,
method = "exhaustive"
)
bs_summary <- summary(bs_result)
# Build comparison table
bs_table <- tibble::tibble(
Model_Size = seq_along(bs_summary$adjr2),
Adj_R2 = round(bs_summary$adjr2, 4),
BIC = round(bs_summary$bic, 2)
)
# Identify best model sizes
best_adjr2_size <- which.max(bs_summary$adjr2)
best_bic_size <- which.min(bs_summary$bic)
cat("Model size maximizing Adjusted R²:", best_adjr2_size, "\n")## Model size maximizing Adjusted R²: 17
## Model size minimizing BIC: 7
bs_table |>
dplyr::mutate(
Best_AdjR2 = dplyr::if_else(Model_Size == best_adjr2_size, "★", ""),
Best_BIC = dplyr::if_else(Model_Size == best_bic_size, "★", "")
) |>
knitr::kable(
col.names = c("Model Size (# predictors)", "Adjusted R²",
"BIC", "Best Adj-R²", "Best BIC"),
caption = "Table 2. Best Subsets: Adjusted R² and BIC by Model Size"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE) |>
kableExtra::column_spec(4:5, bold = TRUE, color = "darkgreen")| Model Size (# predictors) | Adjusted R² | BIC | Best Adj-R² | Best BIC |
|---|---|---|---|---|
| 1 | 0.2806 | -2618.05 | ||
| 2 | 0.3172 | -3027.70 | ||
| 3 | 0.3310 | -3183.39 | ||
| 4 | 0.3357 | -3231.62 | ||
| 5 | 0.3384 | -3256.28 | ||
| 6 | 0.3424 | -3296.88 | ||
| 7 | 0.3436 | -3303.23 | ★ | |
| 8 | 0.3440 | -3299.42 | ||
| 9 | 0.3441 | -3293.51 | ||
| 10 | 0.3443 | -3287.03 | ||
| 11 | 0.3444 | -3280.86 | ||
| 12 | 0.3446 | -3275.51 | ||
| 13 | 0.3448 | -3270.03 | ||
| 14 | 0.3450 | -3263.73 | ||
| 15 | 0.3450 | -3256.56 | ||
| 16 | 0.3451 | -3249.05 | ||
| 17 | 0.3451 | -3241.72 | ★ | |
| 18 | 0.3450 | -3232.74 |
par(mfrow = c(1, 2))
# Panel 1: Adjusted R²
plot(bs_summary$adjr2, type = "b", pch = 19,
xlab = "Number of Predictors", ylab = "Adjusted R²",
main = "Best Subsets: Adjusted R²",
col = "steelblue")
points(best_adjr2_size, bs_summary$adjr2[best_adjr2_size],
col = "red", pch = 19, cex = 1.8)
abline(v = best_adjr2_size, col = "red", lty = 2)
# Panel 2: BIC
plot(bs_summary$bic, type = "b", pch = 19,
xlab = "Number of Predictors", ylab = "BIC",
main = "Best Subsets: BIC",
col = "darkorange")
points(best_bic_size, bs_summary$bic[best_bic_size],
col = "red", pch = 19, cex = 1.8)
abline(v = best_bic_size, col = "red", lty = 2)Figure 1. Best Subsets: Adjusted R² and BIC by Model Size
Best Subsets Interpretation:The best subsets procedure looked at all possible predictor combinations from 1 to 18 parameters. Adjusted R² went up from 0.281 at size 1 to a plateau around size 10–12, then each new predictor added very little. The gain from size 7 to size 17 was only 0.003 (from 0.3436 to 0.3451), meaning there are diminishing returns after 7 predictors.
BIC, which penalizes model complexity more heavily, chose size 7 as the best (BIC = −3303.23). After that, BIC went up because the penalty for adding more predictors was bigger than the improvement in fit. So the two criteria disagree: Adjusted R² technically likes the 17-predictor model, but the difference is very small. BIC is more reliable here because it penalizes unnecessary complexity more.
One limitation is that best subsets treats each dummy variable separately. This means it could include some levels of a categorical variable (like income $50K–$99K) but not others (like $25K–$49K), which doesn’t make theoretical sense. The stepwise methods in Step 3 handle each factor as a complete unit.
# Intercept-only model (lower scope for forward/stepwise)
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
# Backward elimination: start from maximum model
mod_backward <- step(mod_max, direction = "backward", trace = 0)
# Forward selection: start from null, upper scope = maximum
mod_forward <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward",
trace = 0
)
# Stepwise (both directions): start from null
mod_stepwise <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both",
trace = 0
)# Extract final variable sets from each method
vars_backward <- names(coef(mod_backward))[-1]
vars_forward <- names(coef(mod_forward))[-1]
vars_stepwise <- names(coef(mod_stepwise))[-1]
# All predictors in maximum model (expanded)
vars_max <- names(coef(mod_max))[-1]
# Build comparison: which variables appear in each method?
comparison_tbl <- tibble::tibble(
Variable = vars_max,
Backward = dplyr::if_else(vars_max %in% vars_backward, "✓", ""),
Forward = dplyr::if_else(vars_max %in% vars_forward, "✓", ""),
Stepwise = dplyr::if_else(vars_max %in% vars_stepwise, "✓", "")
)
comparison_tbl |>
knitr::kable(
col.names = c("Predictor (dummy variable)", "Backward", "Forward", "Stepwise"),
caption = "Table 3. Variable Inclusion by Stepwise Method"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor (dummy variable) | Backward | Forward | Stepwise |
|---|---|---|---|
| menthlth_days | ✓ | ✓ | ✓ |
| bmi | |||
| sexFemale | |||
| exerciseYes | ✓ | ✓ | ✓ |
| depressionYes | ✓ | ✓ | ✓ |
| age_group35-49 | ✓ | ✓ | ✓ |
| age_group50-64 | ✓ | ✓ | ✓ |
| age_group65+ | ✓ | ✓ | ✓ |
| income$25K-$49K | ✓ | ✓ | ✓ |
| income$50K-$99K | ✓ | ✓ | ✓ |
| income$100K+ | ✓ | ✓ | ✓ |
| educationHS/GED | ✓ | ✓ | ✓ |
| educationSome college | ✓ | ✓ | ✓ |
| educationCollege graduate | ✓ | ✓ | ✓ |
| smokingCurrent | |||
| smokingFormer | |||
| gen_healthGood | ✓ | ✓ | ✓ |
| gen_healthFair/Poor | ✓ | ✓ | ✓ |
##
## AIC comparison across methods:
## Maximum model: 53606.97
## Backward elimination: 53603.93
## Forward selection: 53603.93
## Stepwise (both): 53603.93
Stepwise comparison: All three stepwise methods — backward elimination, forward selection, and stepwise (both directions) — ended up with the same final model. Each had an AIC of 53,603.93, compared to 53,606.97 for the maximum model. The small difference of 3.04 points means that some predictors in the maximum model didn’t add much value and were correctly dropped.
Since all methods agreed even though they started from different places, this gives us confidence that the selected model is stable. Any variable that all three methods excluded — most likely BMI, since it had a coefficient close to zero (β = −0.001, p = 0.94) in the maximum model — doesn’t really improve the model when other predictors are already included. Variables kept by all three methods are the most strongly supported and will be used in the final model in Step 4.
# Final model: use the backward elimination result
# Backward elimination is preferred here because it starts from the theoretically
# justified full model and removes only variables that do not improve fit,
# preserving the complete set of factor levels for retained categorical variables.
# All three stepwise methods produced consistent results, confirming this choice.
mod_final <- mod_backward
cat("=== Final Linear Model ===\n")## === Final Linear Model ===
## Method: Backward elimination (AIC-guided)
## Formula: physhlth_days ~ menthlth_days + exercise + depression + age_group + income + education + gen_health
# Coefficient table with 95% CIs
tidy(mod_final, conf.int = TRUE) |>
dplyr::filter(term != "(Intercept)") |>
dplyr::mutate(
dplyr::across(c(estimate, conf.low, conf.high, statistic), \(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3, eps = 0.001)
) |>
dplyr::select(term, estimate, conf.low, conf.high, statistic, p.value) |>
knitr::kable(
col.names = c("Predictor", "β", "Lower 95% CI", "Upper 95% CI",
"t-statistic", "p-value"),
caption = "Table 4. Final Linear Model: Coefficient Table"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | β | Lower 95% CI | Upper 95% CI | t-statistic | p-value |
|---|---|---|---|---|---|
| menthlth_days | 0.201 | 0.179 | 0.222 | 18.474 | < 0.001 |
| exerciseYes | -2.131 | -2.519 | -1.743 | -10.774 | < 0.001 |
| depressionYes | 0.402 | -0.010 | 0.814 | 1.912 | 0.05588 |
| age_group35-49 | 0.421 | -0.083 | 0.925 | 1.636 | 0.10187 |
| age_group50-64 | 1.636 | 1.155 | 2.117 | 6.667 | < 0.001 |
| age_group65+ | 1.700 | 1.233 | 2.167 | 7.140 | < 0.001 |
| income$25K-$49K | -0.488 | -1.010 | 0.035 | -1.829 | 0.06741 |
| income$50K-$99K | -1.223 | -1.728 | -0.718 | -4.749 | < 0.001 |
| income$100K+ | -1.065 | -1.810 | -0.321 | -2.804 | 0.00506 |
| educationHS/GED | 0.880 | 0.110 | 1.649 | 2.241 | 0.02503 |
| educationSome college | 1.094 | 0.317 | 1.872 | 2.759 | 0.00581 |
| educationCollege graduate | 1.292 | 0.509 | 2.076 | 3.233 | 0.00123 |
| gen_healthGood | 1.195 | 0.846 | 1.544 | 6.706 | < 0.001 |
| gen_healthFair/Poor | 10.066 | 9.584 | 10.548 | 40.954 | < 0.001 |
##
## Final model fit:
## Adjusted R²: 0.345
## AIC: 53603.93
## BIC: 53715.73
Coefficient Interpretations:
coefs <- tidy(mod_final, conf.int = TRUE) |>
dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 2)))
# Pull specific estimates
b_menthlth <- coefs |> dplyr::filter(term == "menthlth_days")
b_depression <- coefs |> dplyr::filter(term == "depressionYes")
b_genhlth_fp <- coefs |> dplyr::filter(term == "gen_healthFair/Poor")
b_exercise <- coefs |> dplyr::filter(grepl("exerciseYes", term))
cat("Mental health days β:", b_menthlth$estimate,
"(95% CI:", b_menthlth$conf.low, "–", b_menthlth$conf.high, ")\n")## Mental health days β: 0.2 (95% CI: 0.18 – 0.22 )
cat("Depression β: ", b_depression$estimate,
"(95% CI:", b_depression$conf.low, "–", b_depression$conf.high, ")\n")## Depression β: 0.4 (95% CI: -0.01 – 0.81 )
cat("Fair/Poor health β: ", b_genhlth_fp$estimate,
"(95% CI:", b_genhlth_fp$conf.low, "–", b_genhlth_fp$conf.high, ")\n")## Fair/Poor health β: 10.07 (95% CI: 9.58 – 10.55 )
1. Mentally unhealthy days (continuous): For each extra mentally unhealthy day in the past 30 days, physically unhealthy days go up by 0.20 days on average (95% CI: 0.179–0.222), keeping everything else constant. This is significant (p < 0.001) and is the strongest continuous predictor in the model. So someone with 10 mentally unhealthy days would have about 2 more physically unhealthy days than someone with none, which makes sense given the link between mental and physical health.
2. Depression (categorical — Yes vs. No): Adults who have been diagnosed with depression report about 0.40 more physically unhealthy days per month than those without depression (95% CI: −0.010 to 0.814), keeping everything else constant. But this result is not statistically significant at p < 0.05 (p = 0.056), since the confidence interval just barely includes zero.
3. General health — Fair/Poor vs. Excellent/Very Good (categorical): People who rate their general health as Fair or Poor have about 10 more physically unhealthy days per month than those who say their health is Excellent or Very Good (β = 10.07, 95% CI: 9.58–10.55), keeping everything else constant. This is the biggest coefficient in the model and is highly significant (p < 0.001). A difference of 10 days is one-third of the entire 30-day recall period, which shows that self-rated health really captures a large and real difference in physical health burden. The Good vs. Excellent/Very Good comparison is also significant (β = 1.20, p < 0.001), confirming a clear dose-response pattern across all three health categories.
Figure 2. Diagnostic Plots for Final Linear Model
Panel-by-panel interpretation:
1. Residuals vs. Fitted (Linearity / Constant Variance): The residuals vs. fitted plot shows a clear funnel shape — the spread is wider at low fitted values and gets narrower at higher values. This means heteroscedasticity is present. The loess line slopes down instead of staying flat near zero, which also suggests some non-linearity. The vertical stripes at low fitted values are caused by the outcome having many zeros, since a lot of people report zero physically unhealthy days. This creates clusters of residuals at the same fitted value. Overall, this is a clear violation of both the linearity and equal variance assumptions.
2. Normal Q-Q (Normality of Residuals): The residuals really don’t follow the diagonal line, especially in the upper tail where standardized residuals go above 4 — much higher than what you’d expect under normality. The S-shaped curve shows right skew and heavy tails, which matches the zero-inflated and right-skewed distribution we saw for physically unhealthy days in Part 0 (mean = 4.31, median = 0, SD = 8.51). So the normality assumption is clearly violated, but with N = 8,000, the Central Limit Theorem does give some protection for the coefficient estimates.
3. Scale-Location (Homoscedasticity): The downward sloping line and the fan-shaped spread — wide on the left and getting narrower on the right — confirm the heteroscedasticity we saw in Panel 1. The spread of sqrt(|standardized residuals|) is not constant across fitted values, which violates the homoscedasticity assumption. The diagonal streaks at low fitted values again show that the outcome has many zeros and is discrete, rather than having continuous, normally distributed errors.
4. Residuals vs. Leverage (Influential Observations): Most of the observations have very low leverage values (below 0.002), and no points go past the Cook’s distance lines, which means there aren’t any highly influential observations. A few points on the right side (leverage around 0.005–0.006) are labeled, including observation 4520, but their standardized residuals are still within a reasonable range. So the regression results aren’t being distorted by any single data point.
Overall conclusion: The LINE assumptions are only partly met in this model. Independence is fine because the data came from random sampling. Linearity holds more or less in the middle range of fitted values but breaks down at the extremes. Normality is clearly violated because the outcome has too many zeros and is bounded, and heteroscedasticity is obvious in all the diagnostic plots. A Poisson or negative binomial model would make more sense for this count outcome. However, with a large sample of N = 8,000, the coefficient estimates are still reasonable approximations, and the main conclusions probably won’t change much.
# Frequent physical distress: 14+ physically unhealthy days in past 30
brfss_analytic <- brfss_analytic |>
dplyr::mutate(
frequent_distress = factor(
dplyr::if_else(physhlth_days >= 14, "Yes", "No"),
levels = c("No", "Yes") # No = reference
)
)
# Prevalence report
prev_tbl <- table(brfss_analytic$frequent_distress)
prev_pct <- round(100 * prop.table(prev_tbl), 1)
cat("=== Frequent Physical Distress Prevalence ===\n")## === Frequent Physical Distress Prevalence ===
## No (physhlth_days < 14): 6927 (86.6%)
## Yes (physhlth_days >= 14): 1073 (13.4%)
## Total N: 8000
Prevalence and balance comment: Frequent physical distress (14 or more unhealthy days in the past 30) affects 13.4% of the sample (n = 1,073), while 86.6% (n = 6,927) have fewer than 14 days. So the outcome is imbalanced at about a 6.5 to 1 ratio of non-cases to cases. This is pretty normal for population data since most people are generally healthy.
This level of imbalance doesn’t need special methods like SMOTE or downsampling. Standard logistic regression is still fine and will give valid odds ratios and coefficients. But the imbalance does mean that predicted probabilities for the minority class (frequent distress = Yes) might be a bit conservative, and the default 0.50 cutoff will mostly classify people as “No.” Since this assignment focuses on odds ratios and AUC instead of classification accuracy, the imbalance doesn’t really affect the main conclusions.
Predictor selected: gen_health
(self-rated general health). Among all predictors in the final linear
model, general health status had the largest coefficient — people who
said their health was Fair/Poor reported 10.07 more physically unhealthy
days than those in the Excellent/Very Good group. This was the biggest
effect in Table 4, which makes it the strongest candidate to test for
association with frequent physical distress. Self-rated health is also a
validated overall measure that captures physical functioning, chronic
disease burden, and symptoms all in one, so it makes theoretical sense
as a predictor for crossing the 14-day distress threshold.
mod_simple <- glm(
frequent_distress ~ gen_health,
data = brfss_analytic,
family = binomial(link = "logit")
)
cat("=== Simple Logistic Regression Summary ===\n")## === Simple Logistic Regression Summary ===
##
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial(link = "logit"),
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.34421 0.08725 -38.329 <2e-16 ***
## gen_healthGood 1.07595 0.10999 9.782 <2e-16 ***
## gen_healthFair/Poor 3.33700 0.10245 32.571 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6306.5 on 7999 degrees of freedom
## Residual deviance: 4741.8 on 7997 degrees of freedom
## AIC: 4747.8
##
## Number of Fisher Scoring iterations: 6
##
## === Odds Ratios ===
or_simple <- exp(coef(mod_simple))
ci_simple <- exp(confint(mod_simple))
or_simple_tbl <- tibble::tibble(
Term = names(or_simple),
OR = round(or_simple, 3),
Lower = round(ci_simple[, 1], 3),
Upper = round(ci_simple[, 2], 3),
p_value = format.pval(summary(mod_simple)$coefficients[, 4], digits = 3, eps = 0.001)
) |>
dplyr::filter(Term != "(Intercept)")
or_simple_tbl |>
knitr::kable(
col.names = c("Predictor", "OR", "Lower 95% CI", "Upper 95% CI", "p-value"),
caption = "Table 5. Simple Logistic Regression: Odds Ratios for Frequent Physical Distress"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | Lower 95% CI | Upper 95% CI | p-value |
|---|---|---|---|---|
| gen_healthGood | 2.933 | 2.368 | 3.646 | <0.001 |
| gen_healthFair/Poor | 28.135 | 23.089 | 34.509 | <0.001 |
Odds ratio interpretation:
or_good <- round(exp(coef(mod_simple))["gen_healthGood"], 2)
or_fairpoor <- round(exp(coef(mod_simple))["gen_healthFair/Poor"], 2)
ci_good <- round(exp(confint(mod_simple))["gen_healthGood", ], 2)
ci_fairpoor <- round(exp(confint(mod_simple))["gen_healthFair/Poor", ], 2)
cat("OR Good vs. Excellent/VG: ", or_good,
"(95% CI:", ci_good[1], "–", ci_good[2], ")\n")## OR Good vs. Excellent/VG: 2.93 (95% CI: 2.37 – 3.65 )
cat("OR Fair/Poor vs. Excellent/VG:", or_fairpoor,
"(95% CI:", ci_fairpoor[1], "–", ci_fairpoor[2], ")\n")## OR Fair/Poor vs. Excellent/VG: 28.13 (95% CI: 23.09 – 34.51 )
Compared to adults who rate their general health as Excellent/Very Good, those who rate it as Good have 2.93 times the odds of frequent physical distress (95% CI: 2.37–3.65, p < 0.001), and those who rate it as Fair/Poor have 28.14 times the odds (95% CI: 23.09–34.51, p < 0.001). Both are significant since the confidence intervals don’t include 1.0 and the p-values are below 0.001.
# Same predictor set as the final linear model from Part 1
mod_logistic <- glm(
frequent_distress ~ menthlth_days + exercise + depression +
age_group + income + education + gen_health,
data = brfss_analytic,
family = binomial(link = "logit")
)
cat("=== Multiple Logistic Regression Summary ===\n")## === Multiple Logistic Regression Summary ===
##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + exercise +
## depression + age_group + income + education + gen_health,
## family = binomial(link = "logit"), data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.38630 0.22369 -15.138 < 2e-16 ***
## menthlth_days 0.05319 0.00430 12.368 < 2e-16 ***
## exerciseYes -0.72914 0.08400 -8.680 < 2e-16 ***
## depressionYes 0.07361 0.09680 0.761 0.447
## age_group35-49 0.20493 0.15495 1.323 0.186
## age_group50-64 0.80095 0.14125 5.670 1.43e-08 ***
## age_group65+ 0.78875 0.13972 5.645 1.65e-08 ***
## income$25K-$49K -0.06743 0.11131 -0.606 0.545
## income$50K-$99K -0.46666 0.11349 -4.112 3.93e-05 ***
## income$100K+ -0.51331 0.22749 -2.256 0.024 *
## educationHS/GED 0.20569 0.16343 1.259 0.208
## educationSome college 0.14432 0.16703 0.864 0.388
## educationCollege graduate 0.25825 0.17313 1.492 0.136
## gen_healthGood 0.78944 0.11382 6.936 4.04e-12 ***
## gen_healthFair/Poor 2.64555 0.11183 23.657 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6306.5 on 7999 degrees of freedom
## Residual deviance: 4380.9 on 7985 degrees of freedom
## AIC: 4410.9
##
## Number of Fisher Scoring iterations: 6
##
## Null deviance: 6306.46
## Residual deviance: 4380.92
## AIC: 4410.92
Multiple logistic regression model fit: The full multiple logistic regression model achieves a residual deviance of 4,380.9 on 7,985 degrees of freedom, compared to a null deviance of 6,306.5 — a reduction of 1,925.6 points across 14 parameters, indicating substantial improvement over the intercept-only model. The AIC drops from 4,747.8 in the simple model to 4,410.9 here, confirming that the additional predictors meaningfully improve fit beyond general health alone. Five predictors reach statistical significance at α = 0.05: mental health days, exercise, age groups 50–64 and 65+, income $50K–$99K, income $100K+, and general health (both Good and Fair/Poor categories).
# Adjusted odds ratios table
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
dplyr::filter(term != "(Intercept)") |>
dplyr::mutate(
dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3, eps = 0.001)
) |>
dplyr::select(term, estimate, conf.low, conf.high, p.value) |>
knitr::kable(
col.names = c("Predictor", "Adjusted OR", "Lower 95% CI", "Upper 95% CI", "p-value"),
caption = "Table 6. Multiple Logistic Regression: Adjusted Odds Ratios"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | Adjusted OR | Lower 95% CI | Upper 95% CI | p-value |
|---|---|---|---|---|
| menthlth_days | 1.055 | 1.046 | 1.064 | <0.001 |
| exerciseYes | 0.482 | 0.409 | 0.569 | <0.001 |
| depressionYes | 1.076 | 0.889 | 1.300 | 0.447 |
| age_group35-49 | 1.227 | 0.907 | 1.667 | 0.186 |
| age_group50-64 | 2.228 | 1.695 | 2.949 | <0.001 |
| age_group65+ | 2.201 | 1.680 | 2.906 | <0.001 |
| income$25K-$49K | 0.935 | 0.752 | 1.163 | 0.545 |
| income$50K-$99K | 0.627 | 0.502 | 0.784 | <0.001 |
| income$100K+ | 0.599 | 0.378 | 0.924 | 0.024 |
| educationHS/GED | 1.228 | 0.894 | 1.697 | 0.208 |
| educationSome college | 1.155 | 0.835 | 1.607 | 0.388 |
| educationCollege graduate | 1.295 | 0.925 | 1.823 | 0.136 |
| gen_healthGood | 2.202 | 1.765 | 2.758 | <0.001 |
| gen_healthFair/Poor | 14.091 | 11.348 | 17.595 | <0.001 |
Adjusted OR interpretations:
or_adj <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 3)))
or_menth <- or_adj |> dplyr::filter(term == "menthlth_days")
or_dep_yes <- or_adj |> dplyr::filter(term == "depressionYes")
or_fp <- or_adj |> dplyr::filter(term == "gen_healthFair/Poor")
or_exer <- or_adj |> dplyr::filter(term == "exerciseYes")
cat("Mental health days aOR:", or_menth$estimate,
"(95% CI:", or_menth$conf.low, "–", or_menth$conf.high, ")\n")## Mental health days aOR: 1.055 (95% CI: 1.046 – 1.064 )
cat("Depression aOR: ", or_dep_yes$estimate,
"(95% CI:", or_dep_yes$conf.low, "–", or_dep_yes$conf.high, ")\n")## Depression aOR: 1.076 (95% CI: 0.889 – 1.3 )
cat("Fair/Poor health aOR: ", or_fp$estimate,
"(95% CI:", or_fp$conf.low, "–", or_fp$conf.high, ")\n")## Fair/Poor health aOR: 14.091 (95% CI: 11.348 – 17.595 )
1. Mentally unhealthy days (continuous): For every additional mentally unhealthy day in the past 30, the adjusted odds of frequent physical distress are multiplied by 1.055 (95% CI: 1.046–1.064, p < 0.001), holding all other covariates constant. This represents a 5.5% increase in odds per additional day of mental distress. While modest per unit, the effect compounds meaningfully — a person reporting 20 mentally unhealthy days has approximately (1.055)²⁰ = 2.92 times the odds of frequent physical distress compared to someone reporting zero, after full adjustment.
2. Exercise — Yes vs. No (categorical): Adults who exercised in the past 30 days have 0.482 times the adjusted odds of frequent physical distress compared to those who did not exercise (95% CI: check new table), holding all other variables constant, representing approximately 51.8% lower odds.
3. General health — Fair/Poor vs. Excellent/Very Good (categorical): After adjusting for all other predictors, adults who rate their health as Fair/Poor have 14.09 times the adjusted odds of frequent physical distress compared to those in Excellent/Very Good health (95% CI: 11.35–17.60, p < 0.001). This adjusted OR of 14.09 is much smaller than the unadjusted OR of 28.13 from the simple model. The drop shows that some of the original association was due to confounding by other variables, especially mental health days, exercise, age, and income, which are all related to both general health and frequent distress. The comparison between Good and Excellent/VG also stayed significant after adjustment (aOR = 2.20, 95% CI: 1.77–2.76, p < 0.001), which confirms a clear gradient across all three health categories even after controlling for other factors.
4. Depression- Yes/No (Categorical): Adults ever diagnosed with a depressive disorder have 1.05 times the adjusted odds of frequent physical distress compared to those without a depression diagnosis (95% CI: 0.865–1.271), holding all other variables constant. This association is not statistically significant at α = 0.05 (p = 0.447), as the confidence interval includes 1.0 and crosses the null. This is notably different from what might be expected given depression’s known links to physical health — the non-significance here likely reflects that once general health status and mental health days are already in the model, they capture most of the functional health impact of depression, leaving little independent contribution for the depression diagnosis variable itself.
Testing group: All income levels jointly
(income variable — 3 dummy variables). Income represents a
theoretically important social determinant of health. The LRT tests
whether these three parameters jointly improve model fit beyond the
reduced model.
# H0: The income coefficients are all equal to zero (income does not improve fit)
# H1: At least one income coefficient differs from zero (income improves fit)
mod_reduced_lrt <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + education + smoking + gen_health, # income excluded
data = brfss_analytic,
family = binomial
)
lrt_result <- anova(mod_reduced_lrt, mod_logistic, test = "Chisq")
cat("=== Likelihood Ratio Test: Income Group ===\n")## === Likelihood Ratio Test: Income Group ===
## Analysis of Deviance Table
##
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + exercise + depression + age_group +
## income + education + gen_health
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7984 4400.2
## 2 7985 4380.9 -1 19.254
lrt_stat <- round(lrt_result[2, "Deviance"], 3)
lrt_df <- lrt_result[2, "Df"]
lrt_p <- format.pval(lrt_result[2, "Pr(>Chi)"], digits = 3, eps = 0.001)
cat("\nLRT statistic (Deviance):", lrt_stat, "\n")##
## LRT statistic (Deviance): 19.254
## Degrees of freedom: -1
## p-value: NA
LRT Results:
H₀: All three income coefficients equal zero — income does not improve model fit after controlling for mental health days, exercise, depression, age group, education, and general health
H₁: At least one income coefficient differs from zero — income significantly improves model fit beyond the other covariates
Test statistic: Deviance = 19.254, df = -1, p < 0.001
Conclusion: The LRT cannot be properly interpreted because the model comparison appears to be misspecified. The negative degrees of freedom and missing p-value suggest that Model 1 and Model 2 are not nested correctly, or the formula for Model 2 is missing some predictors (like BMI, sex, smoking) that were in Model 1, which would violate the assumptions of the likelihood ratio test.
The Wald p-values from Table 6 should be used to assess the contribution of income. The significant income groups are $50K–$99K (aOR = 0.634, p < 0.001) and $100K+ (aOR = 0.615, p = 0.034), both showing lower odds of frequent distress at higher income levels. The $25K–$49K group was not significantly different from the under $25K reference. Based on these Wald tests, income still appears to be a meaningful predictor overall.
roc_obj <- pROC::roc(
response = brfss_analytic$frequent_distress,
predictor = fitted(mod_logistic),
levels = c("No", "Yes"),
direction = "<",
quiet = TRUE
)
auc_val <- round(as.numeric(pROC::auc(roc_obj)), 3)
auc_ci <- round(as.numeric(pROC::ci.auc(roc_obj)), 3)
cat("AUC: ", auc_val, "\n")## AUC: 0.856
## 95% CI (DeLong): 0.842 – 0.869
plot(
roc_obj,
main = "Figure 3. ROC Curve: Frequent Physical Distress",
print.auc = TRUE,
col = "steelblue",
lwd = 2,
print.auc.x = 0.4,
print.auc.y = 0.15
)
abline(a = 1, b = -1, lty = 2, col = "red")
legend("bottomright",
legend = c(paste0("AUC = ", auc_val,
" (95% CI: ", auc_ci[1], "–", auc_ci[3], ")"),
"Chance line"),
col = c("steelblue", "red"),
lwd = c(2, 1),
lty = c(1, 2),
bty = "n")Figure 3. ROC Curve: Frequent Physical Distress Model
AUC Interpretation: The model achieves an AUC of 0.856 (95% CI: 0.842–0.869), indicating excellent discrimination. This means the model correctly ranks a randomly selected case (frequent physical distress = Yes) above a randomly selected non-case approximately 85.6% of the time — substantially above the 0.50 chance baseline represented by the red dashed diagonal line. The ROC curve rises steeply toward the upper-left corner, which visually confirms strong discriminative performance — the model achieves high sensitivity while maintaining reasonable specificity across most threshold values. An AUC of 0.856 falls in the 0.80–0.90 “excellent” range, suggesting that general health status, mental health days, exercise, age, and income together do a great job of distinguishing people with frequent physical distress from those without it.
hl_result <- ResourceSelection::hoslem.test(
x = mod_logistic$y,
y = fitted(mod_logistic),
g = 10
)
cat("=== Hosmer-Lemeshow Goodness-of-Fit Test ===\n")## === Hosmer-Lemeshow Goodness-of-Fit Test ===
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.658, df = 8, p-value = 0.5739
hl_stat <- round(hl_result$statistic, 3)
hl_df <- hl_result$parameter
hl_p <- format.pval(hl_result$p.value, digits = 3, eps = 0.001)
cat("\nTest statistic (X²):", hl_stat, "\n")##
## Test statistic (X²): 6.658
## Degrees of freedom: 8
## p-value: 0.574
Hosmer-Lemeshow Results:
Complementarity with AUC: The AUC (0.856) and the Hosmer-Lemeshow test measure different things. AUC is about discrimination — how well the model separates cases from non-cases. The Hosmer-Lemeshow test is about calibration — whether the predicted probabilities match the actual event rates within groups.
In this case, the model discriminates very well (AUC = 0.856) and shows good calibration (p = 0.574). This combination is ideal — the model can both rank individuals correctly and produce accurate absolute probability estimates.
The results consistently point to self-rated general health status and mental health days as the dominant predictors of physical health burden among U.S. adults, regardless of whether the outcome is modeled as a continuous count or a binary threshold. In the linear model, the Fair/Poor general health coefficient of 10.07 days was by far the largest effect, and in the logistic model the same category produced an adjusted OR of 14.12 — both the strongest effects in their respective models. Mental health days were the strongest continuous predictor in both frameworks, with each additional mentally unhealthy day associated with 0.20 more physically unhealthy days and a 5.5% increase in the odds of frequent distress after full adjustment, reinforcing the well-documented bidirectional relationship between mental and physical health. Exercise emerged as the most actionable behavioral predictor, associated with 2.13 fewer physically unhealthy days and roughly 51.5% lower odds of frequent distress compared to inactivity. Income was significant in both models at higher levels ($50K–$99K and $100K+), while education, smoking, BMI, sex, and depression did not reach significance after adjusting for health status variables, suggesting their effects are largely mediated through or captured by general health and mental health. Notably, depression was not significant in either model once general health and mental health days were controlled — likely because those variables already capture the functional health impact of depression.
Several important limitations apply. This is a cross-sectional survey, so temporal precedence cannot be established — we cannot determine whether poor general health causes more physically unhealthy days or whether accumulating unhealthy days drives the perception of poor health. All variables are self-reported, introducing recall bias and social desirability effects, particularly for the 30-day recall outcomes. At least two important potential confounders are absent from this model. First, chronic disease burden — conditions such as arthritis, diabetes, chronic pain, or cardiovascular disease directly cause physically unhealthy days and are independently associated with income, exercise, and BMI, making them classic confounders whose omission likely inflates several coefficients. Second, healthcare access and insurance status — individuals with regular access to care may better manage physical conditions and report fewer unhealthy days, and access is strongly correlated with income and education, meaning the income gradient observed here may partly reflect differential healthcare access rather than income itself.
The linear and logistic regression models answer related but meaningfully distinct questions about the same research question. The linear model quantifies the expected number of additional physically unhealthy days associated with each predictor, producing coefficients that are directly interpretable in clinically meaningful units. For example, knowing that Fair/Poor health is associated with 10.07 more unhealthy days per month, or that exercise is associated with 2.13 fewer days, gives concrete, actionable estimates useful for burden quantification and policy planning. The logistic model, by contrast, reframes the question around the probability of crossing a clinically defined threshold of 14 or more days, producing odds ratios that quantify relative risk of belonging to the high-burden group. This is more useful for identifying at-risk individuals and communicating risk in a screening or public health context where the 14-day threshold has programmatic significance.
The two approaches also revealed some differences in which predictors were emphasized. Education reached significance in the linear model but not the logistic model after adjustment, suggesting education influences the continuous distribution of unhealthy days broadly but does not specifically predict crossing the 14-day threshold once other factors are controlled. The models also differ in their evaluation frameworks: the linear model is assessed using Adjusted R² (0.345), which measures the proportion of variance in unhealthy days explained, while the logistic model is evaluated using AUC (0.856) for discrimination and the Hosmer-Lemeshow test for calibration — none of which are directly comparable across the two frameworks. The linear model’s Adjusted R² of 34.5% indicates moderate explanatory power for a continuous outcome, while the logistic model’s AUC of 0.856 indicates excellent ability to distinguish frequent from infrequent physical distress. In general, the linear model is preferred when the full gradient of the outcome is of interest and the distribution is approximately normal, while the logistic model is preferred when a specific threshold defines a meaningful clinical or policy category and the goal is risk classification.
I used Chat GPT as an AI tool throughout this assignment consistent with the course AI policy permitting AI assistance for debugging only. My process was to first attempt all code independently for each section and then consult AI when I encountered specific errors, needed help correcting output that was not rendering correctly, or needed interpretations verified against my actual output values.
Specific instances of AI use: (1) When constructing the best subsets
summary table using leaps::regsubsets(), I initially
received an error related to the nvmax parameter being set
smaller than the number of dummy variables generated by the factor
predictors. I described the error to Chat GPT and asked how to correctly
calculate nvmax for a model with factor variables. It
explained that ncol(model.matrix(mod_max)) - 1 gives the
correct count, which I verified by inspecting
model.matrix(mod_max) manually and confirming the column
count matched. (2) When setting up the pROC::roc() call, I
was unsure of the correct direction argument for a “No/Yes”
factor outcome. Chat GPT explained the direction = "<"
convention meaning lower predictor values correspond to the negative
class, which I verified by confirming the AUC exceeded 0.50 (an AUC
below 0.50 would indicate the direction was reversed). All
interpretations, variable selection justifications, and reflection text
were written independently without AI assistance.
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ResourceSelection_0.3-6 pROC_1.19.0.1 leaps_3.2
## [4] gtsummary_2.5.0 car_3.1-5 carData_3.0-6
## [7] kableExtra_1.4.0 knitr_1.51 broom_1.0.12
## [10] haven_2.5.5 lubridate_1.9.3 forcats_1.0.0
## [13] stringr_1.5.1 dplyr_1.2.1 purrr_1.0.2
## [16] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [19] ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0 lattice_0.22-6
## [5] tzdb_0.4.0 vctrs_0.7.3 tools_4.4.2 generics_0.1.4
## [9] pkgconfig_2.0.3 Matrix_1.7-1 RColorBrewer_1.1-3 S7_0.2.1
## [13] gt_1.3.0 lifecycle_1.0.5 compiler_4.4.2 farver_2.1.2
## [17] textshaping_0.4.0 litedown_0.9 htmltools_0.5.8.1 sass_0.4.10
## [21] yaml_2.3.10 Formula_1.2-5 pillar_1.11.1 jquerylib_0.1.4
## [25] cachem_1.1.0 abind_1.4-8 commonmark_2.0.0 tidyselect_1.2.1
## [29] digest_0.6.37 stringi_1.8.4 fastmap_1.2.0 grid_4.4.2
## [33] cli_3.6.3 magrittr_2.0.3 cards_0.7.1 withr_3.0.2
## [37] scales_1.4.0 backports_1.5.0 timechange_0.3.0 rmarkdown_2.30
## [41] otel_0.2.0 hms_1.1.4 evaluate_1.0.5 viridisLite_0.4.3
## [45] markdown_2.0 rlang_1.1.7 Rcpp_1.1.1 glue_1.8.0
## [49] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0 jsonlite_2.0.0
## [53] R6_2.6.1 systemfonts_1.3.1 fs_1.6.6