library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)## Rows in raw dataset: 433323
## Columns in raw dataset: 350
brfss_clean <- brfss_selected |>
mutate(
# Outcome: physically unhealthy days
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
# Mental health days
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Sex
sex = factor(case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_
), levels = c("Male", "Female")),
# Exercise
exercise = factor(case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# Depression
depression = factor(case_when(
ADDEPEV3 == 1 ~ "Yes",
ADDEPEV3 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# Age group
age_group = factor(case_when(
`_AGEG5YR` %in% 1:3 ~ "18-34",
`_AGEG5YR` %in% 4:6 ~ "35-49",
`_AGEG5YR` %in% 7:9 ~ "50-64",
`_AGEG5YR` %in% 10:13 ~ "65+",
TRUE ~ NA_character_
), levels = c("18-34", "35-49", "50-64", "65+")),
# Income
income = factor(case_when(
`_INCOMG1` %in% 1:2 ~ "Less than $25K",
`_INCOMG1` %in% 3:4 ~ "$25K-$49K",
`_INCOMG1` %in% 5:6 ~ "$50K-$99K",
`_INCOMG1` == 7 ~ "$100K+",
TRUE ~ NA_character_
), levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
# Education
education = factor(case_when(
EDUCA %in% 1:3 ~ "Less than HS",
EDUCA == 4 ~ "HS/GED",
EDUCA == 5 ~ "Some college",
EDUCA == 6 ~ "College graduate",
TRUE ~ NA_character_
), levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
# Smoking
smoking = factor(case_when(
`_SMOKER3` %in% 1:2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never",
TRUE ~ NA_character_
), levels = c("Never", "Current", "Former")),
# General health
gen_health = factor(case_when(
GENHLTH %in% 1:2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH %in% 4:5 ~ "Fair/Poor",
TRUE ~ NA_character_
), levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
# Marital status
marital = factor(case_when(
MARITAL %in% c(1, 6) ~ "Married/Partnered",
MARITAL %in% 2:4 ~ "Previously married",
MARITAL == 5 ~ "Never married",
TRUE ~ NA_character_
), levels = c("Married/Partnered", "Previously married", "Never married"))
)tibble(
Variable = c("physhlth_days", "menthlth_days", "bmi",
"depression", "smoking", "gen_health"),
`N Missing` = c(
sum(is.na(brfss_clean$physhlth_days)),
sum(is.na(brfss_clean$menthlth_days)),
sum(is.na(brfss_clean$bmi)),
sum(is.na(brfss_clean$depression)),
sum(is.na(brfss_clean$smoking)),
sum(is.na(brfss_clean$gen_health))
),
`% Missing` = round(c(
mean(is.na(brfss_clean$physhlth_days)),
mean(is.na(brfss_clean$menthlth_days)),
mean(is.na(brfss_clean$bmi)),
mean(is.na(brfss_clean$depression)),
mean(is.na(brfss_clean$smoking)),
mean(is.na(brfss_clean$gen_health))
) * 100, 2)
) |>
kable(caption = "Table 0. Missing Values Before Exclusions") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Variable | N Missing | % Missing |
|---|---|---|
| physhlth_days | 10785 | 2.49 |
| menthlth_days | 8108 | 1.87 |
| bmi | 40535 | 9.35 |
| depression | 2587 | 0.60 |
| smoking | 23062 | 5.32 |
| gen_health | 1262 | 0.29 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
cat("Final analytic n:", nrow(brfss_analytic), "\n")## Final analytic n: 8000
brfss_analytic |>
dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital) |>
tbl_summary(
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
label = list(
physhlth_days ~ "Physically unhealthy days (past 30)",
menthlth_days ~ "Mentally unhealthy days (past 30)",
bmi ~ "BMI",
sex ~ "Sex",
exercise ~ "Exercise (past 30 days)",
depression ~ "Ever depressive disorder",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education level",
smoking ~ "Smoking status",
gen_health ~ "General health status",
marital ~ "Marital status"
),
missing = "no"
) |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Statistics -- BRFSS 2023 (n = 8,000)**")| Characteristic | N = 8,0001 |
|---|---|
| Physically unhealthy days (past 30) | 4 (9) |
| Mentally unhealthy days (past 30) | 4 (8) |
| BMI | 29 (7) |
| Sex | |
| Male | 3,971 (50%) |
| Female | 4,029 (50%) |
| Exercise (past 30 days) | 6,157 (77%) |
| Ever depressive disorder | 1,776 (22%) |
| Age group | |
| 18-34 | 1,294 (16%) |
| 35-49 | 1,657 (21%) |
| 50-64 | 2,109 (26%) |
| 65+ | 2,940 (37%) |
| Annual household income | |
| Less than $25K | 1,090 (14%) |
| $25K-$49K | 1,931 (24%) |
| $50K-$99K | 4,297 (54%) |
| $100K+ | 682 (8.5%) |
| Education level | |
| Less than HS | 391 (4.9%) |
| HS/GED | 1,887 (24%) |
| Some college | 2,115 (26%) |
| College graduate | 3,607 (45%) |
| Smoking status | |
| Never | 4,808 (60%) |
| Current | 962 (12%) |
| Former | 2,230 (28%) |
| General health status | |
| Excellent/Very Good | 3,926 (49%) |
| Good | 2,648 (33%) |
| Fair/Poor | 1,426 (18%) |
| Marital status | |
| Married/Partnered | 4,582 (57%) |
| Previously married | 2,050 (26%) |
| Never married | 1,368 (17%) |
| 1 Mean (SD); n (%) | |
Predictor justification: Ten predictors were selected based on their known associations with physical health burden in population surveys. Mental health days, depression, and general health status capture the mind-body connection and overall health perception. BMI, exercise, and smoking represent behavioral and physiological risk factors. Age group, income, education, and marital status capture social and demographic determinants of physical health.
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic
)
summary(mod_max)##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health +
## marital, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20132 0.62746 3.508 0.000453 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## bmi -0.01877 0.01287 -1.459 0.144638
## sexFemale 0.23449 0.16389 1.431 0.152538
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## income$25K-$49K -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50K-$99K -1.52385 0.27607 -5.520 3.50e-08 ***
## income$100K+ -1.31580 0.39821 -3.304 0.000956 ***
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## smokingCurrent 0.46414 0.26617 1.744 0.081241 .
## smokingFormer 0.43777 0.18776 2.332 0.019749 *
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
glance(mod_max) |>
dplyr::select(r.squared, adj.r.squared, sigma, statistic, p.value, df.residual) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Table 2. Maximum Model Fit Statistics",
col.names = c("R-squared", "Adj. R-squared", "RSE", "F-statistic", "p-value", "Residual df")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| R-squared | Adj. R-squared | RSE | F-statistic | p-value | Residual df |
|---|---|---|---|---|---|
| 0.3258 | 0.3242 | 7.1122 | 192.8264 | 0 | 7979 |
## AIC: 54115.05
## BIC: 54268.77
r2_max <- round(summary(mod_max)$r.squared, 4)
adj_r2_max <- round(summary(mod_max)$adj.r.squared, 4)Interpretation: R-squared = 0.3258, meaning the maximum model explains 32.58% of the variance in physically unhealthy days. Adjusted R-squared = 0.3242 is slightly lower due to the penalty for including 11 predictors (with multiple dummy variables). AIC and BIC serve as baselines for comparing simpler models. Lower values are preferred. The highly significant overall F-test confirms that the model as a whole explains a statistically significant amount of variance.
best_sub <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
nvmax = 25,
method = "exhaustive"
)
best_sum <- summary(best_sub)
subset_metrics <- tibble(
p = 1:length(best_sum$adjr2),
`Adj. R2` = best_sum$adjr2,
BIC = best_sum$bic
)
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R2`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_sum$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R2 by Model Size", x = "Number of Variables", y = "Adjusted R2") +
theme_minimal(base_size = 12)
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_sum$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(p1, p2, ncol = 2)## Model size maximizing Adj. R2: 19
## Model size minimizing BIC: 9
Note regsubsets() treats each dummy
variable from a factor as a separate predictor. It may include some
levels of a categorical variable while excluding others. This can
produce models that are difficult to interpret and violates the
convention of including or excluding an entire factor together. The
stepwise methods in Step 3 handle factors as complete units, which is
more appropriate for inference.
Interpretation: Adjusted R2 reaches its maximum at 19 variables. Afterwards, additional predictors fail to meaningfully improve fit. BIC minimizes at 9 variables. It favors a more stingy model. BIC penalizes complexity more heavily (log(n) vs. 2 per parameter in AIC), so it prefers the simpler model when the additional parameters provide only small gains in fit.
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
# Backward elimination
mod_backward <- step(mod_max, direction = "backward", trace = 1)## Start: AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health + marital
##
## Df Sum of Sq RSS AIC
## - marital 2 180 403789 31410
## <none> 403609 31410
## - sex 1 104 403712 31410
## - bmi 1 108 403716 31410
## - smoking 2 347 403956 31413
## - depression 1 656 404265 31421
## - education 3 876 404485 31421
## - income 3 1550 405158 31435
## - age_group 3 2235 405844 31448
## - exercise 1 4537 408146 31497
## - menthlth_days 1 13230 416839 31666
## - gen_health 2 84670 488279 32930
##
## Step: AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## - bmi 1 98 403887 31410
## <none> 403789 31410
## - sex 1 102 403891 31410
## - smoking 2 345 404134 31412
## - depression 1 637 404427 31420
## - education 3 875 404665 31421
## - income 3 1388 405177 31431
## - age_group 3 3048 406837 31464
## - exercise 1 4538 408327 31497
## - menthlth_days 1 13195 416984 31665
## - gen_health 2 84707 488496 32929
##
## Step: AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
# Forward selection
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward",
trace = 1)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940.1 416725 31632
## + exercise 1 6466.7 428198 31849
## + depression 1 6070.1 428595 31857
## + income 3 3231.0 431434 31913
## + smoking 2 1776.3 432889 31938
## + age_group 3 1535.7 433129 31945
## + sex 1 1254.2 433411 31946
## + marital 2 830.3 433835 31956
## + education 3 415.5 434250 31965
## <none> 434665 31967
## + bmi 1 0.2 434665 31969
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5820.8 410904 31521
## + age_group 3 4661.5 412064 31548
## + income 3 1987.0 414738 31600
## + marital 2 1402.5 415322 31609
## + smoking 2 1034.6 415690 31616
## + depression 1 738.1 415987 31620
## + sex 1 605.2 416120 31622
## + education 3 318.9 416406 31632
## <none> 416725 31632
## + bmi 1 3.6 416721 31634
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720.5 407184 31455
## + income 3 1198.4 409706 31504
## + marital 2 1064.8 409839 31505
## + depression 1 739.1 410165 31509
## + smoking 2 706.2 410198 31512
## + education 3 686.9 410217 31514
## + sex 1 449.3 410455 31515
## <none> 410904 31521
## + bmi 1 82.9 410821 31522
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1162.69 406021 31438
## + depression 1 932.70 406251 31438
## + education 3 501.69 406682 31451
## + sex 1 261.03 406923 31452
## + smoking 2 360.51 406823 31452
## <none> 407184 31455
## + bmi 1 83.59 407100 31455
## + marital 2 40.11 407144 31458
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 861.57 405159 31423
## + education 3 953.62 405067 31425
## + smoking 2 305.02 405716 31436
## + sex 1 203.01 405818 31436
## <none> 406021 31438
## + bmi 1 74.91 405946 31438
## + marital 2 153.70 405867 31439
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837.21 404322 31412
## + smoking 2 259.11 404900 31422
## + sex 1 110.16 405049 31423
## + bmi 1 105.12 405054 31423
## <none> 405159 31423
## + marital 2 169.72 404990 31423
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332.18 403990 31410
## + bmi 1 110.19 404212 31412
## <none> 404322 31412
## + sex 1 77.87 404244 31413
## + marital 2 168.26 404154 31413
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 102.92 403887 31410
## <none> 403990 31410
## + bmi 1 98.80 403891 31410
## + marital 2 169.54 403820 31410
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## + bmi 1 97.861 403789 31410
## + marital 2 170.601 403716 31410
# Stepwise
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both",
trace = 1)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940 416725 31632
## + exercise 1 6467 428198 31849
## + depression 1 6070 428595 31857
## + income 3 3231 431434 31913
## + smoking 2 1776 432889 31938
## + age_group 3 1536 433129 31945
## + sex 1 1254 433411 31946
## + marital 2 830 433835 31956
## + education 3 416 434250 31965
## <none> 434665 31967
## + bmi 1 0 434665 31969
## - gen_health 2 164022 598687 34524
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5821 410904 31521
## + age_group 3 4661 412064 31548
## + income 3 1987 414738 31600
## + marital 2 1402 415322 31609
## + smoking 2 1035 415690 31616
## + depression 1 738 415987 31620
## + sex 1 605 416120 31622
## + education 3 319 416406 31632
## <none> 416725 31632
## + bmi 1 4 416721 31634
## - menthlth_days 1 17940 434665 31967
## - gen_health 2 121660 538384 33677
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720 407184 31455
## + income 3 1198 409706 31504
## + marital 2 1065 409839 31505
## + depression 1 739 410165 31509
## + smoking 2 706 410198 31512
## + education 3 687 410217 31514
## + sex 1 449 410455 31515
## <none> 410904 31521
## + bmi 1 83 410821 31522
## - exercise 1 5821 416725 31632
## - menthlth_days 1 17294 428198 31849
## - gen_health 2 101805 512709 33288
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1163 406021 31438
## + depression 1 933 406251 31438
## + education 3 502 406682 31451
## + sex 1 261 406923 31451
## + smoking 2 361 406823 31451
## <none> 407184 31455
## + bmi 1 84 407100 31455
## + marital 2 40 407144 31458
## - age_group 3 3720 410904 31521
## - exercise 1 4880 412064 31548
## - menthlth_days 1 19957 427141 31835
## - gen_health 2 94875 502059 33126
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 862 405159 31423
## + education 3 954 405067 31425
## + smoking 2 305 405716 31436
## + sex 1 203 405818 31436
## <none> 406021 31438
## + bmi 1 75 405946 31438
## + marital 2 154 405867 31439
## - income 3 1163 407184 31455
## - age_group 3 3685 409706 31504
## - exercise 1 4214 410235 31518
## - menthlth_days 1 18945 424966 31801
## - gen_health 2 87509 493530 32995
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837 404322 31412
## + smoking 2 259 404900 31422
## + sex 1 110 405049 31423
## + bmi 1 105 405054 31423
## <none> 405159 31423
## + marital 2 170 404990 31423
## - depression 1 862 406021 31438
## - income 3 1092 406251 31438
## - age_group 3 3871 409030 31493
## - exercise 1 4219 409378 31504
## - menthlth_days 1 13674 418834 31686
## - gen_health 2 86610 491770 32969
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332 403990 31410
## + bmi 1 110 404212 31412
## <none> 404322 31412
## + sex 1 78 404244 31413
## + marital 2 168 404154 31413
## - education 3 837 405159 31423
## - depression 1 745 405067 31425
## - income 3 1495 405818 31436
## - age_group 3 3558 407880 31476
## - exercise 1 4604 408926 31501
## - menthlth_days 1 13607 417929 31675
## - gen_health 2 86813 491135 32964
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 103 403887 31410
## <none> 403990 31410
## + bmi 1 99 403891 31410
## + marital 2 170 403820 31410
## - smoking 2 332 404322 31412
## - depression 1 691 404681 31421
## - education 3 910 404900 31422
## - income 3 1457 405447 31432
## - age_group 3 3178 407168 31466
## - exercise 1 4502 408492 31496
## - menthlth_days 1 13347 417337 31668
## - gen_health 2 85552 489542 32942
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## + bmi 1 98 403789 31410
## + marital 2 171 403716 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
tibble(
Method = c("Backward", "Forward", "Stepwise"),
Variables = c(
paste(names(coef(mod_backward))[-1], collapse = ", "),
paste(names(coef(mod_forward))[-1], collapse = ", "),
paste(names(coef(mod_stepwise))[-1], collapse = ", ")
),
`Adj. R2` = c(
round(glance(mod_backward)$adj.r.squared, 4),
round(glance(mod_forward)$adj.r.squared, 4),
round(glance(mod_stepwise)$adj.r.squared, 4)
),
AIC = c(round(AIC(mod_backward), 1),
round(AIC(mod_forward), 1),
round(AIC(mod_stepwise), 1)),
BIC = c(round(BIC(mod_backward), 1),
round(BIC(mod_forward), 1),
round(BIC(mod_stepwise), 1))
) |>
kable(caption = "Table 3. Stepwise Selection Results Comparison") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Method | Variables | Adj. R2 | AIC | BIC |
|---|---|---|---|---|
| Backward | menthlth_days, sexFemale, exerciseYes, depressionYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, gen_healthGood, gen_healthFair/Poor | 0.3239 | 54114.6 | 54247.3 |
| Forward | gen_healthGood, gen_healthFair/Poor, menthlth_days, exerciseYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, depressionYes, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, sexFemale | 0.3239 | 54114.6 | 54247.3 |
| Stepwise | gen_healthGood, gen_healthFair/Poor, menthlth_days, exerciseYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, depressionYes, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, sexFemale | 0.3239 | 54114.6 | 54247.3 |
Comparison: All three methods meet on the same final model with the same Adjusted R2, AIC, and BIC, which indicates strong agreement. This convergence gives confidence that the selected predictors are the genuinely informative ones. Variables excluded by all three methods contribute very little explanatory power and are dropped. Variables retained by all three (including menthlth_days, gen_health, depression, and exercise) are the most important predictors of physically unhealthy days.
mod_final <- mod_backward
tidy(mod_final, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Table 4. Final Linear Model Coefficients",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
"p-value", "95% CI Lower", "95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3853 | 0.4874 | 2.8425 | 0.0045 | 0.4300 | 2.3407 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| income$25K-$49K | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50K-$99K | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100K+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| smokingCurrent | 0.4776 | 0.2647 | 1.8038 | 0.0713 | -0.0414 | 0.9965 |
| smokingFormer | 0.4399 | 0.1876 | 2.3442 | 0.0191 | 0.0720 | 0.8077 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
Choice justification: The final model is the result of backward elimination which meets on the same solution. This model is preferred over the maximum model because it drops predictors that did not improve fit while maintaining very similar Adjusted R2. It is also preferred over the BIC-best model because it handles factors as complete units.
Coefficient interpretations:
menthlth_days: Each additional mentally unhealthy day in the past 30 days is associated with approximately 0.184 more physically unhealthy days on average, holding all other variables constant. This is one of the strongest continuous predictors which reflects the bidirectional relationship between mental and physical health.
exerciseYes: Compared to individuals who did not exercise in the past 30 days (the reference group), those who exercised report approximately 1.905 fewer physically unhealthy days on average, holding all other variables constant. The negative sign confirms exercise as independently protective against physical health burden.
gen_healthFair/Poor: Compared to individuals reporting Excellent/Very Good general health (the reference group), those reporting Fair/Poor general health report approximately 10.001 more physically unhealthy days on average, holding all other variables constant. This is the largest coefficient in the model which highlights the strong alignment between self-rated health and physical health burden.
Figure 1. Diagnostic Plots for the Final Linear Model
Residuals vs. Fitted: The residuals show a slight fan pattern with greater spread at higher fitted values –> mild heteroscedasticity. There is also a visible cluster near zero fitted values corresponding to the large number of respondents reporting zero physically unhealthy days, which introduces non-linearity in this region.
Normal Q-Q Plot: The upper tail deviates from the diagonal reference line, indicating right-skewed residuals. This is expected given the large number of zero and near-zero responses. The right skew of the outcome itself carries through to the residuals.
Scale-Location Plot: The smoother shows an upward trend, confirming that residual variance increases with fitted values (heteroscedasticity).
Residuals vs. Leverage: A small number of observations show elevated leverage, but none appear to cross the Cook’s distance = 0.5 threshold. This suggests that no single observation is influencing the regression estimates.
Conclusion: The LINE assumptions are not fully satisfied. The residuals are right-skewed and heteroscedastic. Both are attributable to the zero-inflated, count-like distribution of physically unhealthy days. A log-transformation or a generalized linear model may be considered in a final report. However, the large sample size and the robustness of OLS to moderate assumption violations suggest that the coefficient estimates and their standard errors are valid for descriptive purposes.
brfss_analytic <- brfss_analytic |>
mutate(
frequent_distress = factor(
ifelse(physhlth_days >= 14, "Yes", "No"),
levels = c("No", "Yes")
)
)
brfss_analytic |>
count(frequent_distress) |>
mutate(`%` = round(n / sum(n) * 100, 1)) |>
kable(
caption = "Table 5. Frequent Physical Distress (>= 14 days)",
col.names = c("Frequent Distress", "N", "%")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Frequent Distress | N | % |
|---|---|---|
| No | 6948 | 86.9 |
| Yes | 1052 | 13.2 |
The prevalence of frequent physical distress is 13.2%, meaning approximately 1 in 8 adults in this sample reports 14 or more physically unhealthy days in the past 30 days. The outcome has fewer cases than non-cases.
Mental health days (menthlth_days) is selected as the
predictor for the simple model because it was the strongest continuous
predictor in the final linear model.
mod_simple <- glm(frequent_distress ~ menthlth_days,
data = brfss_analytic,
family = binomial)
summary(mod_simple)##
## Call:
## glm(formula = frequent_distress ~ menthlth_days, family = binomial,
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.346000 0.042986 -54.58 <2e-16 ***
## menthlth_days 0.073236 0.003181 23.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 5734.5 on 7998 degrees of freedom
## AIC: 5738.5
##
## Number of Fisher Scoring iterations: 5
# Odds ratio and 95% CI
or_simple <- round(exp(coef(mod_simple)), 4)
ci_simple <- round(exp(confint(mod_simple)), 4)
tibble(
Term = names(or_simple),
OR = or_simple,
`CI Lower` = ci_simple[, 1],
`CI Upper` = ci_simple[, 2]
) |>
kable(caption = "Table 6. Simple Logistic Regression: OR and 95% CI") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | OR | CI Lower | CI Upper |
|---|---|---|---|
| (Intercept) | 0.0958 | 0.0879 | 0.1041 |
| menthlth_days | 1.0760 | 1.0693 | 1.0827 |
or_val <- or_simple["menthlth_days"]
ci_lo <- ci_simple["menthlth_days", 1]
ci_hi <- ci_simple["menthlth_days", 2]
p_val <- round(tidy(mod_simple)$p.value[2], 4)Interpretation: For every one-day increase in mentally unhealthy days in the past 30 days, the odds of frequent physical distress are multiplied by 1.076 (95% CI: 1.0693 to 1.0827). Since the confidence interval does not include 1.0 and the Wald test p-value is 0 (< 0.001), the association is statistically significant at alpha = 0.05. Each additional mentally unhealthy day is associated with meaningfully higher odds of crossing the 14-day physical distress threshold.
mod_logistic <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
family = binomial
)
summary(mod_logistic)##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex +
## exercise + depression + age_group + income + education +
## smoking + gen_health + marital, family = binomial, data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.639413 0.291867 -12.469 < 2e-16 ***
## menthlth_days 0.044787 0.004358 10.277 < 2e-16 ***
## bmi -0.004822 0.005556 -0.868 0.38546
## sexFemale 0.176389 0.081382 2.167 0.03020 *
## exerciseYes -0.586264 0.085768 -6.835 8.17e-12 ***
## depressionYes 0.267888 0.095954 2.792 0.00524 **
## age_group35-49 0.512672 0.162401 3.157 0.00160 **
## age_group50-64 0.767269 0.154816 4.956 7.20e-07 ***
## age_group65+ 0.964882 0.155220 6.216 5.09e-10 ***
## income$25K-$49K -0.215375 0.111712 -1.928 0.05386 .
## income$50K-$99K -0.501752 0.118036 -4.251 2.13e-05 ***
## income$100K+ -0.541456 0.227454 -2.381 0.01729 *
## educationHS/GED 0.049334 0.165069 0.299 0.76504
## educationSome college 0.331979 0.165058 2.011 0.04430 *
## educationCollege graduate 0.283640 0.171335 1.655 0.09783 .
## smokingCurrent 0.208048 0.116374 1.788 0.07381 .
## smokingFormer 0.200388 0.089714 2.234 0.02551 *
## gen_healthGood 0.895044 0.116270 7.698 1.38e-14 ***
## gen_healthFair/Poor 2.696530 0.115309 23.385 < 2e-16 ***
## maritalPreviously married -0.138611 0.096205 -1.441 0.14965
## maritalNever married -0.143542 0.127789 -1.123 0.26132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4420.7 on 7979 degrees of freedom
## AIC: 4462.7
##
## Number of Fisher Scoring iterations: 6
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Table 7. Multiple Logistic Regression: Adjusted Odds Ratios",
col.names = c("Term", "OR", "Std. Error", "z-statistic",
"p-value", "95% CI Lower", "95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | OR | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.0263 | 0.2919 | -12.4694 | 0.0000 | 0.0148 | 0.0463 |
| menthlth_days | 1.0458 | 0.0044 | 10.2769 | 0.0000 | 1.0369 | 1.0548 |
| bmi | 0.9952 | 0.0056 | -0.8679 | 0.3855 | 0.9844 | 1.0061 |
| sexFemale | 1.1929 | 0.0814 | 2.1674 | 0.0302 | 1.0172 | 1.3995 |
| exerciseYes | 0.5564 | 0.0858 | -6.8355 | 0.0000 | 0.4704 | 0.6585 |
| depressionYes | 1.3072 | 0.0960 | 2.7918 | 0.0052 | 1.0821 | 1.5764 |
| age_group35-49 | 1.6697 | 0.1624 | 3.1568 | 0.0016 | 1.2174 | 2.3020 |
| age_group50-64 | 2.1539 | 0.1548 | 4.9560 | 0.0000 | 1.5956 | 2.9287 |
| age_group65+ | 2.6245 | 0.1552 | 6.2162 | 0.0000 | 1.9437 | 3.5731 |
| income$25K-$49K | 0.8062 | 0.1117 | -1.9279 | 0.0539 | 0.6478 | 1.0039 |
| income$50K-$99K | 0.6055 | 0.1180 | -4.2508 | 0.0000 | 0.4806 | 0.7634 |
| income$100K+ | 0.5819 | 0.2275 | -2.3805 | 0.0173 | 0.3680 | 0.8993 |
| educationHS/GED | 1.0506 | 0.1651 | 0.2989 | 0.7650 | 0.7621 | 1.4562 |
| educationSome college | 1.3937 | 0.1651 | 2.0113 | 0.0443 | 1.0114 | 1.9323 |
| educationCollege graduate | 1.3280 | 0.1713 | 1.6555 | 0.0978 | 0.9520 | 1.8641 |
| smokingCurrent | 1.2313 | 0.1164 | 1.7878 | 0.0738 | 0.9788 | 1.5448 |
| smokingFormer | 1.2219 | 0.0897 | 2.2336 | 0.0255 | 1.0243 | 1.4561 |
| gen_healthGood | 2.4474 | 0.1163 | 7.6980 | 0.0000 | 1.9524 | 3.0809 |
| gen_healthFair/Poor | 14.8282 | 0.1153 | 23.3853 | 0.0000 | 11.8632 | 18.6475 |
| maritalPreviously married | 0.8706 | 0.0962 | -1.4408 | 0.1496 | 0.7204 | 1.0505 |
| maritalNever married | 0.8663 | 0.1278 | -1.1233 | 0.2613 | 0.6730 | 1.1108 |
Interpretation of adjusted ORs:
menthlth_days: Each additional mentally unhealthy day is associated with 1.046 times the odds of frequent physical distress, holding all other variables constant. This is one of the strongest continuous predictors in the multiple model which confirms the pattern seen in the simple model.
exerciseYes: Compared to non-exercisers (reference), those who exercised in the past 30 days have 0.556 times the odds of frequent physical distress, holding all other variables constant. Since this OR is below 1, exercise is independently protective.
depressionYes: Compared to those without a history of depressive disorder (reference), those who were told they have depression have 1.307 times the odds of frequent physical distress, holding all other variables constant. This is a substantially elevated risk that underscores the burden of depression.
# Test smoking as a group (Current vs. Never, Former vs. Never)
mod_logistic_no_smoking <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + gen_health + marital,
data = brfss_analytic,
family = binomial
)
anova(mod_logistic_no_smoking, mod_logistic, test = "Chisq") |>
as.data.frame() |>
rownames_to_column("Model") |>
mutate(
Model = c("Reduced (no smoking)", "Full (+ smoking)"),
across(where(is.numeric), \(x) round(x, 4))
) |>
kable(
caption = "Table 8. Likelihood Ratio Test: Smoking as a Group",
col.names = c("Model", "Res. df", "Residual Dev.", "df", "Dev. diff.", "p-value")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | Res. df | Residual Dev. | df | Dev. diff. | p-value |
|---|---|---|---|---|---|
| Reduced (no smoking) | 7981 | 4426.873 | NA | NA | NA |
| Full (+ smoking) | 7979 | 4420.659 | 2 | 6.2138 | 0.0447 |
H0: The smoking coefficients for Current and Former (vs. Never reference) are both equal to zero. Smoking as a group does not improve prediction of frequent physical distress after adjusting for all other predictors.
HA: At least one smoking coefficient differs from zero. Smoking collectively adds predictive information.
The LR test yielded a deviance difference of 6.214 on 2 degrees of freedom (p = 0.0447). Since p < 0.05, we reject H0 and conclude that smoking status as a group significantly improves the prediction of frequent physical distress after adjusting for all other predictors. The two smoking dummy variables contribute meaningful information that would be lost if smoking were excluded from the model.
roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))
plot(roc_obj,
main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE,
col = "steelblue",
lwd = 2)Figure 2. ROC Curve: Frequent Physical Distress Model
auc_val <- round(auc(roc_obj), 3)
auc_ci <- round(ci.auc(roc_obj), 3)
tibble(
AUC = auc_val,
`95% CI Lower` = auc_ci[1],
`95% CI Upper` = auc_ci[3],
Interpretation = case_when(
auc_val >= 0.9 ~ "Outstanding",
auc_val >= 0.8 ~ "Excellent",
auc_val >= 0.7 ~ "Acceptable",
auc_val >= 0.5 ~ "Poor",
TRUE ~ "No discrimination"
)
) |>
kable(caption = "Table 9. AUC and Discrimination Interpretation") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| AUC | 95% CI Lower | 95% CI Upper | Interpretation |
|---|---|---|---|
| 0.855 | 0.843 | 0.868 | Excellent |
Interpretation: The AUC of 0.855 (95% CI: 0.843 to 0.868) indicates excellent discrimination. The model correctly ranks a randomly selected individual with frequent physical distress wabove a randomly selected individual without it with a probability of 0.855, substantially above the 0.5 chance level.
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945
tibble(
`X-squared` = round(hl_test$statistic, 3),
df = hl_test$parameter,
`p-value` = round(hl_test$p.value, 4)
) |>
kable(caption = "Table 10. Hosmer-Lemeshow Goodness-of-Fit Test") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| X-squared | df | p-value |
|---|---|---|
| 5.577 | 8 | 0.6945 |
H0: The model fits the data well. Predicted and observed frequencies agree across deciles of predicted probability.
HA: The model does not fit the data well. There are significant discrepancies between predicted and observed frequencies.
The Hosmer-Lemeshow test produced X2 = 5.577, df = 8, p = 0.6945. Since p > 0.05, we fail to reject H0 and conclude that the model fits the data well. The predicted and observed FPD proportions agree across the ten deciles of predicted probability. The Hosmer-Lemeshow result builds on the AUC finding. While the AUC of 0.855 confirms that the model discriminates excellently between individuals with and without frequent physical distress, the Hosmer-Lemeshow test confirms that the predicted probabilities are also well-calibrated to the observed rates. Both criteria are satisfied. Therefore, a model that is both accurate in its rankings and honest in its probability estimates.
A. Statistical Insights
The results suggest that mental health burden, general health status, depression history, and exercise are the most consistently strong predictors of physical health burden among U.S. adults. They appear significant across both the linear and logistic models. Mental health days emerged as the strongest continuous predictor in both frameworks. This is consistent with the bidirectional relationship between mental and physical health. Individuals experiencing psychological distress are more likely to report symptoms and physical health limitations. General health status produced the largest coefficient in the linear model. This reflects that self-rated health is a measure that captures physical symptoms directly. Exercise was significant in both models as a protective factor, while depression history increased odds of frequent physical distress substantially in the logistic model. Income and education were significant in the linear model. However, their effects were reduced after adjusting for behavioral and health status variables. The key limitation of cross-sectional BRFSS data is that temporal ordering cannot be established. We cannot determine whether: - poor mental health causes more physical health days - physical illness depresses mood, or - both are driven by a common factor. Two specific unmeasured confounders that could skew these associations are chronic pain conditions and access to primary care.
B. Linear versus Logistic Regression
The linear regression model answers “by how much does physical health burden increase for a one-unit change in each predictor”. It provides a continuous, quantitative estimate of association in days as units. The logistic regression model answers “how do the odds of crossing a clinically meaningful threshold change”. It is more directly actionable for public health surveillance and policy because the CDC threshold of 14+ days has a meaning. Neither model is strictly superior. The linear model preserves the full information in the continuous outcome and avoids the 14-day cutoff. The logistic model is more robust to the distribution violations that afflict linear regression with this outcome type. The model selection criteria also differ. The linear framework uses Adjusted R2, AIC, and BIC to select predictors based on how much variance they explain. However, logistic regression uses the AUC to evaluate how well the model discriminates between cases and non-cases, and the Hosmer-Lemeshow test to evaluate calibration. R2 has no direct equivalent in logistic regression; McFadden’s pseudo-R2 is not comparable to linear R2. A value of 0.10 in logistic regression may indicate the same relative improvement in fit as R2 = 0.30 in linear regression.
C. AI Transparency
As part of my process I encountered some syntax errors. This included missing brackets and “” sign when categorizing the variables(Error: Incomplete expression:). As the chunk of code was pretty large It was a bit of a challenge to find, I used AI to find the missing symbols. I just copied and pasted the whole error: “Error: Incomplete expression: brfss_clean <- brfss_selected |> mutate( # Outcome: physically unhealthy days physhlth_days = case_when( PHYSHLTH == 88 ~ 0, PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH), TRUE ~ NA_real_ ),
# Mental health days
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Sex
sex = factor(case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_
), levels = c("Male", "Female")),
# Exercise
exercise = factor(case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
” Once the I received the output pointing me to the missing ) I corrected my code and let it run again.
```
End of Homework 4