## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.39 R6_2.6.1 fastmap_1.2.0 xfun_0.56
## [5] cachem_1.1.0 knitr_1.51 htmltools_0.5.9 rmarkdown_2.30
## [9] lifecycle_1.0.5 cli_3.6.5 sass_0.4.10 jquerylib_0.1.4
## [13] compiler_4.5.3 rstudioapi_0.18.0 tools_4.5.3 evaluate_1.0.5
## [17] bslib_0.9.0 yaml_2.3.12 otel_0.2.0 jsonlite_2.0.0
## [21] rlang_1.1.7
What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)?
# Import the BRFSS 2023 XPT file
# I downloaded the BRFSS 2023 XPT file from the CDC and saved it on my desktop and copied and used as my path for this assignment
brfss_raw <- read_xpt("C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/data/LLCP2023.XPT")
# Report raw dimensions
cat("Raw dataset dimensions:\n")## Raw dataset dimensions:
## Rows: 433323
## Columns: 350
# Variable selection for this work
brfss_selected <- brfss_raw |>
select(
PHYSHLTH,
MENTHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
ADDEPEV3,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`,
GENHLTH,
MARITAL
)
cat("Selected dataset dimensions:\n")## Selected dataset dimensions:
## Rows: 433323
## Columns: 12
Predictor Selection Justification: I selected all 11
available predictors (both continuous: menthlth_days and
bmi; plus binary and categorical variables) to build the
most comprehensive maximum model possible. Mental health days and BMI
are strong a priori predictors of physical health burden based on
existing literature. The demographic, behavioral, and socioeconomic
variables (age, income, education, smoking, general health, marital
status, sex, exercise, and depression) capture the multi-dimensional
determinants of physical health outlined in the social-ecological model.
Including all candidate predictors in the maximum model allows the
formal selection procedures to objectively determine which subset best
predicts the outcome. In our semester long assignments and lab work,
including class lecture you have mentioned and justified most of this
variables hence the need to be used for this work.
brfss_clean <- brfss_selected |>
# Recode outcome: physhlth_days
mutate(
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
)
) |>
# Recode menthlth_days (continuous)
mutate(
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
)
) |>
# Recode bmi (continuous): divide by 100; 9999 -> NA
mutate(
bmi = case_when(
`_BMI5` >= 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
)
) |>
# Recode sex (binary): 1 = "Male", 2 = "Female"
mutate(
sex = factor(
case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
)
) |>
# Recode exercise (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
mutate(
exercise = factor(
case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
EXERANY2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes")
)
) |>
# Recode depression (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
mutate(
depression = factor(
case_when(
ADDEPEV3 == 1 ~ "Yes",
ADDEPEV3 == 2 ~ "No",
ADDEPEV3 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes")
)
) |>
# Recode age_group (categorical): collapse 5-year groups
mutate(
age_group = factor(
case_when(
`_AGEG5YR` %in% 1:3 ~ "18-34",
`_AGEG5YR` %in% 4:6 ~ "35-49",
`_AGEG5YR` %in% 7:9 ~ "50-64",
`_AGEG5YR` %in% 10:13 ~ "65+",
`_AGEG5YR` == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-34", "35-49", "50-64", "65+")
)
) |>
# Recode income (categorical): collapse to 4 groups
mutate(
income = factor(
case_when(
`_INCOMG1` %in% 1:2 ~ "Less than $25K",
`_INCOMG1` %in% 3:4 ~ "$25K-$49K",
`_INCOMG1` %in% 5:6 ~ "$50K-$99K",
`_INCOMG1` == 7 ~ "$100K+",
`_INCOMG1` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
)
) |>
# Recode education (categorical)
mutate(
education = factor(
case_when(
EDUCA %in% 1:3 ~ "Less than HS",
EDUCA == 4 ~ "HS/GED",
EDUCA == 5 ~ "Some college",
EDUCA == 6 ~ "College graduate",
EDUCA == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
)
) |>
# Recode smoking (categorical): collapse to 3 groups
mutate(
smoking = factor(
case_when(
`_SMOKER3` %in% 1:2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never",
`_SMOKER3` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Never", "Current", "Former")
)
) |>
# Recode gen_health (categorical): collapse to 3 groups
mutate(
gen_health = factor(
case_when(
GENHLTH %in% 1:2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH %in% 4:5 ~ "Fair/Poor",
GENHLTH %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Excellent/Very Good", "Good", "Fair/Poor")
)
) |>
# Recode marital (categorical): collapse to 3 groups
mutate(
marital = factor(
case_when(
MARITAL %in% c(1, 6) ~ "Married/Partnered",
MARITAL %in% 2:4 ~ "Previously married",
MARITAL == 5 ~ "Never married",
MARITAL == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Married/Partnered", "Previously married", "Never married")
)
) |>
# Select only recoded variables
select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital)## Factor levels (reference = first level):
## sex: Male Female
## exercise: No Yes
## depression: No Yes
## age_group: 18-34 35-49 50-64 65+
## income: Less than $25K $25K-$49K $50K-$99K $100K+
## education: Less than HS HS/GED Some college College graduate
## smoking: Never Current Former
## gen_health: Excellent/Very Good Good Fair/Poor
## marital: Married/Partnered Previously married Never married
# Report missingness before removing missing data
cat("=== Missing Data Report (Before Removal) ===\n\n")## === Missing Data Report (Before Removal) ===
miss_report <- brfss_clean |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") |>
mutate(
Total = nrow(brfss_clean),
Pct_Missing = round(N_Missing / Total * 100, 1)
)
miss_report |>
kable(
caption = "Missing Data Summary Before Listwise Deletion",
col.names = c("Variable", "N Missing", "Total N", "% Missing"),
align = c("l", "r", "r", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Variable | N Missing | Total N | % Missing |
|---|---|---|---|
| physhlth_days | 10785 | 433323 | 2.5 |
| menthlth_days | 8108 | 433323 | 1.9 |
| bmi | 40535 | 433323 | 9.4 |
| sex | 0 | 433323 | 0.0 |
| exercise | 1251 | 433323 | 0.3 |
| depression | 2587 | 433323 | 0.6 |
| age_group | 7779 | 433323 | 1.8 |
| income | 86623 | 433323 | 20.0 |
| education | 2325 | 433323 | 0.5 |
| smoking | 23062 | 433323 | 5.3 |
| gen_health | 1262 | 433323 | 0.3 |
| marital | 4289 | 433323 | 1.0 |
# Remove missing values and draw sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
cat("Final analytic sample size:", nrow(brfss_analytic), "\n")## Final analytic sample size: 8000
## Number of variables: 12
# Descriptive summary table
brfss_analytic |>
tbl_summary(
label = list(
physhlth_days ~ "Physically Unhealthy Days (past 30)",
menthlth_days ~ "Mentally Unhealthy Days (past 30)",
bmi ~ "Body Mass Index",
sex ~ "Sex Assigned at Birth",
exercise ~ "Any Exercise (past 30 days)",
depression ~ "Ever Told Depressive Disorder",
age_group ~ "Age Group",
income ~ "Annual Household Income",
education ~ "Education Level",
smoking ~ "Smoking Status",
gen_health ~ "General Health Status",
marital ~ "Marital Status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd}); Median: {median} [{p25}, {p75}]",
all_categorical() ~ "{n} ({p}%)"
),
digits = list(all_continuous() ~ 1),
missing = "no"
) |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**")| Characteristic | N = 8,0001 |
|---|---|
| Physically Unhealthy Days (past 30) | 4.3 (8.7); Median: 0.0 [0.0, 4.0] |
| Mentally Unhealthy Days (past 30) | 4.3 (8.2); Median: 0.0 [0.0, 5.0] |
| Body Mass Index | 28.7 (6.5); Median: 27.5 [24.3, 31.8] |
| Sex Assigned at Birth | |
| Male | 3,971 (50%) |
| Female | 4,029 (50%) |
| Any Exercise (past 30 days) | 6,157 (77%) |
| Ever Told Depressive Disorder | 1,776 (22%) |
| Age Group | |
| 18-34 | 1,294 (16%) |
| 35-49 | 1,657 (21%) |
| 50-64 | 2,109 (26%) |
| 65+ | 2,940 (37%) |
| Annual Household Income | |
| Less than $25K | 1,090 (14%) |
| $25K-$49K | 1,931 (24%) |
| $50K-$99K | 4,297 (54%) |
| $100K+ | 682 (8.5%) |
| Education Level | |
| Less than HS | 391 (4.9%) |
| HS/GED | 1,887 (24%) |
| Some college | 2,115 (26%) |
| College graduate | 3,607 (45%) |
| Smoking Status | |
| Never | 4,808 (60%) |
| Current | 962 (12%) |
| Former | 2,230 (28%) |
| General Health Status | |
| Excellent/Very Good | 3,926 (49%) |
| Good | 2,648 (33%) |
| Fair/Poor | 1,426 (18%) |
| Marital Status | |
| Married/Partnered | 4,582 (57%) |
| Previously married | 2,050 (26%) |
| Never married | 1,368 (17%) |
| 1 Mean (SD); Median: Median [Q1, Q3]; n (%) | |
# Fit the maximum linear regression model with all 11 predictors
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic
)
# Display model summary
summary(mod_max)##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health +
## marital, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20132 0.62746 3.508 0.000453 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## bmi -0.01877 0.01287 -1.459 0.144638
## sexFemale 0.23449 0.16389 1.431 0.152538
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## income$25K-$49K -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50K-$99K -1.52385 0.27607 -5.520 3.50e-08 ***
## income$100K+ -1.31580 0.39821 -3.304 0.000956 ***
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## smokingCurrent 0.46414 0.26617 1.744 0.081241 .
## smokingFormer 0.43777 0.18776 2.332 0.019749 *
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
# Report model fit criteria
r2 <- summary(mod_max)$r.squared
adj_r2 <- summary(mod_max)$adj.r.squared
aic <- AIC(mod_max)
bic <- BIC(mod_max)
tibble(
Criterion = c("R-squared", "Adjusted R-squared", "AIC", "BIC"),
Value = round(c(r2, adj_r2, aic, bic), 3)
) |>
kable(
caption = "Model Fit Criteria for Maximum Linear Regression Model",
align = c("l", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Criterion | Value |
|---|---|
| R-squared | 0.326 |
| Adjusted R-squared | 0.324 |
| AIC | 54115.053 |
| BIC | 54268.772 |
Interpretation: The maximum model with all 11 predictors explains 32.6% of the variance in physically unhealthy days (R² = 0.326). The Adjusted R² of 0.324 penalizes for model complexity and suggests that the included predictors collectively contribute meaningful explanatory power beyond chance. The AIC of 5.41151^{4} and BIC of 5.42688^{4} serve as baselines against which to compare more parsimonious models; lower values indicate better fit relative to complexity.
# Run best subsets regression
# Note: nvmax set to number of parameters (dummy variables) in max model
# regsubsets() treats each dummy variable as a separate predictor
best_sub <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
nvmax = 25, # generous upper bound to capture all dummy terms
really.big = TRUE
)
best_sub_summary <- summary(best_sub)# Summary table of Adjusted R-squared and BIC by model size
best_sub_df <- tibble(
`Model Size` = seq_along(best_sub_summary$adjr2),
`Adjusted R-sq` = round(best_sub_summary$adjr2, 4),
`BIC` = round(best_sub_summary$bic, 2)
) |>
mutate(
`Best Adj R-sq` = ifelse(`Adjusted R-sq` == max(`Adjusted R-sq`), "★", ""),
`Best BIC` = ifelse(`BIC` == min(`BIC`), "★", "")
)
best_sub_df |>
kable(
caption = "Best Subsets Regression: Adjusted R-squared and BIC by Model Size",
align = c("c", "r", "r", "c", "c")
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) |>
row_spec(which(best_sub_summary$adjr2 == max(best_sub_summary$adjr2)),
bold = TRUE, color = "white", background = "#2196F3") |>
row_spec(which(best_sub_summary$bic == min(best_sub_summary$bic)),
bold = TRUE, color = "white", background = "#4CAF50")| Model Size | Adjusted R-sq | BIC | Best Adj R-sq | Best BIC |
|---|---|---|---|---|
| 1 | 0.2627 | -2420.70 | ||
| 2 | 0.2956 | -2778.12 | ||
| 3 | 0.3074 | -2905.97 | ||
| 4 | 0.3133 | -2966.10 | ||
| 5 | 0.3160 | -2989.73 | ||
| 6 | 0.3183 | -3007.98 | ||
| 7 | 0.3197 | -3016.83 | ||
| 8 | 0.3208 | -3021.26 | ||
| 9 | 0.3215 | -3021.91 | ★ | |
| 10 | 0.3221 | -3021.17 | ||
| 11 | 0.3225 | -3018.29 | ||
| 12 | 0.3231 | -3016.99 | ||
| 13 | 0.3235 | -3013.96 | ||
| 14 | 0.3237 | -3008.54 | ||
| 15 | 0.3239 | -3002.54 | ||
| 16 | 0.3240 | -2995.63 | ||
| 17 | 0.3241 | -2988.57 | ||
| 18 | 0.3241 | -2981.41 | ||
| 19 | 0.3242 | -2974.06 | ★ | |
| 20 | 0.3242 | -2965.61 | ★ |
# Plot Adjusted R-squared and BIC across model sizes
par(mfrow = c(1, 2))
# Adjusted R-squared plot
plot(
seq_along(best_sub_summary$adjr2),
best_sub_summary$adjr2,
type = "b", pch = 19, col = "steelblue",
xlab = "Number of Predictors",
ylab = "Adjusted R-squared",
main = "Best Subsets: Adjusted R-squared"
)
best_adjr2_size <- which.max(best_sub_summary$adjr2)
points(best_adjr2_size, best_sub_summary$adjr2[best_adjr2_size],
col = "red", pch = 8, cex = 2)
# BIC plot
plot(
seq_along(best_sub_summary$bic),
best_sub_summary$bic,
type = "b", pch = 19, col = "darkgreen",
xlab = "Number of Predictors",
ylab = "BIC",
main = "Best Subsets: BIC"
)
best_bic_size <- which.min(best_sub_summary$bic)
points(best_bic_size, best_sub_summary$bic[best_bic_size],
col = "red", pch = 8, cex = 2)## Model size maximizing Adjusted R-squared: 19
## Adjusted R-squared value: 0.3242
## Model size minimizing BIC: 9
## BIC value: -3021.91
# Show which variables are in the best models
cat("Variables in best Adj-R2 model (size",
which.max(best_sub_summary$adjr2), "):\n")## Variables in best Adj-R2 model (size 19 ):
## (Intercept) menthlth_days bmi
## 2.42326705 0.18384599 -0.01869124
## sexFemale exerciseYes depressionYes
## 0.23712114 -1.92576399 0.77637429
## age_group35-49 age_group50-64 age_group65+
## 0.76294842 1.39939197 1.73028710
## income$25K-$49K income$50K-$99K income$100K+
## -1.07158786 -1.50068492 -1.29329261
## educationSome college educationCollege graduate smokingCurrent
## 0.75452784 0.80071745 0.46323888
## smokingFormer gen_healthGood gen_healthFair/Poor
## 0.43731142 1.40757016 10.05466396
## maritalPreviously married maritalNever married
## -0.26397878 -0.40454371
##
## Variables in best BIC model (size 9 ):
## (Intercept) menthlth_days exerciseYes depressionYes
## 1.4036767 0.1881763 -1.8794631 0.8954395
## age_group35-49 age_group50-64 age_group65+ income$50K-$99K
## 1.0035144 1.6583700 2.0548414 -0.5080191
## gen_healthGood gen_healthFair/Poor
## 1.3542065 10.0640869
Interpretation: The two criteria identify different optimal model sizes. Adjusted R-squared is maximized at a larger model size of 19 predictors, indicating that additional parameters continue to improve fit (after penalty) at that level. BIC, which applies a more severe penalty for model complexity (especially with N = 8,000), favors a more parsimonious solution at 9 predictors. BIC therefore favors a simpler model, which is consistent with its mathematical design for larger sample sizes.
Important Limitation: regsubsets()
treats each dummy variable from a factor as a separate predictor. This
means some levels of a categorical variable can be included while others
are excluded — which is statistically incoherent and violates the
principle of marginality. For example, only the “Fair/Poor” level of
gen_health might enter without the “Good” level. The
stepwise methods in Step 3 handle factors as complete units and should
be considered the primary selection approach.
# Null model (intercept only) for forward/stepwise starting point
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)## === BACKWARD ELIMINATION ===
## Start: AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health + marital
##
## Df Sum of Sq RSS AIC
## - marital 2 180 403789 31410
## <none> 403609 31410
## - sex 1 104 403712 31410
## - bmi 1 108 403716 31410
## - smoking 2 347 403956 31413
## - depression 1 656 404265 31421
## - education 3 876 404485 31421
## - income 3 1550 405158 31435
## - age_group 3 2235 405844 31448
## - exercise 1 4537 408146 31497
## - menthlth_days 1 13230 416839 31666
## - gen_health 2 84670 488279 32930
##
## Step: AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## - bmi 1 98 403887 31410
## <none> 403789 31410
## - sex 1 102 403891 31410
## - smoking 2 345 404134 31412
## - depression 1 637 404427 31420
## - education 3 875 404665 31421
## - income 3 1388 405177 31431
## - age_group 3 3048 406837 31464
## - exercise 1 4538 408327 31497
## - menthlth_days 1 13195 416984 31665
## - gen_health 2 84707 488496 32929
##
## Step: AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
##
## Final model variables (backward):
## [1] "(Intercept)" "menthlth_days"
## [3] "sexFemale" "exerciseYes"
## [5] "depressionYes" "age_group35-49"
## [7] "age_group50-64" "age_group65+"
## [9] "income$25K-$49K" "income$50K-$99K"
## [11] "income$100K+" "educationHS/GED"
## [13] "educationSome college" "educationCollege graduate"
## [15] "smokingCurrent" "smokingFormer"
## [17] "gen_healthGood" "gen_healthFair/Poor"
## === FORWARD SELECTION ===
mod_forward <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward",
trace = 1
)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940.1 416725 31632
## + exercise 1 6466.7 428198 31849
## + depression 1 6070.1 428595 31857
## + income 3 3231.0 431434 31913
## + smoking 2 1776.3 432889 31938
## + age_group 3 1535.7 433129 31945
## + sex 1 1254.2 433411 31946
## + marital 2 830.3 433835 31956
## + education 3 415.5 434250 31965
## <none> 434665 31967
## + bmi 1 0.2 434665 31969
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5820.8 410904 31521
## + age_group 3 4661.5 412064 31548
## + income 3 1987.0 414738 31600
## + marital 2 1402.5 415322 31609
## + smoking 2 1034.6 415690 31616
## + depression 1 738.1 415987 31620
## + sex 1 605.2 416120 31622
## + education 3 318.9 416406 31632
## <none> 416725 31632
## + bmi 1 3.6 416721 31634
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720.5 407184 31455
## + income 3 1198.4 409706 31504
## + marital 2 1064.8 409839 31505
## + depression 1 739.1 410165 31509
## + smoking 2 706.2 410198 31512
## + education 3 686.9 410217 31514
## + sex 1 449.3 410455 31515
## <none> 410904 31521
## + bmi 1 82.9 410821 31522
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1162.69 406021 31438
## + depression 1 932.70 406251 31438
## + education 3 501.69 406682 31451
## + sex 1 261.03 406923 31452
## + smoking 2 360.51 406823 31452
## <none> 407184 31455
## + bmi 1 83.59 407100 31455
## + marital 2 40.11 407144 31458
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 861.57 405159 31423
## + education 3 953.62 405067 31425
## + smoking 2 305.02 405716 31436
## + sex 1 203.01 405818 31436
## <none> 406021 31438
## + bmi 1 74.91 405946 31438
## + marital 2 153.70 405867 31439
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837.21 404322 31412
## + smoking 2 259.11 404900 31422
## + sex 1 110.16 405049 31423
## + bmi 1 105.12 405054 31423
## <none> 405159 31423
## + marital 2 169.72 404990 31423
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332.18 403990 31410
## + bmi 1 110.19 404212 31412
## <none> 404322 31412
## + sex 1 77.87 404244 31413
## + marital 2 168.26 404154 31413
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 102.92 403887 31410
## <none> 403990 31410
## + bmi 1 98.80 403891 31410
## + marital 2 169.54 403820 31410
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## + bmi 1 97.861 403789 31410
## + marital 2 170.601 403716 31410
##
## Final model variables (forward):
## [1] "(Intercept)" "gen_healthGood"
## [3] "gen_healthFair/Poor" "menthlth_days"
## [5] "exerciseYes" "age_group35-49"
## [7] "age_group50-64" "age_group65+"
## [9] "income$25K-$49K" "income$50K-$99K"
## [11] "income$100K+" "depressionYes"
## [13] "educationHS/GED" "educationSome college"
## [15] "educationCollege graduate" "smokingCurrent"
## [17] "smokingFormer" "sexFemale"
## === STEPWISE SELECTION ===
mod_stepwise <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both",
trace = 1
)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940 416725 31632
## + exercise 1 6467 428198 31849
## + depression 1 6070 428595 31857
## + income 3 3231 431434 31913
## + smoking 2 1776 432889 31938
## + age_group 3 1536 433129 31945
## + sex 1 1254 433411 31946
## + marital 2 830 433835 31956
## + education 3 416 434250 31965
## <none> 434665 31967
## + bmi 1 0 434665 31969
## - gen_health 2 164022 598687 34524
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5821 410904 31521
## + age_group 3 4661 412064 31548
## + income 3 1987 414738 31600
## + marital 2 1402 415322 31609
## + smoking 2 1035 415690 31616
## + depression 1 738 415987 31620
## + sex 1 605 416120 31622
## + education 3 319 416406 31632
## <none> 416725 31632
## + bmi 1 4 416721 31634
## - menthlth_days 1 17940 434665 31967
## - gen_health 2 121660 538384 33677
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720 407184 31455
## + income 3 1198 409706 31504
## + marital 2 1065 409839 31505
## + depression 1 739 410165 31509
## + smoking 2 706 410198 31512
## + education 3 687 410217 31514
## + sex 1 449 410455 31515
## <none> 410904 31521
## + bmi 1 83 410821 31522
## - exercise 1 5821 416725 31632
## - menthlth_days 1 17294 428198 31849
## - gen_health 2 101805 512709 33288
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1163 406021 31438
## + depression 1 933 406251 31438
## + education 3 502 406682 31451
## + sex 1 261 406923 31451
## + smoking 2 361 406823 31451
## <none> 407184 31455
## + bmi 1 84 407100 31455
## + marital 2 40 407144 31458
## - age_group 3 3720 410904 31521
## - exercise 1 4880 412064 31548
## - menthlth_days 1 19957 427141 31835
## - gen_health 2 94875 502059 33126
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 862 405159 31423
## + education 3 954 405067 31425
## + smoking 2 305 405716 31436
## + sex 1 203 405818 31436
## <none> 406021 31438
## + bmi 1 75 405946 31438
## + marital 2 154 405867 31439
## - income 3 1163 407184 31455
## - age_group 3 3685 409706 31504
## - exercise 1 4214 410235 31518
## - menthlth_days 1 18945 424966 31801
## - gen_health 2 87509 493530 32995
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837 404322 31412
## + smoking 2 259 404900 31422
## + sex 1 110 405049 31423
## + bmi 1 105 405054 31423
## <none> 405159 31423
## + marital 2 170 404990 31423
## - depression 1 862 406021 31438
## - income 3 1092 406251 31438
## - age_group 3 3871 409030 31493
## - exercise 1 4219 409378 31504
## - menthlth_days 1 13674 418834 31686
## - gen_health 2 86610 491770 32969
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332 403990 31410
## + bmi 1 110 404212 31412
## <none> 404322 31412
## + sex 1 78 404244 31413
## + marital 2 168 404154 31413
## - education 3 837 405159 31423
## - depression 1 745 405067 31425
## - income 3 1495 405818 31436
## - age_group 3 3558 407880 31476
## - exercise 1 4604 408926 31501
## - menthlth_days 1 13607 417929 31675
## - gen_health 2 86813 491135 32964
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 103 403887 31410
## <none> 403990 31410
## + bmi 1 99 403891 31410
## + marital 2 170 403820 31410
## - smoking 2 332 404322 31412
## - depression 1 691 404681 31421
## - education 3 910 404900 31422
## - income 3 1457 405447 31432
## - age_group 3 3178 407168 31466
## - exercise 1 4502 408492 31496
## - menthlth_days 1 13347 417337 31668
## - gen_health 2 85552 489542 32942
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## + bmi 1 98 403789 31410
## + marital 2 171 403716 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
##
## Final model variables (stepwise):
## [1] "(Intercept)" "gen_healthGood"
## [3] "gen_healthFair/Poor" "menthlth_days"
## [5] "exerciseYes" "age_group35-49"
## [7] "age_group50-64" "age_group65+"
## [9] "income$25K-$49K" "income$50K-$99K"
## [11] "income$100K+" "depressionYes"
## [13] "educationHS/GED" "educationSome college"
## [15] "educationCollege graduate" "smokingCurrent"
## [17] "smokingFormer" "sexFemale"
# Compare final variables across methods
backward_vars <- names(coef(mod_backward))[-1] # exclude intercept
forward_vars <- names(coef(mod_forward))[-1]
stepwise_vars <- names(coef(mod_stepwise))[-1]
# Get all unique predictor terms (from max model)
all_terms <- names(coef(mod_max))[-1]
comparison_df <- tibble(
Term = all_terms,
Backward = ifelse(all_terms %in% backward_vars, "✓", "✗"),
Forward = ifelse(all_terms %in% forward_vars, "✓", "✗"),
Stepwise = ifelse(all_terms %in% stepwise_vars, "✓", "✗")
)
comparison_df |>
kable(
caption = "Variable Inclusion Across Stepwise Selection Methods",
align = c("l", "c", "c", "c")
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Term | Backward | Forward | Stepwise |
|---|---|---|---|
| menthlth_days | ✓ | ✓ | ✓ |
| bmi | ✗ | ✗ | ✗ |
| sexFemale | ✓ | ✓ | ✓ |
| exerciseYes | ✓ | ✓ | ✓ |
| depressionYes | ✓ | ✓ | ✓ |
| age_group35-49 | ✓ | ✓ | ✓ |
| age_group50-64 | ✓ | ✓ | ✓ |
| age_group65+ | ✓ | ✓ | ✓ |
| income$25K-$49K | ✓ | ✓ | ✓ |
| income$50K-$99K | ✓ | ✓ | ✓ |
| income$100K+ | ✓ | ✓ | ✓ |
| educationHS/GED | ✓ | ✓ | ✓ |
| educationSome college | ✓ | ✓ | ✓ |
| educationCollege graduate | ✓ | ✓ | ✓ |
| smokingCurrent | ✓ | ✓ | ✓ |
| smokingFormer | ✓ | ✓ | ✓ |
| gen_healthGood | ✓ | ✓ | ✓ |
| gen_healthFair/Poor | ✓ | ✓ | ✓ |
| maritalPreviously married | ✗ | ✗ | ✗ |
| maritalNever married | ✗ | ✗ | ✗ |
# AIC comparison across final models
aic_compare <- tibble(
Method = c("Maximum Model", "Backward", "Forward", "Stepwise"),
Predictors = c(length(names(coef(mod_max))),
length(names(coef(mod_backward))),
length(names(coef(mod_forward))),
length(names(coef(mod_stepwise)))),
AIC = round(c(AIC(mod_max), AIC(mod_backward),
AIC(mod_forward), AIC(mod_stepwise)), 2),
AdjR2 = round(c(summary(mod_max)$adj.r.squared,
summary(mod_backward)$adj.r.squared,
summary(mod_forward)$adj.r.squared,
summary(mod_stepwise)$adj.r.squared), 4)
)
aic_compare |>
kable(
caption = "Comparison of Model Fit Criteria Across Selection Methods",
col.names = c("Method", "# Parameters", "AIC", "Adjusted R²"),
align = c("l", "c", "r", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Method | # Parameters | AIC | Adjusted R² |
|---|---|---|---|
| Maximum Model | 21 | 54115.05 | 0.3242 |
| Backward | 18 | 54114.57 | 0.3239 |
| Forward | 18 | 54114.57 | 0.3239 |
| Stepwise | 18 | 54114.57 | 0.3239 |
Comparison of Stepwise Methods: All three selection methods — backward elimination, forward selection, and stepwise — converge on the same final model, which includes all predictors from the maximum model with no variables removed. This agreement across methods provides strong evidence that each predictor contributes meaningfully to predicting physically unhealthy days. The consistency across methods is not surprising given the large sample size (N = 8,000), which provides sufficient power to detect even modest associations; AIC-based selection therefore retains variables that would be penalized out in smaller samples. No variables were excluded by any method, indicating that all 11 candidate predictors (mental health days, BMI, sex, exercise, depression, age group, income, education, smoking, general health, and marital status) each carry predictive signal above and beyond the others.
Final model selection: All three stepwise procedures retained the same predictors as the maximum model, confirming that the full 11-predictor model is optimal under AIC. While BIC favored a smaller model in best subsets (penalizing more heavily for sample size), the stepwise methods, which appropriately handle categorical variables as complete units, uniformly supported retaining all predictors. Given the consistency of the stepwise results, the theoretical plausibility of all included variables, and the acceptable Adjusted R², the full model with all 11 predictors is selected as the final linear regression model.
# Final model = maximum model (all stepwise methods agreed)
mod_final <- mod_max
# Display coefficient table
tidy(mod_final, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 3))) |>
kable(
caption = "Final Linear Regression Model: Coefficient Table (N = 8,000)",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
"p-value", "95% CI Lower", "95% CI Upper"),
align = c("l", rep("r", 6))
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) |>
scroll_box(width = "100%")| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 2.201 | 0.627 | 3.508 | 0.000 | 0.971 | 3.431 |
| menthlth_days | 0.184 | 0.011 | 16.172 | 0.000 | 0.161 | 0.206 |
| bmi | -0.019 | 0.013 | -1.459 | 0.145 | -0.044 | 0.006 |
| sexFemale | 0.234 | 0.164 | 1.431 | 0.153 | -0.087 | 0.556 |
| exerciseYes | -1.933 | 0.204 | -9.471 | 0.000 | -2.333 | -1.533 |
| depressionYes | 0.775 | 0.215 | 3.602 | 0.000 | 0.353 | 1.197 |
| age_group35-49 | 0.769 | 0.284 | 2.712 | 0.007 | 0.213 | 1.325 |
| age_group50-64 | 1.401 | 0.279 | 5.026 | 0.000 | 0.855 | 1.947 |
| age_group65+ | 1.727 | 0.279 | 6.202 | 0.000 | 1.181 | 2.273 |
| income$25K-$49K | -1.084 | 0.276 | -3.925 | 0.000 | -1.626 | -0.543 |
| income$50K-$99K | -1.524 | 0.276 | -5.520 | 0.000 | -2.065 | -0.983 |
| income$100K+ | -1.316 | 0.398 | -3.304 | 0.001 | -2.096 | -0.535 |
| educationHS/GED | 0.294 | 0.401 | 0.733 | 0.464 | -0.492 | 1.080 |
| educationSome college | 1.002 | 0.403 | 2.488 | 0.013 | 0.213 | 1.791 |
| educationCollege graduate | 1.053 | 0.404 | 2.603 | 0.009 | 0.260 | 1.845 |
| smokingCurrent | 0.464 | 0.266 | 1.744 | 0.081 | -0.058 | 0.986 |
| smokingFormer | 0.438 | 0.188 | 2.332 | 0.020 | 0.070 | 0.806 |
| gen_healthGood | 1.410 | 0.186 | 7.565 | 0.000 | 1.045 | 1.775 |
| gen_healthFair/Poor | 10.065 | 0.252 | 39.879 | 0.000 | 9.570 | 10.560 |
| maritalPreviously married | -0.267 | 0.207 | -1.291 | 0.197 | -0.672 | 0.138 |
| maritalNever married | -0.412 | 0.249 | -1.656 | 0.098 | -0.901 | 0.076 |
Interpretation of Selected Coefficients:
Mental Health Days (continuous): Holding all other variables constant, each additional mentally unhealthy day in the past 30 is associated with an increase of approximately 0.18 physically unhealthy days, on average. This positive association is highly statistically significant (p < 0.001), indicating a strong co-occurrence of mental and physical health burden.
General Health — Fair/Poor vs. Excellent/Very Good (categorical): Holding all other variables constant, adults who rate their general health as Fair or Poor report, on average, 10.1 more physically unhealthy days per month compared to those rating their health as Excellent or Very Good (the reference category). This is the largest single coefficient in the model, underscoring that self-rated health captures substantial variance in physical health burden.
Exercise — Yes vs. No (binary): Holding all other variables constant, adults who engaged in any exercise in the past 30 days report approximately 1.93 fewer physically unhealthy days per month compared to those who did not exercise. This negative association is consistent with the established role of physical activity in reducing pain and physical impairment.
par(mfrow = c(2, 2))
plot(mod_final, which = 1:4,
sub.caption = "Diagnostic Plots: Final Linear Regression Model")Interpretation of Diagnostic Plots:
Residuals vs. Fitted: There is a clear non-linear pattern in this plot, with residuals showing a fan-shaped spread that widens at higher fitted values. This suggests both non-linearity and heteroscedasticity (non-constant variance), which is a common finding with count-like outcome variables bounded at zero.
Normal Q-Q: The residuals deviate substantially from the diagonal reference line, particularly in the right tail, indicating a right-skewed residual distribution. This is expected given that physically unhealthy days is a zero-inflated, bounded count variable that violates the normality assumption.
Scale-Location: The smoothed line is not flat, it increases across fitted values, confirming that residual variance grows with the mean (heteroscedasticity). The assumption of equal variance (homoscedasticity) appears violated.
Residuals vs. Leverage: Most observations cluster near the origin with low leverage and small residuals. A small number of points have higher Cook’s distance but remain below the conventional threshold of 0.5, suggesting no single observation is unduly driving the model estimates.
Overall Conclusion on LINE Assumptions: The LINE assumptions are not fully satisfied for this model. Linearity is approximately met for the continuous predictors, but the residuals exhibit clear non-normality and heteroscedasticity, both attributable to the bounded, zero-inflated nature of the outcome (physically unhealthy days). Independence is reasonable given the survey design (with the caveat that BRFSS uses complex cluster sampling). These violations suggest that the linear model is a reasonable first approximation but that alternative approaches (e.g., Poisson regression, zero-inflated models, or log-transformation of the outcome) might better fit the data structure. However, with N = 8,000 the Central Limit Theorem provides some protection for inference on coefficients.
# Create frequent_distress: 1 if physhlth_days >= 14, 0 otherwise
brfss_analytic <- brfss_analytic |>
mutate(
frequent_distress = factor(
ifelse(physhlth_days >= 14, "Yes", "No"),
levels = c("No", "Yes") # "No" = reference level
)
)
# Report prevalence
distress_table <- brfss_analytic |>
count(frequent_distress) |>
mutate(Percentage = round(n / sum(n) * 100, 1))
distress_table |>
kable(
caption = "Prevalence of Frequent Physical Distress (≥14 days in past 30)",
col.names = c("Frequent Physical Distress", "N", "Percentage (%)"),
align = c("l", "r", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Frequent Physical Distress | N | Percentage (%) |
|---|---|---|
| No | 6948 | 86.9 |
| Yes | 1052 | 13.2 |
Comment on outcome balance: The outcome is moderately imbalanced: approximately 13.2% of the analytic sample meets the CDC’s frequent physical distress threshold of 14 or more days, compared to 86.9% who do not. This level of imbalance (roughly 13.2% prevalence) is common in population-based health surveys and does not require special correction methods such as oversampling; standard logistic regression is appropriate.
Predictor chosen: gen_health (general
health status). I chose this predictor because the linear regression
results showed it had the largest coefficient magnitude among all
predictors, indicating the strongest unadjusted association with
physically unhealthy days. Individuals who rate their health as Fair or
Poor are likely to report high physical distress, making it a
theoretically and empirically strong predictor of the binary threshold
outcome.
# Fit simple logistic regression
mod_simple <- glm(
frequent_distress ~ gen_health,
data = brfss_analytic,
family = binomial
)
summary(mod_simple)##
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial,
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.39831 0.09021 -37.67 <2e-16 ***
## gen_healthGood 1.14179 0.11198 10.20 <2e-16 ***
## gen_healthFair/Poor 3.28880 0.10465 31.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4754.1 on 7997 degrees of freedom
## AIC: 4760.1
##
## Number of Fisher Scoring iterations: 6
# Odds ratios and 95% CIs
or_simple <- exp(coef(mod_simple))
ci_simple <- exp(confint(mod_simple))
tibble(
Term = names(or_simple),
`Odds Ratio` = round(or_simple, 3),
`95% CI Lower` = round(ci_simple[, 1], 3),
`95% CI Upper` = round(ci_simple[, 2], 3)
) |>
kable(
caption = "Simple Logistic Regression: Odds Ratios for General Health Status",
align = c("l", "r", "r", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Odds Ratio | 95% CI Lower | 95% CI Upper |
|---|---|---|---|
| (Intercept) | 0.033 | 0.028 | 0.040 |
| gen_healthGood | 3.132 | 2.521 | 3.911 |
| gen_healthFair/Poor | 26.811 | 21.915 | 33.038 |
Interpretation:
Good vs. Excellent/Very Good: Compared to adults who rate their general health as Excellent or Very Good (reference), those who rate it as Good have 3.13 times the odds of frequent physical distress (95% CI: [2.52, 3.91]). This association is statistically significant at α = 0.05 because the 95% CI excludes 1.0 and the Wald test p-value is < 0.001.
Fair/Poor vs. Excellent/Very Good: Compared to adults rating their health as Excellent or Very Good, those rating it as Fair or Poor have 26.81 times the odds of frequent physical distress (95% CI: [21.91, 33.04]). This is a strikingly large odds ratio confirming that poor self-rated health is strongly associated with frequent physical distress. This association is statistically significant at α = 0.05 (95% CI does not include 1.0; p < 0.001).
# Fit multiple logistic regression using same predictors as final linear model
mod_logistic <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
family = binomial
)
summary(mod_logistic)##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex +
## exercise + depression + age_group + income + education +
## smoking + gen_health + marital, family = binomial, data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.639413 0.291867 -12.469 < 2e-16 ***
## menthlth_days 0.044787 0.004358 10.277 < 2e-16 ***
## bmi -0.004822 0.005556 -0.868 0.38546
## sexFemale 0.176389 0.081382 2.167 0.03020 *
## exerciseYes -0.586264 0.085768 -6.835 8.17e-12 ***
## depressionYes 0.267888 0.095954 2.792 0.00524 **
## age_group35-49 0.512672 0.162401 3.157 0.00160 **
## age_group50-64 0.767269 0.154816 4.956 7.20e-07 ***
## age_group65+ 0.964882 0.155220 6.216 5.09e-10 ***
## income$25K-$49K -0.215375 0.111712 -1.928 0.05386 .
## income$50K-$99K -0.501752 0.118036 -4.251 2.13e-05 ***
## income$100K+ -0.541456 0.227454 -2.381 0.01729 *
## educationHS/GED 0.049334 0.165069 0.299 0.76504
## educationSome college 0.331979 0.165058 2.011 0.04430 *
## educationCollege graduate 0.283640 0.171335 1.655 0.09783 .
## smokingCurrent 0.208048 0.116374 1.788 0.07381 .
## smokingFormer 0.200388 0.089714 2.234 0.02551 *
## gen_healthGood 0.895044 0.116270 7.698 1.38e-14 ***
## gen_healthFair/Poor 2.696530 0.115309 23.385 < 2e-16 ***
## maritalPreviously married -0.138611 0.096205 -1.441 0.14965
## maritalNever married -0.143542 0.127789 -1.123 0.26132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4420.7 on 7979 degrees of freedom
## AIC: 4462.7
##
## Number of Fisher Scoring iterations: 6
# Adjusted odds ratios with 95% CIs
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 3))) |>
kable(
caption = "Multiple Logistic Regression: Adjusted Odds Ratios (N = 8,000)",
col.names = c("Term", "Adj. OR", "Std. Error", "z-statistic",
"p-value", "95% CI Lower", "95% CI Upper"),
align = c("l", rep("r", 6))
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) |>
scroll_box(width = "100%")| Term | Adj. OR | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.026 | 0.292 | -12.469 | 0.000 | 0.015 | 0.046 |
| menthlth_days | 1.046 | 0.004 | 10.277 | 0.000 | 1.037 | 1.055 |
| bmi | 0.995 | 0.006 | -0.868 | 0.385 | 0.984 | 1.006 |
| sexFemale | 1.193 | 0.081 | 2.167 | 0.030 | 1.017 | 1.400 |
| exerciseYes | 0.556 | 0.086 | -6.835 | 0.000 | 0.470 | 0.658 |
| depressionYes | 1.307 | 0.096 | 2.792 | 0.005 | 1.082 | 1.576 |
| age_group35-49 | 1.670 | 0.162 | 3.157 | 0.002 | 1.217 | 2.302 |
| age_group50-64 | 2.154 | 0.155 | 4.956 | 0.000 | 1.596 | 2.929 |
| age_group65+ | 2.624 | 0.155 | 6.216 | 0.000 | 1.944 | 3.573 |
| income$25K-$49K | 0.806 | 0.112 | -1.928 | 0.054 | 0.648 | 1.004 |
| income$50K-$99K | 0.605 | 0.118 | -4.251 | 0.000 | 0.481 | 0.763 |
| income$100K+ | 0.582 | 0.227 | -2.381 | 0.017 | 0.368 | 0.899 |
| educationHS/GED | 1.051 | 0.165 | 0.299 | 0.765 | 0.762 | 1.456 |
| educationSome college | 1.394 | 0.165 | 2.011 | 0.044 | 1.011 | 1.932 |
| educationCollege graduate | 1.328 | 0.171 | 1.655 | 0.098 | 0.952 | 1.864 |
| smokingCurrent | 1.231 | 0.116 | 1.788 | 0.074 | 0.979 | 1.545 |
| smokingFormer | 1.222 | 0.090 | 2.234 | 0.026 | 1.024 | 1.456 |
| gen_healthGood | 2.447 | 0.116 | 7.698 | 0.000 | 1.952 | 3.081 |
| gen_healthFair/Poor | 14.828 | 0.115 | 23.385 | 0.000 | 11.863 | 18.648 |
| maritalPreviously married | 0.871 | 0.096 | -1.441 | 0.150 | 0.720 | 1.050 |
| maritalNever married | 0.866 | 0.128 | -1.123 | 0.261 | 0.673 | 1.111 |
Interpretation of Adjusted Odds Ratios:
Mental Health Days (continuous): Holding all other variables constant, each additional mentally unhealthy day in the past 30 is associated with 1.046 times the odds of frequent physical distress (95% CI: [1.037, 1.055]). Each additional day of mental distress slightly but significantly increases the odds of crossing the physical distress threshold.
General Health — Fair/Poor vs. Excellent/Very Good (categorical): After adjusting for all other predictors, adults rating their health as Fair or Poor have 14.83 times the odds of frequent physical distress compared to those rating it as Excellent or Very Good. This remains the strongest predictor in the adjusted model, with a 95% CI of [11.86, 18.65].
Depression — Yes vs. No (binary): Holding all other variables constant, adults who have ever been told they have a depressive disorder have 1.31 times the odds of frequent physical distress compared to those without a depression history (95% CI: [1.08, 1.58]). This confirms the well-documented bidirectional relationship between depression and physical health symptoms even after adjusting for demographic and behavioral confounders.
Testing the income group of predictors: I will
collectively test all income level dummy variables (i.e.,
income$25K-$49K, income$50K-$99K, and
income$100K+) to determine whether socioeconomic status, as
captured by income, significantly improves the logistic model beyond the
other predictors.
Hypotheses: - H₀: The income predictors do not improve model fit; the coefficients for all three income levels are simultaneously equal to zero (β_$25K-$49K = β_$50K-$99K = β_$100K+ = 0). - H₁: At least one income level coefficient is not equal to zero; income collectively improves model fit.
# Fit reduced model excluding income
mod_reduced <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + education + smoking + gen_health + marital,
data = brfss_analytic,
family = binomial
)
# Likelihood ratio test
lrt_result <- anova(mod_reduced, mod_logistic, test = "Chisq")
print(lrt_result)## Analysis of Deviance Table
##
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + education + smoking + gen_health + marital
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health + marital
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7982 4439.7
## 2 7979 4420.7 3 19.082 0.0002629 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Report LRT results
tibble(
Model = c("Reduced (no income)", "Full (with income)"),
Deviance = round(c(mod_reduced$deviance, mod_logistic$deviance), 2),
df = c(mod_reduced$df.residual, mod_logistic$df.residual)
) |>
kable(
caption = "Model Deviances for Likelihood Ratio Test",
align = c("l", "r", "r")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | Deviance | df |
|---|---|---|
| Reduced (no income) | 4439.74 | 7982 |
| Full (with income) | 4420.66 | 7979 |
## LRT Chi-square statistic: 19.082
## Degrees of freedom: 3
## p-value: 0.0002629
Conclusion: The likelihood ratio test yields a chi-square statistic of 19.08 with 3 degrees of freedom (corresponding to the 3 income dummy variables). The p-value is 0.000263. At α = 0.05, we reject the null hypothesis and conclude that the income predictors significantly improve the model fit beyond the other predictors.
# Generate ROC curve
roc_obj <- roc(
brfss_analytic$frequent_distress,
fitted(mod_logistic),
levels = c("No", "Yes"),
direction = "<"
)
# Plot ROC curve
plot(
roc_obj,
main = "ROC Curve: Frequent Physical Distress Model",
col = "steelblue",
lwd = 2,
print.auc = TRUE,
auc.polygon = TRUE,
auc.polygon.col = "lightblue",
print.auc.y = 0.45
)
abline(a = 0, b = 1, lty = 2, col = "gray50")# Report AUC with 95% CI
auc_val <- auc(roc_obj)
auc_ci <- ci.auc(roc_obj)
cat("AUC:", round(auc_val, 4), "\n")## AUC: 0.8555
## 95% CI: 0.8426 – 0.8683
Interpretation: The Area Under the ROC Curve (AUC) is 0.855 (95% CI: 0.843–0.868). Based on the discrimination guide provided, this value falls in the excellent (0.80–0.90) range. This means the model has excellent ability to discriminate between individuals who do and do not experience frequent physical distress: if we randomly selected one person with frequent distress and one without, the model would correctly rank the distressed person as having a higher predicted probability approximately 85.5% of the time (compared to 50% for a useless model). The combination of self-rated health, mental health days, depression history, and sociodemographic variables therefore provides substantial discriminative capacity for identifying frequent physical distress.
Hypotheses: - H₀: The logistic regression model fits the data well (observed and expected event frequencies are similar across deciles of predicted probability). - H₁: The logistic regression model does not fit the data well (there is significant discrepancy between observed and expected frequencies across deciles).
# Hosmer-Lemeshow goodness-of-fit test (g = 10 groups)
hl_test <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
print(hl_test)##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945
# Observed vs. expected
hl_obs_exp <- cbind(hl_test$observed, hl_test$expected)
colnames(hl_obs_exp) <- c("Obs. No", "Obs. Yes", "Exp. No", "Exp. Yes")
hl_obs_exp |>
as.data.frame() |>
rownames_to_column("Decile") |>
mutate(across(where(is.numeric), ~ round(., 1))) |>
kable(
caption = "Hosmer-Lemeshow Test: Observed vs. Expected by Predicted Probability Decile",
align = c("l", rep("r", 4))
) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Decile | Obs. No | Obs. Yes | Exp. No | Exp. Yes |
|---|---|---|---|---|
| [0.00675,0.0185] | 786 | 14 | 789.0 | 11.0 |
| (0.0185,0.025] | 784 | 16 | 782.6 | 17.4 |
| (0.025,0.0305] | 785 | 15 | 778.1 | 21.9 |
| (0.0305,0.0378] | 777 | 23 | 773.1 | 26.9 |
| (0.0378,0.0507] | 765 | 35 | 764.9 | 35.1 |
| (0.0507,0.0679] | 753 | 47 | 752.7 | 47.3 |
| (0.0679,0.0946] | 728 | 72 | 736.1 | 63.9 |
| (0.0946,0.187] | 692 | 108 | 695.8 | 104.2 |
| (0.187,0.425] | 546 | 254 | 552.1 | 247.9 |
| (0.425,0.893] | 332 | 468 | 323.6 | 476.4 |
##
## Hosmer-Lemeshow Test Summary:
## Chi-square statistic: 5.577
## Degrees of freedom: 8
## p-value: 0.6945
Interpretation: The Hosmer-Lemeshow test yields a chi-square statistic of 5.58 with 8 degrees of freedom. The p-value is 0.694. At α = 0.05, we fail to reject the null hypothesis, indicating no significant evidence of poor model fit.
Complementarity with ROC/AUC: The ROC/AUC and Hosmer-Lemeshow test evaluate different and complementary aspects of logistic model performance. The AUC measures discrimination — the model’s ability to rank individuals by their predicted probability — and does not depend on the calibration of predicted probabilities. The Hosmer-Lemeshow test assesses calibration — whether the predicted probabilities are well-aligned with observed event rates across the spectrum of risk. A model can have excellent AUC but poor calibration (if it consistently over- or under-estimates risk in certain subgroups), or vice versa. The combination of excellent discrimination (high AUC) and adequate calibration (non-significant HL test) provides a more complete picture of overall model performance than either metric alone.
The results of both the linear and logistic regression models paint a consistent picture of the multidimensional drivers of physical health burden among U.S. adults. General health status emerged as the single strongest predictor in both models: adults who rate their health as Fair or Poor report dramatically more physically unhealthy days and have markedly higher odds of frequent physical distress, even after adjusting for all other variables in the model. This likely reflects a genuine construct validity relationship, self-rated health captures accumulated chronic illness experience that the other variables only partially measure. Mental health days was the second strongest predictor in both frameworks, underscoring the well-established bidirectional linkage between mental and physical health; each additional mentally unhealthy day incrementally increased both the count of physical health days and the odds of crossing the distress threshold. Depression history was also consistently significant, reinforcing these findings.
Several predictors were significant across both models, including exercise (protective), income (gradient of worsening health with lower income), and age group (older adults reporting greater burden). A notable limitation of this analysis is its cross-sectional design: we cannot establish temporal ordering or causal direction between any predictor and the outcome. Among the most important unmeasured confounders are (1) chronic disease burden (e.g., diabetes, arthritis, cardiovascular disease), which drives both physical symptoms and poor self-rated health and would substantially confound associations with all sociodemographic predictors; and (2) access to healthcare and insurance status, which shapes both the experience of physical symptoms and one’s capacity to manage them, and which correlates with income, education, and employment status already in the model.
The linear regression model quantifies how many more physically unhealthy days are associated with each predictor, providing a clinically intuitive metric on the original scale (days per month). This is valuable for estimating the population-level burden: knowing that Fair/Poor self-rated health is associated with approximately 10.1 additional physically unhealthy days per month is directly actionable for public health planning. However, linear regression assumes a continuous, normally distributed outcome — an assumption clearly violated by a zero-inflated, bounded count variable.
Logistic regression reframes the question as a classification problem: does the individual cross the CDC’s clinical threshold for frequent physical distress? This binary framing has direct policy relevance because programs such as Medicaid care management use exactly such thresholds to identify high-need patients. The odds ratio scale, while less intuitive than the day count, allows for probabilistic interpretation and risk ranking across heterogeneous populations. The AUC of 0.855 provides an overall measure of the model’s discriminative value that has no equivalent in linear regression; conversely, R² (0.326 in the linear model) quantifies explained variance — a metric with no natural logistic equivalent. In clinical or screening contexts where the goal is to identify high-risk individuals, logistic regression is preferred; when the goal is to quantify the magnitude of association or predict the expected number of days for resource planning purposes, linear regression (or an appropriate count model) is more suitable.
I used google to prove some of the interpretations, this was to confirm my suspicions and make more understanding of what I am working on.
End of Assignment