## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.39     R6_2.6.1          fastmap_1.2.0     xfun_0.56        
##  [5] cachem_1.1.0      knitr_1.51        htmltools_0.5.9   rmarkdown_2.30   
##  [9] lifecycle_1.0.5   cli_3.6.5         sass_0.4.10       jquerylib_0.1.4  
## [13] compiler_4.5.3    rstudioapi_0.18.0 tools_4.5.3       evaluate_1.0.5   
## [17] bslib_0.9.0       yaml_2.3.12       otel_0.2.0        jsonlite_2.0.0   
## [21] rlang_1.1.7

1 Part 0: Data Preparation

1.1 Step 1: Load Packages and Import Data

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

1.2 Research Question

What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)?

# Import the BRFSS 2023 XPT file
# I downloaded the BRFSS 2023 XPT file from the CDC and saved it on my desktop and copied and used as my path for this assignment
brfss_raw <- read_xpt("C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/data/LLCP2023.XPT")

# Report raw dimensions
cat("Raw dataset dimensions:\n")

## Raw dataset dimensions:

cat("  Rows:", nrow(brfss_raw), "\n")

##   Rows: 433323

cat("  Columns:", ncol(brfss_raw), "\n")

##   Columns: 350

# Variable selection for this work 
brfss_selected <- brfss_raw |>
  select(
    PHYSHLTH,
    MENTHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    ADDEPEV3,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`,
    GENHLTH,
    MARITAL
  )

cat("Selected dataset dimensions:\n")

## Selected dataset dimensions:

cat("  Rows:", nrow(brfss_selected), "\n")

##   Rows: 433323

cat("  Columns:", ncol(brfss_selected), "\n")

##   Columns: 12

Predictor Selection Justification: I selected all 11 available predictors (both continuous: menthlth_days and bmi; plus binary and categorical variables) to build the most comprehensive maximum model possible. Mental health days and BMI are strong a priori predictors of physical health burden based on existing literature. The demographic, behavioral, and socioeconomic variables (age, income, education, smoking, general health, marital status, sex, exercise, and depression) capture the multi-dimensional determinants of physical health outlined in the social-ecological model. Including all candidate predictors in the maximum model allows the formal selection procedures to objectively determine which subset best predicts the outcome. In our semester long assignments and lab work, including class lecture you have mentioned and justified most of this variables hence the need to be used for this work.

1.3 Step 2: Recode All Variables

brfss_clean <- brfss_selected |>
  # Recode outcome: physhlth_days
  mutate(
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    )
  ) |>
  
  # Recode menthlth_days (continuous)
  mutate(
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    )
  ) |>
  
  # Recode bmi (continuous): divide by 100; 9999 -> NA
  mutate(
    bmi = case_when(
      `_BMI5` >= 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    )
  ) |>
  
  # Recode sex (binary): 1 = "Male", 2 = "Female"
  mutate(
    sex = factor(
      case_when(
        SEXVAR == 1 ~ "Male",
        SEXVAR == 2 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    )
  ) |>
  
  # Recode exercise (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
  mutate(
    exercise = factor(
      case_when(
        EXERANY2 == 1 ~ "Yes",
        EXERANY2 == 2 ~ "No",
        EXERANY2 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    )
  ) |>
  
  # Recode depression (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
  mutate(
    depression = factor(
      case_when(
        ADDEPEV3 == 1 ~ "Yes",
        ADDEPEV3 == 2 ~ "No",
        ADDEPEV3 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    )
  ) |>
  
  # Recode age_group (categorical): collapse 5-year groups
  mutate(
    age_group = factor(
      case_when(
        `_AGEG5YR` %in% 1:3 ~ "18-34",
        `_AGEG5YR` %in% 4:6 ~ "35-49",
        `_AGEG5YR` %in% 7:9 ~ "50-64",
        `_AGEG5YR` %in% 10:13 ~ "65+",
        `_AGEG5YR` == 14 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("18-34", "35-49", "50-64", "65+")
    )
  ) |>
  
  # Recode income (categorical): collapse to 4 groups
  mutate(
    income = factor(
      case_when(
        `_INCOMG1` %in% 1:2 ~ "Less than $25K",
        `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
        `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
        `_INCOMG1` == 7 ~ "$100K+",
        `_INCOMG1` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
    )
  ) |>
  
  # Recode education (categorical)
  mutate(
    education = factor(
      case_when(
        EDUCA %in% 1:3 ~ "Less than HS",
        EDUCA == 4 ~ "HS/GED",
        EDUCA == 5 ~ "Some college",
        EDUCA == 6 ~ "College graduate",
        EDUCA == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
    )
  ) |>
  
  # Recode smoking (categorical): collapse to 3 groups
  mutate(
    smoking = factor(
      case_when(
        `_SMOKER3` %in% 1:2 ~ "Current",
        `_SMOKER3` == 3 ~ "Former",
        `_SMOKER3` == 4 ~ "Never",
        `_SMOKER3` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Never", "Current", "Former")
    )
  ) |>
  
  # Recode gen_health (categorical): collapse to 3 groups
  mutate(
    gen_health = factor(
      case_when(
        GENHLTH %in% 1:2 ~ "Excellent/Very Good",
        GENHLTH == 3 ~ "Good",
        GENHLTH %in% 4:5 ~ "Fair/Poor",
        GENHLTH %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Excellent/Very Good", "Good", "Fair/Poor")
    )
  ) |>
  
  # Recode marital (categorical): collapse to 3 groups
  mutate(
    marital = factor(
      case_when(
        MARITAL %in% c(1, 6) ~ "Married/Partnered",
        MARITAL %in% 2:4 ~ "Previously married",
        MARITAL == 5 ~ "Never married",
        MARITAL == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Married/Partnered", "Previously married", "Never married")
    )
  ) |>
  
  # Select only recoded variables
  select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
         age_group, income, education, smoking, gen_health, marital)

# Verify factor levels and reference levels
cat("Factor levels (reference = first level):\n\n")

## Factor levels (reference = first level):

cat("sex:", levels(brfss_clean$sex), "\n")

## sex: Male Female

cat("exercise:", levels(brfss_clean$exercise), "\n")

## exercise: No Yes

cat("depression:", levels(brfss_clean$depression), "\n")

## depression: No Yes

cat("age_group:", levels(brfss_clean$age_group), "\n")

## age_group: 18-34 35-49 50-64 65+

cat("income:", levels(brfss_clean$income), "\n")

## income: Less than $25K $25K-$49K $50K-$99K $100K+

cat("education:", levels(brfss_clean$education), "\n")

## education: Less than HS HS/GED Some college College graduate

cat("smoking:", levels(brfss_clean$smoking), "\n")

## smoking: Never Current Former

cat("gen_health:", levels(brfss_clean$gen_health), "\n")

## gen_health: Excellent/Very Good Good Fair/Poor

cat("marital:", levels(brfss_clean$marital), "\n")

## marital: Married/Partnered Previously married Never married

1.4 Step 3: Create the Analytic Dataset

# Report missingness before removing missing data
cat("=== Missing Data Report (Before Removal) ===\n\n")

## === Missing Data Report (Before Removal) ===

miss_report <- brfss_clean |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") |>
  mutate(
    Total = nrow(brfss_clean),
    Pct_Missing = round(N_Missing / Total * 100, 1)
  )

miss_report |>
  kable(
    caption = "Missing Data Summary Before Listwise Deletion",
    col.names = c("Variable", "N Missing", "Total N", "% Missing"),
    align = c("l", "r", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Missing Data Summary Before Listwise Deletion
Variable	N Missing	Total N	% Missing
physhlth_days	10785	433323	2.5
menthlth_days	8108	433323	1.9
bmi	40535	433323	9.4
sex	0	433323	0.0
exercise	1251	433323	0.3
depression	2587	433323	0.6
age_group	7779	433323	1.8
income	86623	433323	20.0
education	2325	433323	0.5
smoking	23062	433323	5.3
gen_health	1262	433323	0.3
marital	4289	433323	1.0

# Remove missing values and draw sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

cat("Final analytic sample size:", nrow(brfss_analytic), "\n")

## Final analytic sample size: 8000

cat("Number of variables:", ncol(brfss_analytic), "\n")

## Number of variables: 12

# Descriptive summary table
brfss_analytic |>
  tbl_summary(
    label = list(
      physhlth_days ~ "Physically Unhealthy Days (past 30)",
      menthlth_days ~ "Mentally Unhealthy Days (past 30)",
      bmi ~ "Body Mass Index",
      sex ~ "Sex Assigned at Birth",
      exercise ~ "Any Exercise (past 30 days)",
      depression ~ "Ever Told Depressive Disorder",
      age_group ~ "Age Group",
      income ~ "Annual Household Income",
      education ~ "Education Level",
      smoking ~ "Smoking Status",
      gen_health ~ "General Health Status",
      marital ~ "Marital Status"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd}); Median: {median} [{p25}, {p75}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(all_continuous() ~ 1),
    missing = "no"
  ) |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**")

**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**
Characteristic	N = 8,000¹
Physically Unhealthy Days (past 30)	4.3 (8.7); Median: 0.0 [0.0, 4.0]
Mentally Unhealthy Days (past 30)	4.3 (8.2); Median: 0.0 [0.0, 5.0]
Body Mass Index	28.7 (6.5); Median: 27.5 [24.3, 31.8]
Sex Assigned at Birth
Male	3,971 (50%)
Female	4,029 (50%)
Any Exercise (past 30 days)	6,157 (77%)
Ever Told Depressive Disorder	1,776 (22%)
Age Group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
Annual Household Income
Less than $25K	1,090 (14%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
$100K+	682 (8.5%)
Education Level
Less than HS	391 (4.9%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
College graduate	3,607 (45%)
Smoking Status
Never	4,808 (60%)
Current	962 (12%)
Former	2,230 (28%)
General Health Status
Excellent/Very Good	3,926 (49%)
Good	2,648 (33%)
Fair/Poor	1,426 (18%)
Marital Status
Married/Partnered	4,582 (57%)
Previously married	2,050 (26%)
Never married	1,368 (17%)
¹ Mean (SD); Median: Median [Q1, Q3]; n (%)

2 Part 1: Model Selection for Linear Regression

2.1 Step 1: Fit the Maximum Model

# Fit the maximum linear regression model with all 11 predictors
mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

# Display model summary
summary(mod_max)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

# Report model fit criteria
r2     <- summary(mod_max)$r.squared
adj_r2 <- summary(mod_max)$adj.r.squared
aic    <- AIC(mod_max)
bic    <- BIC(mod_max)

tibble(
  Criterion        = c("R-squared", "Adjusted R-squared", "AIC", "BIC"),
  Value            = round(c(r2, adj_r2, aic, bic), 3)
) |>
  kable(
    caption = "Model Fit Criteria for Maximum Linear Regression Model",
    align = c("l", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model Fit Criteria for Maximum Linear Regression Model
Criterion	Value
R-squared	0.326
Adjusted R-squared	0.324
AIC	54115.053
BIC	54268.772

Interpretation: The maximum model with all 11 predictors explains 32.6% of the variance in physically unhealthy days (R² = 0.326). The Adjusted R² of 0.324 penalizes for model complexity and suggests that the included predictors collectively contribute meaningful explanatory power beyond chance. The AIC of 5.41151^{4} and BIC of 5.42688^{4} serve as baselines against which to compare more parsimonious models; lower values indicate better fit relative to complexity.

2.2 Step 2: Best Subsets Regression

# Run best subsets regression
# Note: nvmax set to number of parameters (dummy variables) in max model
# regsubsets() treats each dummy variable as a separate predictor

best_sub <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  nvmax  = 25,   # generous upper bound to capture all dummy terms
  really.big = TRUE
)

best_sub_summary <- summary(best_sub)

# Summary table of Adjusted R-squared and BIC by model size
best_sub_df <- tibble(
  `Model Size`       = seq_along(best_sub_summary$adjr2),
  `Adjusted R-sq`    = round(best_sub_summary$adjr2, 4),
  `BIC`              = round(best_sub_summary$bic, 2)
) |>
  mutate(
    `Best Adj R-sq` = ifelse(`Adjusted R-sq` == max(`Adjusted R-sq`), "★", ""),
    `Best BIC`      = ifelse(`BIC` == min(`BIC`), "★", "")
  )

best_sub_df |>
  kable(
    caption = "Best Subsets Regression: Adjusted R-squared and BIC by Model Size",
    align = c("c", "r", "r", "c", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  row_spec(which(best_sub_summary$adjr2 == max(best_sub_summary$adjr2)),
           bold = TRUE, color = "white", background = "#2196F3") |>
  row_spec(which(best_sub_summary$bic == min(best_sub_summary$bic)),
           bold = TRUE, color = "white", background = "#4CAF50")

Best Subsets Regression: Adjusted R-squared and BIC by Model Size
Model Size	Adjusted R-sq	BIC	Best Adj R-sq	Best BIC
1	0.2627	-2420.70
2	0.2956	-2778.12
3	0.3074	-2905.97
4	0.3133	-2966.10
5	0.3160	-2989.73
6	0.3183	-3007.98
7	0.3197	-3016.83
8	0.3208	-3021.26
9	0.3215	-3021.91		★
10	0.3221	-3021.17
11	0.3225	-3018.29
12	0.3231	-3016.99
13	0.3235	-3013.96
14	0.3237	-3008.54
15	0.3239	-3002.54
16	0.3240	-2995.63
17	0.3241	-2988.57
18	0.3241	-2981.41
19	0.3242	-2974.06	★
20	0.3242	-2965.61	★

# Plot Adjusted R-squared and BIC across model sizes
par(mfrow = c(1, 2))

# Adjusted R-squared plot
plot(
  seq_along(best_sub_summary$adjr2),
  best_sub_summary$adjr2,
  type = "b", pch = 19, col = "steelblue",
  xlab = "Number of Predictors",
  ylab = "Adjusted R-squared",
  main = "Best Subsets: Adjusted R-squared"
)
best_adjr2_size <- which.max(best_sub_summary$adjr2)
points(best_adjr2_size, best_sub_summary$adjr2[best_adjr2_size],
       col = "red", pch = 8, cex = 2)

# BIC plot
plot(
  seq_along(best_sub_summary$bic),
  best_sub_summary$bic,
  type = "b", pch = 19, col = "darkgreen",
  xlab = "Number of Predictors",
  ylab = "BIC",
  main = "Best Subsets: BIC"
)
best_bic_size <- which.min(best_sub_summary$bic)
points(best_bic_size, best_sub_summary$bic[best_bic_size],
       col = "red", pch = 8, cex = 2)

par(mfrow = c(1, 1))

cat("Model size maximizing Adjusted R-squared:", which.max(best_sub_summary$adjr2), "\n")

## Model size maximizing Adjusted R-squared: 19

cat("Adjusted R-squared value:", round(max(best_sub_summary$adjr2), 4), "\n\n")

## Adjusted R-squared value: 0.3242

cat("Model size minimizing BIC:", which.min(best_sub_summary$bic), "\n")

## Model size minimizing BIC: 9

cat("BIC value:", round(min(best_sub_summary$bic), 2), "\n\n")

## BIC value: -3021.91

# Show which variables are in the best models
cat("Variables in best Adj-R2 model (size",
    which.max(best_sub_summary$adjr2), "):\n")

## Variables in best Adj-R2 model (size 19 ):

print(coef(best_sub, which.max(best_sub_summary$adjr2)))

##               (Intercept)             menthlth_days                       bmi 
##                2.42326705                0.18384599               -0.01869124 
##                 sexFemale               exerciseYes             depressionYes 
##                0.23712114               -1.92576399                0.77637429 
##            age_group35-49            age_group50-64              age_group65+ 
##                0.76294842                1.39939197                1.73028710 
##           income$25K-$49K           income$50K-$99K              income$100K+ 
##               -1.07158786               -1.50068492               -1.29329261 
##     educationSome college educationCollege graduate            smokingCurrent 
##                0.75452784                0.80071745                0.46323888 
##             smokingFormer            gen_healthGood       gen_healthFair/Poor 
##                0.43731142                1.40757016               10.05466396 
## maritalPreviously married      maritalNever married 
##               -0.26397878               -0.40454371

cat("\nVariables in best BIC model (size",
    which.min(best_sub_summary$bic), "):\n")

## 
## Variables in best BIC model (size 9 ):

print(coef(best_sub, which.min(best_sub_summary$bic)))

##         (Intercept)       menthlth_days         exerciseYes       depressionYes 
##           1.4036767           0.1881763          -1.8794631           0.8954395 
##      age_group35-49      age_group50-64        age_group65+     income$50K-$99K 
##           1.0035144           1.6583700           2.0548414          -0.5080191 
##      gen_healthGood gen_healthFair/Poor 
##           1.3542065          10.0640869

Interpretation: The two criteria identify different optimal model sizes. Adjusted R-squared is maximized at a larger model size of 19 predictors, indicating that additional parameters continue to improve fit (after penalty) at that level. BIC, which applies a more severe penalty for model complexity (especially with N = 8,000), favors a more parsimonious solution at 9 predictors. BIC therefore favors a simpler model, which is consistent with its mathematical design for larger sample sizes.

Important Limitation: regsubsets() treats each dummy variable from a factor as a separate predictor. This means some levels of a categorical variable can be included while others are excluded — which is statistically incoherent and violates the principle of marginality. For example, only the “Fair/Poor” level of gen_health might enter without the “Good” level. The stepwise methods in Step 3 handle factors as complete units and should be considered the primary selection approach.

2.3 Step 3: Stepwise Selection Methods

# Null model (intercept only) for forward/stepwise starting point
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Backward elimination: start from maximum model
cat("=== BACKWARD ELIMINATION ===\n")

## === BACKWARD ELIMINATION ===

mod_backward <- step(
  mod_max,
  direction = "backward",
  trace     = 1
)

## Start:  AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
## 
##                 Df Sum of Sq    RSS   AIC
## - marital        2       180 403789 31410
## <none>                       403609 31410
## - sex            1       104 403712 31410
## - bmi            1       108 403716 31410
## - smoking        2       347 403956 31413
## - depression     1       656 404265 31421
## - education      3       876 404485 31421
## - income         3      1550 405158 31435
## - age_group      3      2235 405844 31448
## - exercise       1      4537 408146 31497
## - menthlth_days  1     13230 416839 31666
## - gen_health     2     84670 488279 32930
## 
## Step:  AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - bmi            1        98 403887 31410
## <none>                       403789 31410
## - sex            1       102 403891 31410
## - smoking        2       345 404134 31412
## - depression     1       637 404427 31420
## - education      3       875 404665 31421
## - income         3      1388 405177 31431
## - age_group      3      3048 406837 31464
## - exercise       1      4538 408327 31497
## - menthlth_days  1     13195 416984 31665
## - gen_health     2     84707 488496 32929
## 
## Step:  AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

cat("\nFinal model variables (backward):\n")

## 
## Final model variables (backward):

print(names(coef(mod_backward)))

##  [1] "(Intercept)"               "menthlth_days"            
##  [3] "sexFemale"                 "exerciseYes"              
##  [5] "depressionYes"             "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "educationHS/GED"          
## [13] "educationSome college"     "educationCollege graduate"
## [15] "smokingCurrent"            "smokingFormer"            
## [17] "gen_healthGood"            "gen_healthFair/Poor"

# Forward selection: start from null, max model as upper scope
cat("=== FORWARD SELECTION ===\n")

## === FORWARD SELECTION ===

mod_forward <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "forward",
  trace     = 1
)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1   17940.1 416725 31632
## + exercise       1    6466.7 428198 31849
## + depression     1    6070.1 428595 31857
## + income         3    3231.0 431434 31913
## + smoking        2    1776.3 432889 31938
## + age_group      3    1535.7 433129 31945
## + sex            1    1254.2 433411 31946
## + marital        2     830.3 433835 31956
## + education      3     415.5 434250 31965
## <none>                       434665 31967
## + bmi            1       0.2 434665 31969
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##              Df Sum of Sq    RSS   AIC
## + exercise    1    5820.8 410904 31521
## + age_group   3    4661.5 412064 31548
## + income      3    1987.0 414738 31600
## + marital     2    1402.5 415322 31609
## + smoking     2    1034.6 415690 31616
## + depression  1     738.1 415987 31620
## + sex         1     605.2 416120 31622
## + education   3     318.9 416406 31632
## <none>                    416725 31632
## + bmi         1       3.6 416721 31634
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##              Df Sum of Sq    RSS   AIC
## + age_group   3    3720.5 407184 31455
## + income      3    1198.4 409706 31504
## + marital     2    1064.8 409839 31505
## + depression  1     739.1 410165 31509
## + smoking     2     706.2 410198 31512
## + education   3     686.9 410217 31514
## + sex         1     449.3 410455 31515
## <none>                    410904 31521
## + bmi         1      82.9 410821 31522
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##              Df Sum of Sq    RSS   AIC
## + income      3   1162.69 406021 31438
## + depression  1    932.70 406251 31438
## + education   3    501.69 406682 31451
## + sex         1    261.03 406923 31452
## + smoking     2    360.51 406823 31452
## <none>                    407184 31455
## + bmi         1     83.59 407100 31455
## + marital     2     40.11 407144 31458
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##              Df Sum of Sq    RSS   AIC
## + depression  1    861.57 405159 31423
## + education   3    953.62 405067 31425
## + smoking     2    305.02 405716 31436
## + sex         1    203.01 405818 31436
## <none>                    406021 31438
## + bmi         1     74.91 405946 31438
## + marital     2    153.70 405867 31439
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    837.21 404322 31412
## + smoking    2    259.11 404900 31422
## + sex        1    110.16 405049 31423
## + bmi        1    105.12 405054 31423
## <none>                   405159 31423
## + marital    2    169.72 404990 31423
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##           Df Sum of Sq    RSS   AIC
## + smoking  2    332.18 403990 31410
## + bmi      1    110.19 404212 31412
## <none>                 404322 31412
## + sex      1     77.87 404244 31413
## + marital  2    168.26 404154 31413
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##           Df Sum of Sq    RSS   AIC
## + sex      1    102.92 403887 31410
## <none>                 403990 31410
## + bmi      1     98.80 403891 31410
## + marital  2    169.54 403820 31410
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##           Df Sum of Sq    RSS   AIC
## <none>                 403887 31410
## + bmi      1    97.861 403789 31410
## + marital  2   170.601 403716 31410

cat("\nFinal model variables (forward):\n")

## 
## Final model variables (forward):

print(names(coef(mod_forward)))

##  [1] "(Intercept)"               "gen_healthGood"           
##  [3] "gen_healthFair/Poor"       "menthlth_days"            
##  [5] "exerciseYes"               "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "depressionYes"            
## [13] "educationHS/GED"           "educationSome college"    
## [15] "educationCollege graduate" "smokingCurrent"           
## [17] "smokingFormer"             "sexFemale"

# Stepwise selection: start from null, both directions
cat("=== STEPWISE SELECTION ===\n")

## === STEPWISE SELECTION ===

mod_stepwise <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "both",
  trace     = 1
)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     17940 416725 31632
## + exercise       1      6467 428198 31849
## + depression     1      6070 428595 31857
## + income         3      3231 431434 31913
## + smoking        2      1776 432889 31938
## + age_group      3      1536 433129 31945
## + sex            1      1254 433411 31946
## + marital        2       830 433835 31956
## + education      3       416 434250 31965
## <none>                       434665 31967
## + bmi            1         0 434665 31969
## - gen_health     2    164022 598687 34524
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      5821 410904 31521
## + age_group      3      4661 412064 31548
## + income         3      1987 414738 31600
## + marital        2      1402 415322 31609
## + smoking        2      1035 415690 31616
## + depression     1       738 415987 31620
## + sex            1       605 416120 31622
## + education      3       319 416406 31632
## <none>                       416725 31632
## + bmi            1         4 416721 31634
## - menthlth_days  1     17940 434665 31967
## - gen_health     2    121660 538384 33677
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      3720 407184 31455
## + income         3      1198 409706 31504
## + marital        2      1065 409839 31505
## + depression     1       739 410165 31509
## + smoking        2       706 410198 31512
## + education      3       687 410217 31514
## + sex            1       449 410455 31515
## <none>                       410904 31521
## + bmi            1        83 410821 31522
## - exercise       1      5821 416725 31632
## - menthlth_days  1     17294 428198 31849
## - gen_health     2    101805 512709 33288
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3      1163 406021 31438
## + depression     1       933 406251 31438
## + education      3       502 406682 31451
## + sex            1       261 406923 31451
## + smoking        2       361 406823 31451
## <none>                       407184 31455
## + bmi            1        84 407100 31455
## + marital        2        40 407144 31458
## - age_group      3      3720 410904 31521
## - exercise       1      4880 412064 31548
## - menthlth_days  1     19957 427141 31835
## - gen_health     2     94875 502059 33126
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       862 405159 31423
## + education      3       954 405067 31425
## + smoking        2       305 405716 31436
## + sex            1       203 405818 31436
## <none>                       406021 31438
## + bmi            1        75 405946 31438
## + marital        2       154 405867 31439
## - income         3      1163 407184 31455
## - age_group      3      3685 409706 31504
## - exercise       1      4214 410235 31518
## - menthlth_days  1     18945 424966 31801
## - gen_health     2     87509 493530 32995
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       837 404322 31412
## + smoking        2       259 404900 31422
## + sex            1       110 405049 31423
## + bmi            1       105 405054 31423
## <none>                       405159 31423
## + marital        2       170 404990 31423
## - depression     1       862 406021 31438
## - income         3      1092 406251 31438
## - age_group      3      3871 409030 31493
## - exercise       1      4219 409378 31504
## - menthlth_days  1     13674 418834 31686
## - gen_health     2     86610 491770 32969
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2       332 403990 31410
## + bmi            1       110 404212 31412
## <none>                       404322 31412
## + sex            1        78 404244 31413
## + marital        2       168 404154 31413
## - education      3       837 405159 31423
## - depression     1       745 405067 31425
## - income         3      1495 405818 31436
## - age_group      3      3558 407880 31476
## - exercise       1      4604 408926 31501
## - menthlth_days  1     13607 417929 31675
## - gen_health     2     86813 491135 32964
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + sex            1       103 403887 31410
## <none>                       403990 31410
## + bmi            1        99 403891 31410
## + marital        2       170 403820 31410
## - smoking        2       332 404322 31412
## - depression     1       691 404681 31421
## - education      3       910 404900 31422
## - income         3      1457 405447 31432
## - age_group      3      3178 407168 31466
## - exercise       1      4502 408492 31496
## - menthlth_days  1     13347 417337 31668
## - gen_health     2     85552 489542 32942
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## + bmi            1        98 403789 31410
## + marital        2       171 403716 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

cat("\nFinal model variables (stepwise):\n")

## 
## Final model variables (stepwise):

print(names(coef(mod_stepwise)))

##  [1] "(Intercept)"               "gen_healthGood"           
##  [3] "gen_healthFair/Poor"       "menthlth_days"            
##  [5] "exerciseYes"               "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "depressionYes"            
## [13] "educationHS/GED"           "educationSome college"    
## [15] "educationCollege graduate" "smokingCurrent"           
## [17] "smokingFormer"             "sexFemale"

# Compare final variables across methods
backward_vars  <- names(coef(mod_backward))[-1]  # exclude intercept
forward_vars   <- names(coef(mod_forward))[-1]
stepwise_vars  <- names(coef(mod_stepwise))[-1]

# Get all unique predictor terms (from max model)
all_terms <- names(coef(mod_max))[-1]

comparison_df <- tibble(
  Term        = all_terms,
  Backward    = ifelse(all_terms %in% backward_vars,  "✓", "✗"),
  Forward     = ifelse(all_terms %in% forward_vars,   "✓", "✗"),
  Stepwise    = ifelse(all_terms %in% stepwise_vars,  "✓", "✗")
)

comparison_df |>
  kable(
    caption = "Variable Inclusion Across Stepwise Selection Methods",
    align = c("l", "c", "c", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Variable Inclusion Across Stepwise Selection Methods
Term	Backward	Forward	Stepwise
menthlth_days	✓	✓	✓
bmi	✗	✗	✗
sexFemale	✓	✓	✓
exerciseYes	✓	✓	✓
depressionYes	✓	✓	✓
age_group35-49	✓	✓	✓
age_group50-64	✓	✓	✓
age_group65+	✓	✓	✓
income$25K-$49K	✓	✓	✓
income$50K-$99K	✓	✓	✓
income$100K+	✓	✓	✓
educationHS/GED	✓	✓	✓
educationSome college	✓	✓	✓
educationCollege graduate	✓	✓	✓
smokingCurrent	✓	✓	✓
smokingFormer	✓	✓	✓
gen_healthGood	✓	✓	✓
gen_healthFair/Poor	✓	✓	✓
maritalPreviously married	✗	✗	✗
maritalNever married	✗	✗	✗

# AIC comparison across final models
aic_compare <- tibble(
  Method     = c("Maximum Model", "Backward", "Forward", "Stepwise"),
  Predictors = c(length(names(coef(mod_max))),
                 length(names(coef(mod_backward))),
                 length(names(coef(mod_forward))),
                 length(names(coef(mod_stepwise)))),
  AIC        = round(c(AIC(mod_max), AIC(mod_backward),
                       AIC(mod_forward), AIC(mod_stepwise)), 2),
  AdjR2      = round(c(summary(mod_max)$adj.r.squared,
                       summary(mod_backward)$adj.r.squared,
                       summary(mod_forward)$adj.r.squared,
                       summary(mod_stepwise)$adj.r.squared), 4)
)

aic_compare |>
  kable(
    caption = "Comparison of Model Fit Criteria Across Selection Methods",
    col.names = c("Method", "# Parameters", "AIC", "Adjusted R²"),
    align = c("l", "c", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Comparison of Model Fit Criteria Across Selection Methods
Method	# Parameters	AIC	Adjusted R²
Maximum Model	21	54115.05	0.3242
Backward	18	54114.57	0.3239
Forward	18	54114.57	0.3239
Stepwise	18	54114.57	0.3239

Comparison of Stepwise Methods: All three selection methods — backward elimination, forward selection, and stepwise — converge on the same final model, which includes all predictors from the maximum model with no variables removed. This agreement across methods provides strong evidence that each predictor contributes meaningfully to predicting physically unhealthy days. The consistency across methods is not surprising given the large sample size (N = 8,000), which provides sufficient power to detect even modest associations; AIC-based selection therefore retains variables that would be penalized out in smaller samples. No variables were excluded by any method, indicating that all 11 candidate predictors (mental health days, BMI, sex, exercise, depression, age group, income, education, smoking, general health, and marital status) each carry predictive signal above and beyond the others.

2.4 Step 4: Final Model Selection and Interpretation

Final model selection: All three stepwise procedures retained the same predictors as the maximum model, confirming that the full 11-predictor model is optimal under AIC. While BIC favored a smaller model in best subsets (penalizing more heavily for sample size), the stepwise methods, which appropriately handle categorical variables as complete units, uniformly supported retaining all predictors. Given the consistency of the stepwise results, the theoretical plausibility of all included variables, and the acceptable Adjusted R², the full model with all 11 predictors is selected as the final linear regression model.

# Final model = maximum model (all stepwise methods agreed)
mod_final <- mod_max

# Display coefficient table
tidy(mod_final, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 3))) |>
  kable(
    caption = "Final Linear Regression Model: Coefficient Table (N = 8,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper"),
    align = c("l", rep("r", 6))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  scroll_box(width = "100%")

Final Linear Regression Model: Coefficient Table (N = 8,000)
Term	Estimate	Std. Error	t-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	2.201	0.627	3.508	0.000	0.971	3.431
menthlth_days	0.184	0.011	16.172	0.000	0.161	0.206
bmi	-0.019	0.013	-1.459	0.145	-0.044	0.006
sexFemale	0.234	0.164	1.431	0.153	-0.087	0.556
exerciseYes	-1.933	0.204	-9.471	0.000	-2.333	-1.533
depressionYes	0.775	0.215	3.602	0.000	0.353	1.197
age_group35-49	0.769	0.284	2.712	0.007	0.213	1.325
age_group50-64	1.401	0.279	5.026	0.000	0.855	1.947
age_group65+	1.727	0.279	6.202	0.000	1.181	2.273
income$25K-$49K	-1.084	0.276	-3.925	0.000	-1.626	-0.543
income$50K-$99K	-1.524	0.276	-5.520	0.000	-2.065	-0.983
income$100K+	-1.316	0.398	-3.304	0.001	-2.096	-0.535
educationHS/GED	0.294	0.401	0.733	0.464	-0.492	1.080
educationSome college	1.002	0.403	2.488	0.013	0.213	1.791
educationCollege graduate	1.053	0.404	2.603	0.009	0.260	1.845
smokingCurrent	0.464	0.266	1.744	0.081	-0.058	0.986
smokingFormer	0.438	0.188	2.332	0.020	0.070	0.806
gen_healthGood	1.410	0.186	7.565	0.000	1.045	1.775
gen_healthFair/Poor	10.065	0.252	39.879	0.000	9.570	10.560
maritalPreviously married	-0.267	0.207	-1.291	0.197	-0.672	0.138
maritalNever married	-0.412	0.249	-1.656	0.098	-0.901	0.076

Interpretation of Selected Coefficients:

Mental Health Days (continuous): Holding all other variables constant, each additional mentally unhealthy day in the past 30 is associated with an increase of approximately 0.18 physically unhealthy days, on average. This positive association is highly statistically significant (p < 0.001), indicating a strong co-occurrence of mental and physical health burden.
General Health — Fair/Poor vs. Excellent/Very Good (categorical): Holding all other variables constant, adults who rate their general health as Fair or Poor report, on average, 10.1 more physically unhealthy days per month compared to those rating their health as Excellent or Very Good (the reference category). This is the largest single coefficient in the model, underscoring that self-rated health captures substantial variance in physical health burden.
Exercise — Yes vs. No (binary): Holding all other variables constant, adults who engaged in any exercise in the past 30 days report approximately 1.93 fewer physically unhealthy days per month compared to those who did not exercise. This negative association is consistent with the established role of physical activity in reducing pain and physical impairment.

2.5 Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(mod_final, which = 1:4,
     sub.caption = "Diagnostic Plots: Final Linear Regression Model")

par(mfrow = c(1, 1))

Interpretation of Diagnostic Plots:

Residuals vs. Fitted: There is a clear non-linear pattern in this plot, with residuals showing a fan-shaped spread that widens at higher fitted values. This suggests both non-linearity and heteroscedasticity (non-constant variance), which is a common finding with count-like outcome variables bounded at zero.
Normal Q-Q: The residuals deviate substantially from the diagonal reference line, particularly in the right tail, indicating a right-skewed residual distribution. This is expected given that physically unhealthy days is a zero-inflated, bounded count variable that violates the normality assumption.
Scale-Location: The smoothed line is not flat, it increases across fitted values, confirming that residual variance grows with the mean (heteroscedasticity). The assumption of equal variance (homoscedasticity) appears violated.
Residuals vs. Leverage: Most observations cluster near the origin with low leverage and small residuals. A small number of points have higher Cook’s distance but remain below the conventional threshold of 0.5, suggesting no single observation is unduly driving the model estimates.

Overall Conclusion on LINE Assumptions: The LINE assumptions are not fully satisfied for this model. Linearity is approximately met for the continuous predictors, but the residuals exhibit clear non-normality and heteroscedasticity, both attributable to the bounded, zero-inflated nature of the outcome (physically unhealthy days). Independence is reasonable given the survey design (with the caveat that BRFSS uses complex cluster sampling). These violations suggest that the linear model is a reasonable first approximation but that alternative approaches (e.g., Poisson regression, zero-inflated models, or log-transformation of the outcome) might better fit the data structure. However, with N = 8,000 the Central Limit Theorem provides some protection for inference on coefficients.

3 Part 2: Logistic Regression

3.1 Step 1: Create the Binary Outcome

# Create frequent_distress: 1 if physhlth_days >= 14, 0 otherwise
brfss_analytic <- brfss_analytic |>
  mutate(
    frequent_distress = factor(
      ifelse(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")   # "No" = reference level
    )
  )

# Report prevalence
distress_table <- brfss_analytic |>
  count(frequent_distress) |>
  mutate(Percentage = round(n / sum(n) * 100, 1))

distress_table |>
  kable(
    caption = "Prevalence of Frequent Physical Distress (≥14 days in past 30)",
    col.names = c("Frequent Physical Distress", "N", "Percentage (%)"),
    align = c("l", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Prevalence of Frequent Physical Distress (≥14 days in past 30)
Frequent Physical Distress	N	Percentage (%)
No	6948	86.9
Yes	1052	13.2

Comment on outcome balance: The outcome is moderately imbalanced: approximately 13.2% of the analytic sample meets the CDC’s frequent physical distress threshold of 14 or more days, compared to 86.9% who do not. This level of imbalance (roughly 13.2% prevalence) is common in population-based health surveys and does not require special correction methods such as oversampling; standard logistic regression is appropriate.

3.2 Step 2: Simple (Unadjusted) Logistic Regression

Predictor chosen: gen_health (general health status). I chose this predictor because the linear regression results showed it had the largest coefficient magnitude among all predictors, indicating the strongest unadjusted association with physically unhealthy days. Individuals who rate their health as Fair or Poor are likely to report high physical distress, making it a theoretically and empirically strong predictor of the binary threshold outcome.

# Fit simple logistic regression
mod_simple <- glm(
  frequent_distress ~ gen_health,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios and 95% CIs
or_simple  <- exp(coef(mod_simple))
ci_simple  <- exp(confint(mod_simple))

tibble(
  Term         = names(or_simple),
  `Odds Ratio` = round(or_simple, 3),
  `95% CI Lower` = round(ci_simple[, 1], 3),
  `95% CI Upper` = round(ci_simple[, 2], 3)
) |>
  kable(
    caption = "Simple Logistic Regression: Odds Ratios for General Health Status",
    align = c("l", "r", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Simple Logistic Regression: Odds Ratios for General Health Status
Term	Odds Ratio	95% CI Lower	95% CI Upper
(Intercept)	0.033	0.028	0.040
gen_healthGood	3.132	2.521	3.911
gen_healthFair/Poor	26.811	21.915	33.038

Interpretation:

Good vs. Excellent/Very Good: Compared to adults who rate their general health as Excellent or Very Good (reference), those who rate it as Good have 3.13 times the odds of frequent physical distress (95% CI: [2.52, 3.91]). This association is statistically significant at α = 0.05 because the 95% CI excludes 1.0 and the Wald test p-value is < 0.001.
Fair/Poor vs. Excellent/Very Good: Compared to adults rating their health as Excellent or Very Good, those rating it as Fair or Poor have 26.81 times the odds of frequent physical distress (95% CI: [21.91, 33.04]). This is a strikingly large odds ratio confirming that poor self-rated health is strongly associated with frequent physical distress. This association is statistically significant at α = 0.05 (95% CI does not include 1.0; p < 0.001).

3.3 Step 3: Multiple Logistic Regression

# Fit multiple logistic regression using same predictors as final linear model
mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex + 
##     exercise + depression + age_group + income + education + 
##     smoking + gen_health + marital, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.639413   0.291867 -12.469  < 2e-16 ***
## menthlth_days              0.044787   0.004358  10.277  < 2e-16 ***
## bmi                       -0.004822   0.005556  -0.868  0.38546    
## sexFemale                  0.176389   0.081382   2.167  0.03020 *  
## exerciseYes               -0.586264   0.085768  -6.835 8.17e-12 ***
## depressionYes              0.267888   0.095954   2.792  0.00524 ** 
## age_group35-49             0.512672   0.162401   3.157  0.00160 ** 
## age_group50-64             0.767269   0.154816   4.956 7.20e-07 ***
## age_group65+               0.964882   0.155220   6.216 5.09e-10 ***
## income$25K-$49K           -0.215375   0.111712  -1.928  0.05386 .  
## income$50K-$99K           -0.501752   0.118036  -4.251 2.13e-05 ***
## income$100K+              -0.541456   0.227454  -2.381  0.01729 *  
## educationHS/GED            0.049334   0.165069   0.299  0.76504    
## educationSome college      0.331979   0.165058   2.011  0.04430 *  
## educationCollege graduate  0.283640   0.171335   1.655  0.09783 .  
## smokingCurrent             0.208048   0.116374   1.788  0.07381 .  
## smokingFormer              0.200388   0.089714   2.234  0.02551 *  
## gen_healthGood             0.895044   0.116270   7.698 1.38e-14 ***
## gen_healthFair/Poor        2.696530   0.115309  23.385  < 2e-16 ***
## maritalPreviously married -0.138611   0.096205  -1.441  0.14965    
## maritalNever married      -0.143542   0.127789  -1.123  0.26132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4420.7  on 7979  degrees of freedom
## AIC: 4462.7
## 
## Number of Fisher Scoring iterations: 6

# Adjusted odds ratios with 95% CIs
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 3))) |>
  kable(
    caption = "Multiple Logistic Regression: Adjusted Odds Ratios (N = 8,000)",
    col.names = c("Term", "Adj. OR", "Std. Error", "z-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper"),
    align = c("l", rep("r", 6))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  scroll_box(width = "100%")

Multiple Logistic Regression: Adjusted Odds Ratios (N = 8,000)
Term	Adj. OR	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.026	0.292	-12.469	0.000	0.015	0.046
menthlth_days	1.046	0.004	10.277	0.000	1.037	1.055
bmi	0.995	0.006	-0.868	0.385	0.984	1.006
sexFemale	1.193	0.081	2.167	0.030	1.017	1.400
exerciseYes	0.556	0.086	-6.835	0.000	0.470	0.658
depressionYes	1.307	0.096	2.792	0.005	1.082	1.576
age_group35-49	1.670	0.162	3.157	0.002	1.217	2.302
age_group50-64	2.154	0.155	4.956	0.000	1.596	2.929
age_group65+	2.624	0.155	6.216	0.000	1.944	3.573
income$25K-$49K	0.806	0.112	-1.928	0.054	0.648	1.004
income$50K-$99K	0.605	0.118	-4.251	0.000	0.481	0.763
income$100K+	0.582	0.227	-2.381	0.017	0.368	0.899
educationHS/GED	1.051	0.165	0.299	0.765	0.762	1.456
educationSome college	1.394	0.165	2.011	0.044	1.011	1.932
educationCollege graduate	1.328	0.171	1.655	0.098	0.952	1.864
smokingCurrent	1.231	0.116	1.788	0.074	0.979	1.545
smokingFormer	1.222	0.090	2.234	0.026	1.024	1.456
gen_healthGood	2.447	0.116	7.698	0.000	1.952	3.081
gen_healthFair/Poor	14.828	0.115	23.385	0.000	11.863	18.648
maritalPreviously married	0.871	0.096	-1.441	0.150	0.720	1.050
maritalNever married	0.866	0.128	-1.123	0.261	0.673	1.111

Interpretation of Adjusted Odds Ratios:

Mental Health Days (continuous): Holding all other variables constant, each additional mentally unhealthy day in the past 30 is associated with 1.046 times the odds of frequent physical distress (95% CI: [1.037, 1.055]). Each additional day of mental distress slightly but significantly increases the odds of crossing the physical distress threshold.
General Health — Fair/Poor vs. Excellent/Very Good (categorical): After adjusting for all other predictors, adults rating their health as Fair or Poor have 14.83 times the odds of frequent physical distress compared to those rating it as Excellent or Very Good. This remains the strongest predictor in the adjusted model, with a 95% CI of [11.86, 18.65].
Depression — Yes vs. No (binary): Holding all other variables constant, adults who have ever been told they have a depressive disorder have 1.31 times the odds of frequent physical distress compared to those without a depression history (95% CI: [1.08, 1.58]). This confirms the well-documented bidirectional relationship between depression and physical health symptoms even after adjusting for demographic and behavioral confounders.

3.4 Step 4: Likelihood Ratio Test

Testing the income group of predictors: I will collectively test all income level dummy variables (i.e., income$25K-$49K, income$50K-$99K, and income$100K+) to determine whether socioeconomic status, as captured by income, significantly improves the logistic model beyond the other predictors.

Hypotheses: - H₀: The income predictors do not improve model fit; the coefficients for all three income levels are simultaneously equal to zero (β_$25K-$49K = β_$50K-$99K = β_$100K+ = 0). - H₁: At least one income level coefficient is not equal to zero; income collectively improves model fit.

# Fit reduced model excluding income
mod_reduced <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
lrt_result <- anova(mod_reduced, mod_logistic, test = "Chisq")
print(lrt_result)

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + education + smoking + gen_health + marital
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7982     4439.7                          
## 2      7979     4420.7  3   19.082 0.0002629 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Report LRT results
tibble(
  Model       = c("Reduced (no income)", "Full (with income)"),
  Deviance    = round(c(mod_reduced$deviance, mod_logistic$deviance), 2),
  df          = c(mod_reduced$df.residual, mod_logistic$df.residual)
) |>
  kable(
    caption = "Model Deviances for Likelihood Ratio Test",
    align = c("l", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model Deviances for Likelihood Ratio Test
Model	Deviance	df
Reduced (no income)	4439.74	7982
Full (with income)	4420.66	7979

cat("LRT Chi-square statistic:",
    round(lrt_result$Deviance[2], 3), "\n")

## LRT Chi-square statistic: 19.082

cat("Degrees of freedom:", lrt_result$Df[2], "\n")

## Degrees of freedom: 3

cat("p-value:", format.pval(lrt_result$`Pr(>Chi)`[2], digits = 4), "\n")

## p-value: 0.0002629

Conclusion: The likelihood ratio test yields a chi-square statistic of 19.08 with 3 degrees of freedom (corresponding to the 3 income dummy variables). The p-value is 0.000263. At α = 0.05, we reject the null hypothesis and conclude that the income predictors significantly improve the model fit beyond the other predictors.

3.5 Step 5: ROC Curve and AUC

# Generate ROC curve
roc_obj <- roc(
  brfss_analytic$frequent_distress,
  fitted(mod_logistic),
  levels = c("No", "Yes"),
  direction = "<"
)

# Plot ROC curve
plot(
  roc_obj,
  main      = "ROC Curve: Frequent Physical Distress Model",
  col       = "steelblue",
  lwd       = 2,
  print.auc = TRUE,
  auc.polygon = TRUE,
  auc.polygon.col = "lightblue",
  print.auc.y = 0.45
)
abline(a = 0, b = 1, lty = 2, col = "gray50")

# Report AUC with 95% CI
auc_val <- auc(roc_obj)
auc_ci  <- ci.auc(roc_obj)

cat("AUC:", round(auc_val, 4), "\n")

## AUC: 0.8555

cat("95% CI:", round(auc_ci[1], 4), "–", round(auc_ci[3], 4), "\n")

## 95% CI: 0.8426 – 0.8683

Interpretation: The Area Under the ROC Curve (AUC) is 0.855 (95% CI: 0.843–0.868). Based on the discrimination guide provided, this value falls in the excellent (0.80–0.90) range. This means the model has excellent ability to discriminate between individuals who do and do not experience frequent physical distress: if we randomly selected one person with frequent distress and one without, the model would correctly rank the distressed person as having a higher predicted probability approximately 85.5% of the time (compared to 50% for a useless model). The combination of self-rated health, mental health days, depression history, and sociodemographic variables therefore provides substantial discriminative capacity for identifying frequent physical distress.

3.6 Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

Hypotheses: - H₀: The logistic regression model fits the data well (observed and expected event frequencies are similar across deciles of predicted probability). - H₁: The logistic regression model does not fit the data well (there is significant discrepancy between observed and expected frequencies across deciles).

# Hosmer-Lemeshow goodness-of-fit test (g = 10 groups)
hl_test <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
print(hl_test)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945

# Observed vs. expected
hl_obs_exp <- cbind(hl_test$observed, hl_test$expected)
colnames(hl_obs_exp) <- c("Obs. No", "Obs. Yes", "Exp. No", "Exp. Yes")

hl_obs_exp |>
  as.data.frame() |>
  rownames_to_column("Decile") |>
  mutate(across(where(is.numeric), ~ round(., 1))) |>
  kable(
    caption = "Hosmer-Lemeshow Test: Observed vs. Expected by Predicted Probability Decile",
    align = c("l", rep("r", 4))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Hosmer-Lemeshow Test: Observed vs. Expected by Predicted Probability Decile
Decile	Obs. No	Obs. Yes	Exp. No	Exp. Yes
[0.00675,0.0185]	786	14	789.0	11.0
(0.0185,0.025]	784	16	782.6	17.4
(0.025,0.0305]	785	15	778.1	21.9
(0.0305,0.0378]	777	23	773.1	26.9
(0.0378,0.0507]	765	35	764.9	35.1
(0.0507,0.0679]	753	47	752.7	47.3
(0.0679,0.0946]	728	72	736.1	63.9
(0.0946,0.187]	692	108	695.8	104.2
(0.187,0.425]	546	254	552.1	247.9
(0.425,0.893]	332	468	323.6	476.4

cat("\nHosmer-Lemeshow Test Summary:\n")

## 
## Hosmer-Lemeshow Test Summary:

cat("Chi-square statistic:", round(hl_test$statistic, 3), "\n")

## Chi-square statistic: 5.577

cat("Degrees of freedom:", hl_test$parameter, "\n")

## Degrees of freedom: 8

cat("p-value:", format.pval(hl_test$p.value, digits = 4), "\n")

## p-value: 0.6945

Interpretation: The Hosmer-Lemeshow test yields a chi-square statistic of 5.58 with 8 degrees of freedom. The p-value is 0.694. At α = 0.05, we fail to reject the null hypothesis, indicating no significant evidence of poor model fit.

Complementarity with ROC/AUC: The ROC/AUC and Hosmer-Lemeshow test evaluate different and complementary aspects of logistic model performance. The AUC measures discrimination — the model’s ability to rank individuals by their predicted probability — and does not depend on the calibration of predicted probabilities. The Hosmer-Lemeshow test assesses calibration — whether the predicted probabilities are well-aligned with observed event rates across the spectrum of risk. A model can have excellent AUC but poor calibration (if it consistently over- or under-estimates risk in certain subgroups), or vice versa. The combination of excellent discrimination (high AUC) and adequate calibration (non-significant HL test) provides a more complete picture of overall model performance than either metric alone.

4 Part 3: Reflection

4.1 A. Statistical Insights

The results of both the linear and logistic regression models paint a consistent picture of the multidimensional drivers of physical health burden among U.S. adults. General health status emerged as the single strongest predictor in both models: adults who rate their health as Fair or Poor report dramatically more physically unhealthy days and have markedly higher odds of frequent physical distress, even after adjusting for all other variables in the model. This likely reflects a genuine construct validity relationship, self-rated health captures accumulated chronic illness experience that the other variables only partially measure. Mental health days was the second strongest predictor in both frameworks, underscoring the well-established bidirectional linkage between mental and physical health; each additional mentally unhealthy day incrementally increased both the count of physical health days and the odds of crossing the distress threshold. Depression history was also consistently significant, reinforcing these findings.

Several predictors were significant across both models, including exercise (protective), income (gradient of worsening health with lower income), and age group (older adults reporting greater burden). A notable limitation of this analysis is its cross-sectional design: we cannot establish temporal ordering or causal direction between any predictor and the outcome. Among the most important unmeasured confounders are (1) chronic disease burden (e.g., diabetes, arthritis, cardiovascular disease), which drives both physical symptoms and poor self-rated health and would substantially confound associations with all sociodemographic predictors; and (2) access to healthcare and insurance status, which shapes both the experience of physical symptoms and one’s capacity to manage them, and which correlates with income, education, and employment status already in the model.

4.2 B. Linear versus Logistic Regression

The linear regression model quantifies how many more physically unhealthy days are associated with each predictor, providing a clinically intuitive metric on the original scale (days per month). This is valuable for estimating the population-level burden: knowing that Fair/Poor self-rated health is associated with approximately 10.1 additional physically unhealthy days per month is directly actionable for public health planning. However, linear regression assumes a continuous, normally distributed outcome — an assumption clearly violated by a zero-inflated, bounded count variable.

Logistic regression reframes the question as a classification problem: does the individual cross the CDC’s clinical threshold for frequent physical distress? This binary framing has direct policy relevance because programs such as Medicaid care management use exactly such thresholds to identify high-need patients. The odds ratio scale, while less intuitive than the day count, allows for probabilistic interpretation and risk ranking across heterogeneous populations. The AUC of 0.855 provides an overall measure of the model’s discriminative value that has no equivalent in linear regression; conversely, R² (0.326 in the linear model) quantifies explained variance — a metric with no natural logistic equivalent. In clinical or screening contexts where the goal is to identify high-risk individuals, logistic regression is preferred; when the goal is to quantify the magnitude of association or predict the expected number of days for resource planning purposes, linear regression (or an appropriate count model) is more suitable.

4.3 C. AI Transparency

I used google to prove some of the interpretations, this was to confirm my suspicions and make more understanding of what I am working on.

End of Assignment

EPI 553 / STA 553: Computer Homework 4

Model selection (best subsets, backward, forward, stepwise), model comparison criteria (Adjusted R-squared, AIC, BIC), LINE assumption checking, binary logistic regression, odds ratios, likelihood ratio tests, ROC/AUC, Hosmer-Lemeshow goodness-of-fit

Marina Frimpong

2026-04-23