## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     xfun_0.56        
##  [5] cachem_1.1.0      knitr_1.50        htmltools_0.5.8.1 rmarkdown_2.30   
##  [9] lifecycle_1.0.5   cli_3.6.5         sass_0.4.10       jquerylib_0.1.4  
## [13] compiler_4.5.1    rstudioapi_0.17.1 tools_4.5.1       evaluate_1.0.5   
## [17] bslib_0.9.0       yaml_2.3.10       rlang_1.1.7       jsonlite_2.0.0

1 Part 0: Data Preparation

1.1 Step 1: Load Packages and Import Data

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

1.2 Research Question

What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)?

# Import the BRFSS 2023 XPT file
brfss_raw <- read_xpt('/Users/samriddhi/Downloads/LLCP2023.XPT ')

# Report raw dimensions
cat("Raw dataset dimensions:\n")

## Raw dataset dimensions:

cat("  Rows:", nrow(brfss_raw), "\n")

##   Rows: 433323

cat("  Columns:", ncol(brfss_raw), "\n")

##   Columns: 350

# Variable selection for this work 
brfss_selected <- brfss_raw |>
  select(
    PHYSHLTH,
    MENTHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    ADDEPEV3,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`,
    GENHLTH,
    MARITAL
  )

cat("Selected dataset dimensions:\n")

## Selected dataset dimensions:

cat("  Rows:", nrow(brfss_selected), "\n")

##   Rows: 433323

cat("  Columns:", ncol(brfss_selected), "\n")

##   Columns: 12

Predictor Selection Justification: All 11 available predictors were selected to build the most comprehensive maximum model possible. Including all candidate predictors in the maximum model allows the formal selection procedures to objectively determine which subset best predicts the outcome.

2 Step 2: Recode All Variables

brfss_clean <- brfss_selected |>
  # Recode outcome: physhlth_days
  mutate(
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    )
  ) |>
  
  # Recode menthlth_days (continuous)
  mutate(
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    )
  ) |>
  
  # Recode bmi (continuous): divide by 100; 9999 -> NA
  mutate(
    bmi = case_when(
      `_BMI5` >= 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    )
  ) |>
  
  # Recode sex (binary): 1 = "Male", 2 = "Female"
  mutate(
    sex = factor(
      case_when(
        SEXVAR == 1 ~ "Male",
        SEXVAR == 2 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    )
  ) |>
  
  # Recode exercise (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
  mutate(
    exercise = factor(
      case_when(
        EXERANY2 == 1 ~ "Yes",
        EXERANY2 == 2 ~ "No",
        EXERANY2 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    )
  ) |>
  
  # Recode depression (binary): 1 = "Yes", 2 = "No"; 7, 9 -> NA
  mutate(
    depression = factor(
      case_when(
        ADDEPEV3 == 1 ~ "Yes",
        ADDEPEV3 == 2 ~ "No",
        ADDEPEV3 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    )
  ) |>
  
  # Recode age_group (categorical): collapse 5-year groups
  mutate(
    age_group = factor(
      case_when(
        `_AGEG5YR` %in% 1:3 ~ "18-34",
        `_AGEG5YR` %in% 4:6 ~ "35-49",
        `_AGEG5YR` %in% 7:9 ~ "50-64",
        `_AGEG5YR` %in% 10:13 ~ "65+",
        `_AGEG5YR` == 14 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("18-34", "35-49", "50-64", "65+")
    )
  ) |>
  
  # Recode income (categorical): collapse to 4 groups
  mutate(
    income = factor(
      case_when(
        `_INCOMG1` %in% 1:2 ~ "Less than $25K",
        `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
        `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
        `_INCOMG1` == 7 ~ "$100K+",
        `_INCOMG1` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
    )
  ) |>
  
  # Recode education (categorical)
  mutate(
    education = factor(
      case_when(
        EDUCA %in% 1:3 ~ "Less than HS",
        EDUCA == 4 ~ "HS/GED",
        EDUCA == 5 ~ "Some college",
        EDUCA == 6 ~ "College graduate",
        EDUCA == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
    )
  ) |>
  
  # Recode smoking (categorical): collapse to 3 groups
  mutate(
    smoking = factor(
      case_when(
        `_SMOKER3` %in% 1:2 ~ "Current",
        `_SMOKER3` == 3 ~ "Former",
        `_SMOKER3` == 4 ~ "Never",
        `_SMOKER3` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Never", "Current", "Former")
    )
  ) |>
  
  # Recode gen_health (categorical): collapse to 3 groups
  mutate(
    gen_health = factor(
      case_when(
        GENHLTH %in% 1:2 ~ "Excellent/Very Good",
        GENHLTH == 3 ~ "Good",
        GENHLTH %in% 4:5 ~ "Fair/Poor",
        GENHLTH %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Excellent/Very Good", "Good", "Fair/Poor")
    )
  ) |>
  
  # Recode marital (categorical): collapse to 3 groups
  mutate(
    marital = factor(
      case_when(
        MARITAL %in% c(1, 6) ~ "Married/Partnered",
        MARITAL %in% 2:4 ~ "Previously married",
        MARITAL == 5 ~ "Never married",
        MARITAL == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Married/Partnered", "Previously married", "Never married")
    )
  ) |>
  
  # Select only recoded variables
  select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
         age_group, income, education, smoking, gen_health, marital)

3 Step 3: Create the Analytic Dataset

# Report missingness before removing missing data
cat("=== Missing Data Report (Before Removal) ===\n\n")

## === Missing Data Report (Before Removal) ===

miss_report <- brfss_clean |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") |>
  mutate(
    Total = nrow(brfss_clean),
    Pct_Missing = round(N_Missing / Total * 100, 1)
  )

miss_report |>
  kable(
    caption = "Missing Data Summary Before Listwise Deletion",
    col.names = c("Variable", "N Missing", "Total N", "% Missing"),
    align = c("l", "r", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Missing Data Summary Before Listwise Deletion
Variable	N Missing	Total N	% Missing
physhlth_days	10785	433323	2.5
menthlth_days	8108	433323	1.9
bmi	40535	433323	9.4
sex	0	433323	0.0
exercise	1251	433323	0.3
depression	2587	433323	0.6
age_group	7779	433323	1.8
income	86623	433323	20.0
education	2325	433323	0.5
smoking	23062	433323	5.3
gen_health	1262	433323	0.3
marital	4289	433323	1.0

# Remove missing values and draw sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

cat("Final analytic sample size:", nrow(brfss_analytic), "\n")

## Final analytic sample size: 8000

cat("Number of variables:", ncol(brfss_analytic), "\n")

## Number of variables: 12

# Descriptive summary table
brfss_analytic |>
  tbl_summary(
    label = list(
      physhlth_days ~ "Physically Unhealthy Days (past 30)",
      menthlth_days ~ "Mentally Unhealthy Days (past 30)",
      bmi ~ "Body Mass Index",
      sex ~ "Sex Assigned at Birth",
      exercise ~ "Any Exercise (past 30 days)",
      depression ~ "Ever Told Depressive Disorder",
      age_group ~ "Age Group",
      income ~ "Annual Household Income",
      education ~ "Education Level",
      smoking ~ "Smoking Status",
      gen_health ~ "General Health Status",
      marital ~ "Marital Status"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd}); Median: {median} [{p25}, {p75}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(all_continuous() ~ 1),
    missing = "no"
  ) |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**")

**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**
Characteristic	N = 8,000¹
Physically Unhealthy Days (past 30)	4.3 (8.7); Median: 0.0 [0.0, 4.0]
Mentally Unhealthy Days (past 30)	4.3 (8.2); Median: 0.0 [0.0, 5.0]
Body Mass Index	28.7 (6.5); Median: 27.5 [24.3, 31.8]
Sex Assigned at Birth
Male	3,971 (50%)
Female	4,029 (50%)
Any Exercise (past 30 days)	6,157 (77%)
Ever Told Depressive Disorder	1,776 (22%)
Age Group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
Annual Household Income
Less than $25K	1,090 (14%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
$100K+	682 (8.5%)
Education Level
Less than HS	391 (4.9%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
College graduate	3,607 (45%)
Smoking Status
Never	4,808 (60%)
Current	962 (12%)
Former	2,230 (28%)
General Health Status
Excellent/Very Good	3,926 (49%)
Good	2,648 (33%)
Fair/Poor	1,426 (18%)
Marital Status
Married/Partnered	4,582 (57%)
Previously married	2,050 (26%)
Never married	1,368 (17%)
¹ Mean (SD); Median: Median [Q1, Q3]; n (%)

4 Part 1: Model Selection for Linear Regression

4.1 Step 1: Fit the Maximum Model

# Fit the maximum linear regression model with all 11 predictors
mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

# Display model summary
summary(mod_max)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

# Report model fit criteria
r2     <- summary(mod_max)$r.squared
adj_r2 <- summary(mod_max)$adj.r.squared
aic    <- AIC(mod_max)
bic    <- BIC(mod_max)

tibble(
  Criterion        = c("R-squared", "Adjusted R-squared", "AIC", "BIC"),
  Value            = round(c(r2, adj_r2, aic, bic), 3)
) |>
  kable(
    caption = "Model Fit Criteria for Maximum Linear Regression Model",
    align = c("l", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model Fit Criteria for Maximum Linear Regression Model
Criterion	Value
R-squared	0.326
Adjusted R-squared	0.324
AIC	54115.053
BIC	54268.772

Interpretation: The full model including all 11 predictors explains 32.6% of the variance in physically unhealthy days (R² = 0.326). The adjusted R² of 0.324, which accounts for model complexity, suggests that the predictors collectively provide meaningful explanatory power beyond chance. The AIC (54115.053) and BIC (54268.772) serve as benchmarks for comparing alternative, more parsimonious models, with lower values indicating a better balance between model fit and complexity.

4.2 Step 2: Best Subsets Regression

# Run best subsets regression

best_sub <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  nvmax  = 25,   # generous upper bound to capture all dummy terms
  really.big = TRUE
)

best_sub_summary <- summary(best_sub)

# Summary table of Adjusted R-squared and BIC by model size
best_sub_df <- tibble(
  `Model Size`       = seq_along(best_sub_summary$adjr2),
  `Adjusted R-sq`    = round(best_sub_summary$adjr2, 4),
  `BIC`              = round(best_sub_summary$bic, 2)
) |>
  mutate(
    `Best Adj R-sq` = ifelse(`Adjusted R-sq` == max(`Adjusted R-sq`), "★", ""),
    `Best BIC`      = ifelse(`BIC` == min(`BIC`), "★", "")
  )

best_sub_df |>
  kable(
    caption = "Best Subsets Regression: Adjusted R-squared and BIC by Model Size",
    align = c("c", "r", "r", "c", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  row_spec(which(best_sub_summary$adjr2 == max(best_sub_summary$adjr2)),
           bold = TRUE, color = "white", background = "purple") |>
  row_spec(which(best_sub_summary$bic == min(best_sub_summary$bic)),
           bold = TRUE, color = "white", background = "pink")

Best Subsets Regression: Adjusted R-squared and BIC by Model Size
Model Size	Adjusted R-sq	BIC	Best Adj R-sq	Best BIC
1	0.2627	-2420.70
2	0.2956	-2778.12
3	0.3074	-2905.97
4	0.3133	-2966.10
5	0.3160	-2989.73
6	0.3183	-3007.98
7	0.3197	-3016.83
8	0.3208	-3021.26
9	0.3215	-3021.91		★
10	0.3221	-3021.17
11	0.3225	-3018.29
12	0.3231	-3016.99
13	0.3235	-3013.96
14	0.3237	-3008.54
15	0.3239	-3002.54
16	0.3240	-2995.63
17	0.3241	-2988.57
18	0.3241	-2981.41
19	0.3242	-2974.06	★
20	0.3242	-2965.61	★

# Plot Adjusted R-squared and BIC across model sizes
par(mfrow = c(1, 2))

# Adjusted R-squared plot
plot(
  seq_along(best_sub_summary$adjr2),
  best_sub_summary$adjr2,
  type = "b", pch = 19, col = "blue",
  xlab = "Number of Predictors",
  ylab = "Adjusted R-squared",
  main = "Best Subsets: Adjusted R-squared"
)
best_adjr2_size <- which.max(best_sub_summary$adjr2)
points(best_adjr2_size, best_sub_summary$adjr2[best_adjr2_size],
       col = "orange", pch = 8, cex = 2)

# BIC plot
plot(
  seq_along(best_sub_summary$bic),
  best_sub_summary$bic,
  type = "b", pch = 19, col = "green",
  xlab = "Number of Predictors",
  ylab = "BIC",
  main = "Best Subsets: BIC"
)
best_bic_size <- which.min(best_sub_summary$bic)
points(best_bic_size, best_sub_summary$bic[best_bic_size],
       col = "red", pch = 8, cex = 2)

par(mfrow = c(1, 1))

cat("Model size maximizing Adjusted R-squared:", which.max(best_sub_summary$adjr2), "\n")

## Model size maximizing Adjusted R-squared: 19

cat("Adjusted R-squared value:", round(max(best_sub_summary$adjr2), 4), "\n\n")

## Adjusted R-squared value: 0.3242

cat("Model size minimizing BIC:", which.min(best_sub_summary$bic), "\n")

## Model size minimizing BIC: 9

cat("BIC value:", round(min(best_sub_summary$bic), 2), "\n\n")

## BIC value: -3021.91

# Show which variables are in the best models
cat("Variables in best Adj-R2 model (size",
    which.max(best_sub_summary$adjr2), "):\n")

## Variables in best Adj-R2 model (size 19 ):

print(coef(best_sub, which.max(best_sub_summary$adjr2)))

##               (Intercept)             menthlth_days                       bmi 
##                2.42326705                0.18384599               -0.01869124 
##                 sexFemale               exerciseYes             depressionYes 
##                0.23712114               -1.92576399                0.77637429 
##            age_group35-49            age_group50-64              age_group65+ 
##                0.76294842                1.39939197                1.73028710 
##           income$25K-$49K           income$50K-$99K              income$100K+ 
##               -1.07158786               -1.50068492               -1.29329261 
##     educationSome college educationCollege graduate            smokingCurrent 
##                0.75452784                0.80071745                0.46323888 
##             smokingFormer            gen_healthGood       gen_healthFair/Poor 
##                0.43731142                1.40757016               10.05466396 
## maritalPreviously married      maritalNever married 
##               -0.26397878               -0.40454371

cat("\nVariables in best BIC model (size",
    which.min(best_sub_summary$bic), "):\n")

## 
## Variables in best BIC model (size 9 ):

print(coef(best_sub, which.min(best_sub_summary$bic)))

##         (Intercept)       menthlth_days         exerciseYes       depressionYes 
##           1.4036767           0.1881763          -1.8794631           0.8954395 
##      age_group35-49      age_group50-64        age_group65+     income$50K-$99K 
##           1.0035144           1.6583700           2.0548414          -0.5080191 
##      gen_healthGood gen_healthFair/Poor 
##           1.3542065          10.0640869

4.3 Interpretation: The two criteria identify different optimal model sizes. Adjusted R-squared is maximized at a larger model size of r which.max(best_sub_summary$adjr2) predictors, indicating that adding variables continues to provide small improvements in model fit even after accounting for complexity. In contrast, BIC—which imposes a stronger penalty for model complexity, particularly with a large sample size (N ≈ 8,000)—selects a more parsimonious model at r which.min(best_sub_summary$bic) predictors. This reflects BIC’s tendency to favor simpler models when additional predictors do not substantially improve fit.

4.4 Step 3: Stepwise Selection Methods

# Null model (intercept only) for forward/stepwise starting point
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Backward elimination: start from maximum model
cat("=== BACKWARD ELIMINATION ===\n")

## === BACKWARD ELIMINATION ===

mod_backward <- step(
  mod_max,
  direction = "backward",
  trace     = 1
)

## Start:  AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
## 
##                 Df Sum of Sq    RSS   AIC
## - marital        2       180 403789 31410
## <none>                       403609 31410
## - sex            1       104 403712 31410
## - bmi            1       108 403716 31410
## - smoking        2       347 403956 31413
## - depression     1       656 404265 31421
## - education      3       876 404485 31421
## - income         3      1550 405158 31435
## - age_group      3      2235 405844 31448
## - exercise       1      4537 408146 31497
## - menthlth_days  1     13230 416839 31666
## - gen_health     2     84670 488279 32930
## 
## Step:  AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - bmi            1        98 403887 31410
## <none>                       403789 31410
## - sex            1       102 403891 31410
## - smoking        2       345 404134 31412
## - depression     1       637 404427 31420
## - education      3       875 404665 31421
## - income         3      1388 405177 31431
## - age_group      3      3048 406837 31464
## - exercise       1      4538 408327 31497
## - menthlth_days  1     13195 416984 31665
## - gen_health     2     84707 488496 32929
## 
## Step:  AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

cat("\nFinal model variables (backward):\n")

## 
## Final model variables (backward):

print(names(coef(mod_backward)))

##  [1] "(Intercept)"               "menthlth_days"            
##  [3] "sexFemale"                 "exerciseYes"              
##  [5] "depressionYes"             "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "educationHS/GED"          
## [13] "educationSome college"     "educationCollege graduate"
## [15] "smokingCurrent"            "smokingFormer"            
## [17] "gen_healthGood"            "gen_healthFair/Poor"

# Forward selection: start from null, max model as upper scope
cat("=== FORWARD SELECTION ===\n")

## === FORWARD SELECTION ===

mod_forward <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "forward",
  trace     = 1
)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1   17940.1 416725 31632
## + exercise       1    6466.7 428198 31849
## + depression     1    6070.1 428595 31857
## + income         3    3231.0 431434 31913
## + smoking        2    1776.3 432889 31938
## + age_group      3    1535.7 433129 31945
## + sex            1    1254.2 433411 31946
## + marital        2     830.3 433835 31956
## + education      3     415.5 434250 31965
## <none>                       434665 31967
## + bmi            1       0.2 434665 31969
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##              Df Sum of Sq    RSS   AIC
## + exercise    1    5820.8 410904 31521
## + age_group   3    4661.5 412064 31548
## + income      3    1987.0 414738 31600
## + marital     2    1402.5 415322 31609
## + smoking     2    1034.6 415690 31616
## + depression  1     738.1 415987 31620
## + sex         1     605.2 416120 31622
## + education   3     318.9 416406 31632
## <none>                    416725 31632
## + bmi         1       3.6 416721 31634
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##              Df Sum of Sq    RSS   AIC
## + age_group   3    3720.5 407184 31455
## + income      3    1198.4 409706 31504
## + marital     2    1064.8 409839 31505
## + depression  1     739.1 410165 31509
## + smoking     2     706.2 410198 31512
## + education   3     686.9 410217 31514
## + sex         1     449.3 410455 31515
## <none>                    410904 31521
## + bmi         1      82.9 410821 31522
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##              Df Sum of Sq    RSS   AIC
## + income      3   1162.69 406021 31438
## + depression  1    932.70 406251 31438
## + education   3    501.69 406682 31451
## + sex         1    261.03 406923 31452
## + smoking     2    360.51 406823 31452
## <none>                    407184 31455
## + bmi         1     83.59 407100 31455
## + marital     2     40.11 407144 31458
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##              Df Sum of Sq    RSS   AIC
## + depression  1    861.57 405159 31423
## + education   3    953.62 405067 31425
## + smoking     2    305.02 405716 31436
## + sex         1    203.01 405818 31436
## <none>                    406021 31438
## + bmi         1     74.91 405946 31438
## + marital     2    153.70 405867 31439
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    837.21 404322 31412
## + smoking    2    259.11 404900 31422
## + sex        1    110.16 405049 31422
## + bmi        1    105.12 405054 31423
## <none>                   405159 31423
## + marital    2    169.72 404990 31423
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##           Df Sum of Sq    RSS   AIC
## + smoking  2    332.18 403990 31410
## + bmi      1    110.19 404212 31412
## <none>                 404322 31412
## + sex      1     77.87 404244 31413
## + marital  2    168.26 404154 31413
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##           Df Sum of Sq    RSS   AIC
## + sex      1    102.92 403887 31410
## <none>                 403990 31410
## + bmi      1     98.80 403891 31410
## + marital  2    169.54 403820 31410
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##           Df Sum of Sq    RSS   AIC
## <none>                 403887 31410
## + bmi      1    97.861 403789 31410
## + marital  2   170.601 403716 31410

cat("\nFinal model variables (forward):\n")

## 
## Final model variables (forward):

print(names(coef(mod_forward)))

##  [1] "(Intercept)"               "gen_healthGood"           
##  [3] "gen_healthFair/Poor"       "menthlth_days"            
##  [5] "exerciseYes"               "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "depressionYes"            
## [13] "educationHS/GED"           "educationSome college"    
## [15] "educationCollege graduate" "smokingCurrent"           
## [17] "smokingFormer"             "sexFemale"

# Stepwise selection: start from null, both directions
cat("=== STEPWISE SELECTION ===\n")

## === STEPWISE SELECTION ===

mod_stepwise <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "both",
  trace     = 1
)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     17940 416725 31632
## + exercise       1      6467 428198 31849
## + depression     1      6070 428595 31857
## + income         3      3231 431434 31913
## + smoking        2      1776 432889 31938
## + age_group      3      1536 433129 31945
## + sex            1      1254 433411 31946
## + marital        2       830 433835 31956
## + education      3       416 434250 31965
## <none>                       434665 31967
## + bmi            1         0 434665 31969
## - gen_health     2    164022 598687 34524
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      5821 410904 31521
## + age_group      3      4661 412064 31548
## + income         3      1987 414738 31600
## + marital        2      1402 415322 31609
## + smoking        2      1035 415690 31616
## + depression     1       738 415987 31620
## + sex            1       605 416120 31622
## + education      3       319 416406 31632
## <none>                       416725 31632
## + bmi            1         4 416721 31634
## - menthlth_days  1     17940 434665 31967
## - gen_health     2    121660 538384 33677
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      3720 407184 31455
## + income         3      1198 409706 31504
## + marital        2      1065 409839 31505
## + depression     1       739 410165 31509
## + smoking        2       706 410198 31512
## + education      3       687 410217 31514
## + sex            1       449 410455 31515
## <none>                       410904 31521
## + bmi            1        83 410821 31522
## - exercise       1      5821 416725 31632
## - menthlth_days  1     17294 428198 31849
## - gen_health     2    101805 512709 33288
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3      1163 406021 31438
## + depression     1       933 406251 31438
## + education      3       502 406682 31451
## + sex            1       261 406923 31451
## + smoking        2       361 406823 31451
## <none>                       407184 31455
## + bmi            1        84 407100 31455
## + marital        2        40 407144 31458
## - age_group      3      3720 410904 31521
## - exercise       1      4880 412064 31548
## - menthlth_days  1     19957 427141 31835
## - gen_health     2     94875 502059 33126
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       862 405159 31423
## + education      3       954 405067 31425
## + smoking        2       305 405716 31436
## + sex            1       203 405818 31436
## <none>                       406021 31438
## + bmi            1        75 405946 31438
## + marital        2       154 405867 31439
## - income         3      1163 407184 31455
## - age_group      3      3685 409706 31504
## - exercise       1      4214 410235 31518
## - menthlth_days  1     18945 424966 31801
## - gen_health     2     87509 493530 32995
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       837 404322 31412
## + smoking        2       259 404900 31422
## + sex            1       110 405049 31423
## + bmi            1       105 405054 31423
## <none>                       405159 31423
## + marital        2       170 404990 31423
## - depression     1       862 406021 31438
## - income         3      1092 406251 31438
## - age_group      3      3871 409030 31493
## - exercise       1      4219 409378 31504
## - menthlth_days  1     13674 418834 31686
## - gen_health     2     86610 491770 32969
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2       332 403990 31410
## + bmi            1       110 404212 31412
## <none>                       404322 31412
## + sex            1        78 404244 31413
## + marital        2       168 404154 31413
## - education      3       837 405159 31423
## - depression     1       745 405067 31425
## - income         3      1495 405818 31436
## - age_group      3      3558 407880 31476
## - exercise       1      4604 408926 31501
## - menthlth_days  1     13607 417929 31675
## - gen_health     2     86813 491135 32964
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + sex            1       103 403887 31410
## <none>                       403990 31410
## + bmi            1        99 403891 31410
## + marital        2       170 403820 31410
## - smoking        2       332 404322 31412
## - depression     1       691 404681 31421
## - education      3       910 404900 31422
## - income         3      1457 405447 31432
## - age_group      3      3178 407168 31466
## - exercise       1      4502 408492 31496
## - menthlth_days  1     13347 417337 31668
## - gen_health     2     85552 489542 32942
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## + bmi            1        98 403789 31410
## + marital        2       171 403716 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

cat("\nFinal model variables (stepwise):\n")

## 
## Final model variables (stepwise):

print(names(coef(mod_stepwise)))

##  [1] "(Intercept)"               "gen_healthGood"           
##  [3] "gen_healthFair/Poor"       "menthlth_days"            
##  [5] "exerciseYes"               "age_group35-49"           
##  [7] "age_group50-64"            "age_group65+"             
##  [9] "income$25K-$49K"           "income$50K-$99K"          
## [11] "income$100K+"              "depressionYes"            
## [13] "educationHS/GED"           "educationSome college"    
## [15] "educationCollege graduate" "smokingCurrent"           
## [17] "smokingFormer"             "sexFemale"

# Compare final variables across methods
backward_vars  <- names(coef(mod_backward))[-1]  
forward_vars   <- names(coef(mod_forward))[-1]
stepwise_vars  <- names(coef(mod_stepwise))[-1]

# All unique predictor terms (from max model)
all_terms <- names(coef(mod_max))[-1]

comparison_df <- tibble(
  Term        = all_terms,
  Backward    = ifelse(all_terms %in% backward_vars,  "✓", "✗"),
  Forward     = ifelse(all_terms %in% forward_vars,   "✓", "✗"),
  Stepwise    = ifelse(all_terms %in% stepwise_vars,  "✓", "✗")
)

comparison_df |>
  kable(
    caption = "Variable Across Stepwise Selection Methods",
    align = c("l", "c", "c", "c")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Variable Across Stepwise Selection Methods
Term	Backward	Forward	Stepwise
menthlth_days	✓	✓	✓
bmi	✗	✗	✗
sexFemale	✓	✓	✓
exerciseYes	✓	✓	✓
depressionYes	✓	✓	✓
age_group35-49	✓	✓	✓
age_group50-64	✓	✓	✓
age_group65+	✓	✓	✓
income$25K-$49K	✓	✓	✓
income$50K-$99K	✓	✓	✓
income$100K+	✓	✓	✓
educationHS/GED	✓	✓	✓
educationSome college	✓	✓	✓
educationCollege graduate	✓	✓	✓
smokingCurrent	✓	✓	✓
smokingFormer	✓	✓	✓
gen_healthGood	✓	✓	✓
gen_healthFair/Poor	✓	✓	✓
maritalPreviously married	✗	✗	✗
maritalNever married	✗	✗	✗

# AIC comparison across final models
aic_compare <- tibble(
  Method     = c("Maximum Model", "Backward", "Forward", "Stepwise"),
  Predictors = c(length(names(coef(mod_max))),
                 length(names(coef(mod_backward))),
                 length(names(coef(mod_forward))),
                 length(names(coef(mod_stepwise)))),
  AIC        = round(c(AIC(mod_max), AIC(mod_backward),
                       AIC(mod_forward), AIC(mod_stepwise)), 2),
  AdjR2      = round(c(summary(mod_max)$adj.r.squared,
                       summary(mod_backward)$adj.r.squared,
                       summary(mod_forward)$adj.r.squared,
                       summary(mod_stepwise)$adj.r.squared), 4)
)

aic_compare |>
  kable(
    caption = "Comparison of Model Fit Criteria Across Selection Methods",
    col.names = c("Method", "# Parameters", "AIC", "Adjusted R²"),
    align = c("l", "c", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Comparison of Model Fit Criteria Across Selection Methods
Method	# Parameters	AIC	Adjusted R²
Maximum Model	21	54115.05	0.3242
Backward	18	54114.57	0.3239
Forward	18	54114.57	0.3239
Stepwise	18	54114.57	0.3239

Comparison of Stepwise Methods: All three selection methods—backward elimination, forward selection, and stepwise—converge on the same final model, retaining all predictors from the maximum model. This consistency suggests that, under the AIC criterion, removing any variable does not sufficiently improve model fit to justify a simpler specification. The agreement across methods is not unexpected given the large sample size (N = 8,000), which provides high statistical power to detect even small associations; as a result, AIC-based procedures tend to retain variables that may have modest effects. However, the inclusion of all predictors does not necessarily imply that each contributes meaningfully in a practical sense. Rather, it indicates that each variable adds some incremental predictive value relative to the others, even if the magnitude of that contribution is small.

4.5 Step 4: Final Model Selection and Interpretation

Final model selection: All three stepwise procedures retained the same predictors as the maximum model, indicating that under the AIC criterion, the full 11-predictor model provides the best balance of fit and complexity. While BIC favored a smaller model in the best subsets approach—due to its stronger penalty for model complexity, especially with a large sample size—the stepwise methods, which treat categorical variables as grouped factors, consistently supported retaining all predictors.

Given the agreement across stepwise procedures, the theoretical relevance of the included variables, and the acceptable adjusted R², the full model is a reasonable choice. However, it is important to note that a more parsimonious model (as suggested by BIC) may offer similar predictive performance with greater simplicity.

# Final model = maximum model 
mod_final <- mod_max

# Display coefficient table
tidy(mod_final, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 3))) |>
  kable(
    caption = "Final Linear Regression Model: Coefficient Table (N = 8,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper"),
    align = c("l", rep("r", 6))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  scroll_box(width = "100%")

Final Linear Regression Model: Coefficient Table (N = 8,000)
Term	Estimate	Std. Error	t-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	2.201	0.627	3.508	0.000	0.971	3.431
menthlth_days	0.184	0.011	16.172	0.000	0.161	0.206
bmi	-0.019	0.013	-1.459	0.145	-0.044	0.006
sexFemale	0.234	0.164	1.431	0.153	-0.087	0.556
exerciseYes	-1.933	0.204	-9.471	0.000	-2.333	-1.533
depressionYes	0.775	0.215	3.602	0.000	0.353	1.197
age_group35-49	0.769	0.284	2.712	0.007	0.213	1.325
age_group50-64	1.401	0.279	5.026	0.000	0.855	1.947
age_group65+	1.727	0.279	6.202	0.000	1.181	2.273
income$25K-$49K	-1.084	0.276	-3.925	0.000	-1.626	-0.543
income$50K-$99K	-1.524	0.276	-5.520	0.000	-2.065	-0.983
income$100K+	-1.316	0.398	-3.304	0.001	-2.096	-0.535
educationHS/GED	0.294	0.401	0.733	0.464	-0.492	1.080
educationSome college	1.002	0.403	2.488	0.013	0.213	1.791
educationCollege graduate	1.053	0.404	2.603	0.009	0.260	1.845
smokingCurrent	0.464	0.266	1.744	0.081	-0.058	0.986
smokingFormer	0.438	0.188	2.332	0.020	0.070	0.806
gen_healthGood	1.410	0.186	7.565	0.000	1.045	1.775
gen_healthFair/Poor	10.065	0.252	39.879	0.000	9.570	10.560
maritalPreviously married	-0.267	0.207	-1.291	0.197	-0.672	0.138
maritalNever married	-0.412	0.249	-1.656	0.098	-0.901	0.076

Interpretation

The final model identifies several key predictors of physically unhealthy days. Mental health days (β = 0.184, p < 0.001), depression (β = 0.775, p < 0.001), and older age groups are positively associated with more unhealthy days, while exercise shows a significant protective effect (β = -1.933, p < 0.001). Higher income is associated with fewer unhealthy days.

General health is the strongest predictor, with individuals reporting fair/poor health experiencing substantially more unhealthy days (β = 10.065, p < 0.001).

4.6 Variables such as BMI, sex, education (HS/GED), and marital status are not statistically significant, suggesting limited independent contribution after adjustment. Overall, mental health, general health, age, income, and exercise are the most influential factors in the model.

4.7 Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(mod_final, which = 1:4,
     sub.caption = "Diagnostic Plots: Final Linear Regression Model")

par(mfrow = c(1, 1))

Interpretation of Diagnostic Plots:

Residuals vs. Fitted: There is a non-linear pattern, with residuals showing a fan-shaped spread that widens at higher fitted values. This suggests both non-linearity and heteroscedasticity (non-constant variance), which is a common finding with count-like outcome variables bounded at zero.
Normal Q-Q: The residuals deviate substantially from the diagonal reference line, particularly in the right tail, indicating a right-skewed residual distribution.
Scale-Location: The smoothed line is not flat, it increases across fitted values, confirming that residual variance grows with the mean (heteroscedasticity). The assumption of equal variance (homoscedasticity) appears violated.
Residuals vs. Leverage: Most observations cluster near the origin. A small number of points have higher Cook’s distance suggesting no single observation is unduly driving the model estimates.

5 Part 2: Logistic Regression

5.1 Step 1: Create the Binary Outcome

# Create frequent_distress: 1 if physhlth_days >= 14, 0 otherwise
brfss_analytic <- brfss_analytic |>
  mutate(
    frequent_distress = factor(
      ifelse(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")   # "No" = reference level
    )
  )

# Report prevalence
distress_table <- brfss_analytic |>
  count(frequent_distress) |>
  mutate(Percentage = round(n / sum(n) * 100, 1))

distress_table |>
  kable(
    caption = "Prevalence of Frequent Physical Distress (≥14 days in past 30)",
    col.names = c("Frequent Physical Distress", "N", "Percentage (%)"),
    align = c("l", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Prevalence of Frequent Physical Distress (≥14 days in past 30)
Frequent Physical Distress	N	Percentage (%)
No	6948	86.9
Yes	1052	13.2

Comment on outcome balance: The outcome shows moderate class imbalance: 13.2% of the sample meets the threshold for frequent physical distress (≥14 days), while 86.9% does not. This prevalence is typical for population-based health data and is unlikely to compromise estimation in standard logistic regression, so specialized remedies (e.g., oversampling or class weighting) are not necessary. However, because the minority class is of primary substantive interest, model evaluation should emphasize metrics that reflect performance for that group—such as sensitivity, precision, and the AUC—rather than relying on overall accuracy alone.

5.2 Step 2: Simple (Unadjusted) Logistic Regression

Predictor chosen: gen_health (general health status): as a key predictor because it showed the largest coefficient magnitude in the linear regression, indicating the strongest unadjusted association with physically unhealthy days. Substantively, this aligns with expectations: individuals reporting Fair or Poor health are more likely to experience sustained physical distress. Thus, gen_health is both empirically influential and theoretically well-justified as a predictor of the binary frequent distress outcome (≥14 days).

# Fit simple logistic regression
mod_simple <- glm(
  frequent_distress ~ gen_health,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios and 95% CIs
or_simple  <- exp(coef(mod_simple))
ci_simple  <- exp(confint(mod_simple))

tibble(
  Term         = names(or_simple),
  `Odds Ratio` = round(or_simple, 3),
  `95% CI Lower` = round(ci_simple[, 1], 3),
  `95% CI Upper` = round(ci_simple[, 2], 3)
) |>
  kable(
    caption = "Simple Logistic Regression: Odds Ratios for General Health Status",
    align = c("l", "r", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Simple Logistic Regression: Odds Ratios for General Health Status
Term	Odds Ratio	95% CI Lower	95% CI Upper
(Intercept)	0.033	0.028	0.040
gen_healthGood	3.132	2.521	3.911
gen_healthFair/Poor	26.811	21.915	33.038

Interpretation:

Interpretation of odds ratios Intercept (OR = 0.033, 95% CI: 0.028–0.040) This is the baseline odds of frequent physical distress for people reporting Excellent/Very good health. An odds of 0.033 corresponds to a very low probability (roughly ~3%), indicating that frequent distress is uncommon in this healthiest group. Good health (OR = 3.13, 95% CI: 2.52–3.91) Individuals reporting Good health have about 3.1 times higher odds of frequent physical distress compared to those in Excellent/Very good health. The confidence interval does not include 1, so this is a statistically significant increase. Fair/Poor health (OR = 26.81, 95% CI: 21.92–33.04) Individuals reporting Fair or Poor health have about 27 times higher odds of frequent physical distress compared to the reference group. This is a very large effect size, and the tight confidence interval indicates a strong and precise association.

5.3 Step 3: Multiple Logistic Regression

# Fit multiple logistic regression using same predictors as final linear model
mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex + 
##     exercise + depression + age_group + income + education + 
##     smoking + gen_health + marital, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.639413   0.291867 -12.469  < 2e-16 ***
## menthlth_days              0.044787   0.004358  10.277  < 2e-16 ***
## bmi                       -0.004822   0.005556  -0.868  0.38546    
## sexFemale                  0.176389   0.081382   2.167  0.03020 *  
## exerciseYes               -0.586264   0.085768  -6.835 8.17e-12 ***
## depressionYes              0.267888   0.095954   2.792  0.00524 ** 
## age_group35-49             0.512672   0.162401   3.157  0.00160 ** 
## age_group50-64             0.767269   0.154816   4.956 7.20e-07 ***
## age_group65+               0.964882   0.155220   6.216 5.09e-10 ***
## income$25K-$49K           -0.215375   0.111712  -1.928  0.05386 .  
## income$50K-$99K           -0.501752   0.118036  -4.251 2.13e-05 ***
## income$100K+              -0.541456   0.227454  -2.381  0.01729 *  
## educationHS/GED            0.049334   0.165069   0.299  0.76504    
## educationSome college      0.331979   0.165058   2.011  0.04430 *  
## educationCollege graduate  0.283640   0.171335   1.655  0.09783 .  
## smokingCurrent             0.208048   0.116374   1.788  0.07381 .  
## smokingFormer              0.200388   0.089714   2.234  0.02551 *  
## gen_healthGood             0.895044   0.116270   7.698 1.38e-14 ***
## gen_healthFair/Poor        2.696530   0.115309  23.385  < 2e-16 ***
## maritalPreviously married -0.138611   0.096205  -1.441  0.14965    
## maritalNever married      -0.143542   0.127789  -1.123  0.26132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4420.7  on 7979  degrees of freedom
## AIC: 4462.7
## 
## Number of Fisher Scoring iterations: 6

# Adjusted odds ratios with 95% CIs
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 3))) |>
  kable(
    caption = "Multiple Logistic Regression: Adjusted Odds Ratios (N = 8,000)",
    col.names = c("Term", "Adj. OR", "Std. Error", "z-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper"),
    align = c("l", rep("r", 6))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  scroll_box(width = "100%")

Multiple Logistic Regression: Adjusted Odds Ratios (N = 8,000)
Term	Adj. OR	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.026	0.292	-12.469	0.000	0.015	0.046
menthlth_days	1.046	0.004	10.277	0.000	1.037	1.055
bmi	0.995	0.006	-0.868	0.385	0.984	1.006
sexFemale	1.193	0.081	2.167	0.030	1.017	1.400
exerciseYes	0.556	0.086	-6.835	0.000	0.470	0.658
depressionYes	1.307	0.096	2.792	0.005	1.082	1.576
age_group35-49	1.670	0.162	3.157	0.002	1.217	2.302
age_group50-64	2.154	0.155	4.956	0.000	1.596	2.929
age_group65+	2.624	0.155	6.216	0.000	1.944	3.573
income$25K-$49K	0.806	0.112	-1.928	0.054	0.648	1.004
income$50K-$99K	0.605	0.118	-4.251	0.000	0.481	0.763
income$100K+	0.582	0.227	-2.381	0.017	0.368	0.899
educationHS/GED	1.051	0.165	0.299	0.765	0.762	1.456
educationSome college	1.394	0.165	2.011	0.044	1.011	1.932
educationCollege graduate	1.328	0.171	1.655	0.098	0.952	1.864
smokingCurrent	1.231	0.116	1.788	0.074	0.979	1.545
smokingFormer	1.222	0.090	2.234	0.026	1.024	1.456
gen_healthGood	2.447	0.116	7.698	0.000	1.952	3.081
gen_healthFair/Poor	14.828	0.115	23.385	0.000	11.863	18.648
maritalPreviously married	0.871	0.096	-1.441	0.150	0.720	1.050
maritalNever married	0.866	0.128	-1.123	0.261	0.673	1.111

Interpretation of Adjusted Odds Ratios:

After adjustment, self-rated general health is by far the strongest predictor of frequent physical distress, with a clear gradient (Fair/Poor: OR ≈ 14.8; Good: OR ≈ 2.4 vs Excellent/Very good). Mental health burden (more poor mental health days) and depression increase odds, while exercise is strongly protective (~44% lower odds). There are also age and income gradients (older age ↑ risk; higher income ↓ risk). Sex (female) shows a small increase in odds. BMI and marital status are not significant. Overall, distress is driven primarily by general health, mental health, and lifestyle factors.

5.4 Step 4: Likelihood Ratio Test

Testing the income group of predictors: For this, I will collectively test all income level dummy variables (i.e., income$25K-$49K, income$50K-$99K, and income$100K+) to determine whether socioeconomic status, as captured by income, significantly improves the logistic model beyond the other predictors.

Hypotheses: - H₀: The income predictors do not improve model fit; the coefficients for all three income levels are simultaneously equal to zero (β_$25K-$49K = β_$50K-$99K = β_$100K+ = 0). - H₁: At least one income level coefficient is not equal to zero; income collectively improves model fit.

# Fit reduced model excluding income
mod_reduced <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
lrt_result <- anova(mod_reduced, mod_logistic, test = "Chisq")
print(lrt_result)

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + education + smoking + gen_health + marital
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7982     4439.7                          
## 2      7979     4420.7  3   19.082 0.0002629 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Report LRT results
tibble(
  Model       = c("Reduced (no income)", "Full (with income)"),
  Deviance    = round(c(mod_reduced$deviance, mod_logistic$deviance), 2),
  df          = c(mod_reduced$df.residual, mod_logistic$df.residual)
) |>
  kable(
    caption = "Model Deviances for Likelihood Ratio Test",
    align = c("l", "r", "r")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model Deviances for Likelihood Ratio Test
Model	Deviance	df
Reduced (no income)	4439.74	7982
Full (with income)	4420.66	7979

cat("LRT Chi-square statistic:",
    round(lrt_result$Deviance[2], 3), "\n")

## LRT Chi-square statistic: 19.082

cat("Degrees of freedom:", lrt_result$Df[2], "\n")

## Degrees of freedom: 3

cat("p-value:", format.pval(lrt_result$`Pr(>Chi)`[2], digits = 4), "\n")

## p-value: 0.0002629

Conclusion: We reject the null hypothesis at conventional significance levels (e.g., α = 0.05). This indicates that income variables jointly improve model fit, meaning at least one income category contributes significantly to explaining frequent physical distress.

Interpretation

Even though some individual income coefficients may be modest, the set of income indicators provides meaningful explanatory power in predicting the outcome.

5.5 Step 5: ROC Curve and AUC

# Generate ROC curve
roc_obj <- roc(
  brfss_analytic$frequent_distress,
  fitted(mod_logistic),
  levels = c("No", "Yes"),
  direction = "<"
)

# Plot ROC curve
plot(
  roc_obj,
  main      = "ROC Curve: Frequent Physical Distress Model",
  col       = "blue",
  lwd       = 2,
  print.auc = TRUE,
  auc.polygon = TRUE,
  auc.polygon.col = "white",
  print.auc.y = 0.45
)
abline(a = 0, b = 1, lty = 2, col = "gray50")

# Report AUC with 95% CI
auc_val <- auc(roc_obj)
auc_ci  <- ci.auc(roc_obj)

cat("AUC:", round(auc_val, 4), "\n")

## AUC: 0.8555

cat("95% CI:", round(auc_ci[1], 4), "–", round(auc_ci[3], 4), "\n")

## 95% CI: 0.8426 – 0.8683

Interpretation: The Area Under the ROC Curve (AUC) is 0.856 (95% CI: 0.843–0.868). Based on standard discrimination guidelines, this falls in the excellent (0.80–0.90) range. This indicates that the model has strong ability to distinguish between individuals who do and do not experience frequent physical distress. In practical terms, if we randomly select one individual with frequent distress and one without, the model will correctly assign a higher predicted risk to the distressed individual about 85.6% of the time, compared to 50% for a model with no predictive ability. Overall, this suggests that the combination of self-rated general health, mental health days, depression status, and sociodemographic and behavioral factors provides substantial and reliable discrimination for identifying individuals at risk of frequent physical distress.

5.6 Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

Hypotheses: (logistic regression goodness-of-fit):

H₀: The model fits the data adequately; there is no significant difference between observed and expected frequencies of the outcome across groups of predicted risk. H₁: The model does not fit the data adequately; there is a significant difference between observed and expected frequencies across groups of predicted risk.

# Hosmer-Lemeshow goodness-of-fit test (g = 10 groups)
hl_test <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
print(hl_test)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945

# Observed vs. expected
hl_obs_exp <- cbind(hl_test$observed, hl_test$expected)
colnames(hl_obs_exp) <- c("Obs. No", "Obs. Yes", "Exp. No", "Exp. Yes")

hl_obs_exp |>
  as.data.frame() |>
  rownames_to_column("Decile") |>
  mutate(across(where(is.numeric), ~ round(., 1))) |>
  kable(
    caption = "Hosmer-Lemeshow Test: Observed vs. Expected by Predicted Probability Decile",
    align = c("l", rep("r", 4))
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Hosmer-Lemeshow Test: Observed vs. Expected by Predicted Probability Decile
Decile	Obs. No	Obs. Yes	Exp. No	Exp. Yes
[0.00675,0.0185]	786	14	789.0	11.0
(0.0185,0.025]	784	16	782.6	17.4
(0.025,0.0305]	785	15	778.1	21.9
(0.0305,0.0378]	777	23	773.1	26.9
(0.0378,0.0507]	765	35	764.9	35.1
(0.0507,0.0679]	753	47	752.7	47.3
(0.0679,0.0946]	728	72	736.1	63.9
(0.0946,0.187]	692	108	695.8	104.2
(0.187,0.425]	546	254	552.1	247.9
(0.425,0.893]	332	468	323.6	476.4

cat("\nHosmer-Lemeshow Test Summary:\n")

## 
## Hosmer-Lemeshow Test Summary:

cat("Chi-square statistic:", round(hl_test$statistic, 3), "\n")

## Chi-square statistic: 5.577

cat("Degrees of freedom:", hl_test$parameter, "\n")

## Degrees of freedom: 8

cat("p-value:", format.pval(hl_test$p.value, digits = 4), "\n")

## p-value: 0.6945

Interpretation: The Hosmer–Lemeshow test yields a chi-square statistic of r round(hl_test$statistic, 2) with r hl_test$parameter degrees of freedom. The p-value is r format.pval(hl_test$p.value, digits = 3). At α = 0.05, we r ifelse(hl_test$p.value < 0.05, “reject the null hypothesis and conclude there is evidence of poor model fit”, “fail to reject the null hypothesis, indicating no evidence of poor model misfit”).

Complementarity with ROC/AUC: The ROC/AUC and Hosmer–Lemeshow test evaluate different aspects of model performance. The AUC assesses discrimination, or how well the model distinguishes between individuals with and without the outcome, without considering calibration. In contrast, the Hosmer–Lemeshow test evaluates calibration, or how closely predicted probabilities align with observed event rates across levels of risk. Because these metrics capture different properties, they should be interpreted together. A model may show strong discrimination but imperfect calibration, or vice versa. In this case, the model demonstrates r ifelse(auc_val >= 0.8, “excellent discrimination”, “acceptable discrimination”) alongside r ifelse(hl_test$p.value >= 0.05, “adequate calibration based on the HL test”, “potential calibration concerns based on the HL test”), suggesting overall strong but not solely sufficient evidence of model performance. —

6 Part 3: Reflection

7 A. Statistical Insights

The findings from both the linear and logistic regression models present a consistent view of the multifactorial determinants of physical health burden among U.S. adults. Self-rated general health stands out as the strongest predictor in both models: individuals reporting Fair or Poor health experience substantially more physically unhealthy days and significantly higher odds of frequent physical distress, even after adjustment for other covariates. This pattern likely reflects strong construct validity, as self-rated health captures the cumulative impact of chronic conditions that are not fully accounted for by other variables in the model. Mental health days is the next most influential predictor across both approaches, highlighting the well-established interplay between mental and physical health; each additional day of poor mental health is associated with incremental increases in physical health burden and higher odds of exceeding the distress threshold. A history of depression also remains consistently significant, reinforcing these associations.

Several additional factors show consistent effects across both models, including protective associations for exercise, a clear socioeconomic gradient with lower income linked to worse outcomes, and increased burden among older age groups. However, a key limitation of this analysis is its cross-sectional design, which prevents any conclusions about causality or temporal ordering. Important unmeasured confounders likely include (1) chronic disease burden (e.g., cardiovascular disease, diabetes, arthritis), which strongly influences both physical symptoms and self-rated health, and (2) healthcare access and insurance coverage, which affect symptom management and are closely tied to income, education, and employment.

#B. Linear versus Logistic Regression

The linear regression model estimates the magnitude of association by quantifying the change in the number of physically unhealthy days associated with each predictor. This provides an interpretable, clinically meaningful measure of population burden—for example, Fair/Poor health corresponds to an estimated increase of approximately r round(coef(mod_final)[“gen_healthFair/Poor”], 1) physically unhealthy days per month. However, this approach relies on assumptions of normality and continuity that are not fully appropriate for a bounded, zero-inflated count outcome.

In contrast, logistic regression reframes the outcome as whether individuals meet the CDC threshold for frequent physical distress, making it more suitable for classification and risk identification. This binary outcome aligns well with public health screening practices, where identifying high-risk individuals is often the primary goal. The odds ratios provide a relative measure of risk across groups, while the AUC (r round(auc_val, 3)) summarizes overall discriminatory performance—something not available in linear regression. Conversely, the linear model’s R² (r round(summary(mod_final)$r.squared, 3)) quantifies explained variance, a metric without a direct analogue in logistic models. In practice, logistic regression is preferable for classification and risk stratification, whereas linear regression is more appropriate for estimating average differences in symptom burden and informing resource allocation.

7.1 C. AI Transparency

I took the help of Google to verify and better understand certain statistical interpretations (e.g., model diagnostics, and ROC/AUC interpretation). This was done to confirm my understanding and ensure that my interpretations were consistent with standard statistical definitions and guidance.

End of Assignment

EPI 553 / STA 553: Computer Homework 4

Model Selection and Introduction to Logistic Regression

Samriddhi Ranjan

2026-04-24