Part 0: Data Preparation (10 points)

Step 1: Load and Import BRFSS 2023 (3 points)

# Load the BRFSS 2023 XPT file 
brfss_raw <- haven::read_xpt("D:/fizza documents/Epi 553/Assignments/HW04/LLCP2023.XPT")

cat("Raw dataset dimensions:\n")

## Raw dataset dimensions:

cat("  Rows:   ", nrow(brfss_raw), "\n")

##   Rows:    433323

cat("  Columns:", ncol(brfss_raw), "\n")

##   Columns: 350

# Select only the variables needed for this assignment
# Note: variables with leading underscore require backticks
brfss_selected <- brfss_raw |>
  dplyr::select(
    PHYSHLTH,   # outcome: physically unhealthy days
    MENTHLTH,   # predictor 1: mentally unhealthy days (continuous)
    `_BMI5`,    # predictor 2: BMI x100 (continuous)
    SEXVAR,     # predictor 3: sex (binary)
    EXERANY2,   # predictor 4: any exercise (binary)
    ADDEPEV3,   # predictor 5: ever told depression (binary)
    `_AGEG5YR`, # predictor 6: age group (categorical)
    `_INCOMG1`, # predictor 7: income (categorical)
    EDUCA,      # predictor 8: education (categorical)
    `_SMOKER3`, # predictor 9: smoking status (categorical)
    GENHLTH     # predictor 10: general health (categorical)
  )

cat("\nSelected dataset dimensions:\n")

## 
## Selected dataset dimensions:

cat("  Rows:   ", nrow(brfss_selected), "\n")

##   Rows:    433323

cat("  Columns:", ncol(brfss_selected), "\n")

##   Columns: 11

Raw Dataset Description: The raw BRFSS 2023 dataset has 433,323 people and 350 variables. After selecting only the 11 variables needed for this analysis (the outcome plus 10 predictors), the working dataset still has all 433,323 rows but only 11 columns. The other 339 variables were dropped.

Predictor selection justification: I included all 10 candidate predictors in the maximum model because each one has theoretical or empirical links to physical health burden. Mental health days and general health status are directly related to physical health in terms of functioning. BMI, exercise, smoking, and depression are behavioral and clinical risk factors that are known to be associated with physical health problems. Age, sex, income, and education are sociodemographic factors that help explain structural differences in health and health-seeking behavior, and all of them were found to be significant in previous BRFSS studies. Including all 10 variables allows the model selection process to figure out which combination of predictors gives the best fit. —

Step 2: Recode All Variables (4 points)

brfss_clean <- brfss_selected |>
  dplyr::mutate(

    # --- OUTCOME ---
    # PHYSHLTH: 88 = none (recode to 0); 77/99 = NA; 1-30 keep as is
    physhlth_days = dplyr::case_when(
      as.integer(PHYSHLTH) == 88L             ~ 0,
      as.integer(PHYSHLTH) %in% c(77L, 99L)  ~ NA_real_,
      as.integer(PHYSHLTH) >= 1L &
        as.integer(PHYSHLTH) <= 30L           ~ as.numeric(PHYSHLTH),
      TRUE                                    ~ NA_real_
    ),

    # --- CONTINUOUS PREDICTORS ---
    # MENTHLTH: 88 = none; 77/99 = NA; 1-30 keep
    menthlth_days = dplyr::case_when(
      as.integer(MENTHLTH) == 88L             ~ 0,
      as.integer(MENTHLTH) %in% c(77L, 99L)  ~ NA_real_,
      as.integer(MENTHLTH) >= 1L &
        as.integer(MENTHLTH) <= 30L           ~ as.numeric(MENTHLTH),
      TRUE                                    ~ NA_real_
    ),

    # _BMI5: divide by 100; 9999 = NA
    bmi = dplyr::if_else(as.integer(`_BMI5`) == 9999L,
                         NA_real_,
                         as.numeric(`_BMI5`) / 100),

    # --- BINARY PREDICTORS ---
    # SEXVAR: 1 = Male, 2 = Female
    sex = factor(
      dplyr::case_when(
        as.integer(SEXVAR) == 1L ~ "Male",
        as.integer(SEXVAR) == 2L ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")   # Male = reference
    ),

    # EXERANY2: 1 = Yes, 2 = No; 7/9 = NA
    exercise = factor(
      dplyr::case_when(
        as.integer(EXERANY2) == 1L             ~ "Yes",
        as.integer(EXERANY2) == 2L             ~ "No",
        as.integer(EXERANY2) %in% c(7L, 9L)   ~ NA_character_,
        TRUE                                   ~ NA_character_
      ),
      levels = c("No", "Yes")   # No = reference
    ),

    # ADDEPEV3: 1 = Yes, 2 = No; 7/9 = NA
    depression = factor(
      dplyr::case_when(
        as.integer(ADDEPEV3) == 1L             ~ "Yes",
        as.integer(ADDEPEV3) == 2L             ~ "No",
        as.integer(ADDEPEV3) %in% c(7L, 9L)   ~ NA_character_,
        TRUE                                   ~ NA_character_
      ),
      levels = c("No", "Yes")   # No = reference
    ),

    # --- CATEGORICAL PREDICTORS ---
    # _AGEG5YR: 1-3="18-34", 4-6="35-49", 7-9="50-64", 10-13="65+"; 14=NA
    age_group = factor(
      dplyr::case_when(
        as.integer(`_AGEG5YR`) %in% 1L:3L   ~ "18-34",
        as.integer(`_AGEG5YR`) %in% 4L:6L   ~ "35-49",
        as.integer(`_AGEG5YR`) %in% 7L:9L   ~ "50-64",
        as.integer(`_AGEG5YR`) %in% 10L:13L ~ "65+",
        as.integer(`_AGEG5YR`) == 14L       ~ NA_character_,
        TRUE                                ~ NA_character_
      ),
      levels = c("18-34", "35-49", "50-64", "65+")  # 18-34 = reference
    ),

    # _INCOMG1: 1-2="Less than $25K", 3-4="$25K-$49K",
    #           5-6="$50K-$99K", 7="$100K+"; 9=NA
    income = factor(
      dplyr::case_when(
        as.integer(`_INCOMG1`) %in% 1L:2L ~ "Less than $25K",
        as.integer(`_INCOMG1`) %in% 3L:4L ~ "$25K-$49K",
        as.integer(`_INCOMG1`) %in% 5L:6L ~ "$50K-$99K",
        as.integer(`_INCOMG1`) == 7L      ~ "$100K+",
        as.integer(`_INCOMG1`) == 9L      ~ NA_character_,
        TRUE                              ~ NA_character_
      ),
      levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
    ),

    # EDUCA: 1-3="Less than HS", 4="HS/GED", 5="Some college",
    #        6="College graduate"; 9=NA
    education = factor(
      dplyr::case_when(
        as.integer(EDUCA) %in% 1L:3L ~ "Less than HS",
        as.integer(EDUCA) == 4L      ~ "HS/GED",
        as.integer(EDUCA) == 5L      ~ "Some college",
        as.integer(EDUCA) == 6L      ~ "College graduate",
        as.integer(EDUCA) == 9L      ~ NA_character_,
        TRUE                         ~ NA_character_
      ),
      levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
    ),

    # _SMOKER3: 1-2="Current", 3="Former", 4="Never"; 9=NA
    smoking = factor(
      dplyr::case_when(
        as.integer(`_SMOKER3`) %in% 1L:2L ~ "Current",
        as.integer(`_SMOKER3`) == 3L      ~ "Former",
        as.integer(`_SMOKER3`) == 4L      ~ "Never",
        as.integer(`_SMOKER3`) == 9L      ~ NA_character_,
        TRUE                              ~ NA_character_
      ),
      levels = c("Never", "Current", "Former")  # Never = reference
    ),

    # GENHLTH: 1-2="Excellent/Very Good", 3="Good", 4-5="Fair/Poor"; 7/9=NA
    gen_health = factor(
      dplyr::case_when(
        as.integer(GENHLTH) %in% 1L:2L   ~ "Excellent/Very Good",
        as.integer(GENHLTH) == 3L        ~ "Good",
        as.integer(GENHLTH) %in% 4L:5L  ~ "Fair/Poor",
        as.integer(GENHLTH) %in% c(7L, 9L) ~ NA_character_,
        TRUE                             ~ NA_character_
      ),
      levels = c("Excellent/Very Good", "Good", "Fair/Poor")
    )

  ) |>
  # Keep only recoded variables; drop all raw BRFSS columns
  dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
                age_group, income, education, smoking, gen_health)

cat("Recoded dataset N =", nrow(brfss_clean), "\n")

## Recoded dataset N = 433323

# Verify factor levels and reference categories
cat("\nFactor level check:\n")

## 
## Factor level check:

for (v in c("sex", "exercise", "depression", "age_group",
            "income", "education", "smoking", "gen_health")) {
  cat(sprintf("  %-14s  levels: %s\n", v,
              paste(levels(brfss_clean[[v]]), collapse = " | ")))
}

##   sex             levels: Male | Female
##   exercise        levels: No | Yes
##   depression      levels: No | Yes
##   age_group       levels: 18-34 | 35-49 | 50-64 | 65+
##   income          levels: Less than $25K | $25K-$49K | $50K-$99K | $100K+
##   education       levels: Less than HS | HS/GED | Some college | College graduate
##   smoking         levels: Never | Current | Former
##   gen_health      levels: Excellent/Very Good | Good | Fair/Poor

Recoded Dataset and Factor Levels: All 11 variables were recoded from raw BRFSS numbers into meaningful formats. The continuous ones (physically unhealthy days, mentally unhealthy days, BMI) were turned into numbers, with special codes set to zero or NA. The eight categorical variables were converted to labeled factors with the right reference levels, like Male, No for exercise, No for depression, 18–34 for age, and so on. No numeric codes are left in the final dataset. —

Step 3: Analytic Sample and Descriptive Summary (3 points)

# Report missingness on physhlth_days and selected other variables BEFORE drop_na()
miss_vars <- c("physhlth_days", "bmi", "income", "smoking", "gen_health")

cat("=== Missingness Report (before drop_na) ===\n")

## === Missingness Report (before drop_na) ===

miss_tbl <- purrr::map_dfr(miss_vars, function(v) {
  n_miss <- sum(is.na(brfss_clean[[v]]))
  tibble::tibble(
    Variable   = v,
    N_missing  = n_miss,
    Pct_missing = round(100 * n_miss / nrow(brfss_clean), 1)
  )
})

miss_tbl |>
  knitr::kable(
    col.names = c("Variable", "N Missing", "% Missing"),
    caption   = "Table 0. Missingness Before Exclusion"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 0. Missingness Before Exclusion
Variable	N Missing	% Missing
physhlth_days	10785	2.5
bmi	40535	9.4
income	86623	20.0
smoking	23062	5.3
gen_health	1262	0.3

# Draw analytic sample: drop all missing then sample 8,000
set.seed(1220)
brfss_analytic <- brfss_clean |>
  tidyr::drop_na() |>
  dplyr::slice_sample(n = 8000)

cat("Complete cases before sampling: ",
    nrow(tidyr::drop_na(brfss_clean)), "\n")

## Complete cases before sampling:  309253

cat("Final analytic sample N =", nrow(brfss_analytic), "\n")

## Final analytic sample N = 8000

# Verify outcome distribution in analytic sample
cat("\nphyshlth_days in analytic sample:\n")

## 
## physhlth_days in analytic sample:

cat("  Mean:", round(mean(brfss_analytic$physhlth_days), 2),
    "| Median:", median(brfss_analytic$physhlth_days),
    "| SD:", round(sd(brfss_analytic$physhlth_days), 2),
    "| Range:", min(brfss_analytic$physhlth_days), "–",
    max(brfss_analytic$physhlth_days), "\n")

##   Mean: 4.31 | Median: 0 | SD: 8.51 | Range: 0 – 30

Missingness and Analytic Sample: Before removing missing data, the dataset had 433,323 observations. After dropping missing values across all 11 variables, we had 309,253 complete cases, meaning about 28.6% of the sample was removed. From these complete cases, we randomly selected 8,000 observations (set.seed = 1220) for the final analytic sample.

In this sample, physically unhealthy days has a mean of 4.31 days (SD = 8.51), a median of 0, and ranges from 0 to 30. The mean is much higher than the median, and the standard deviation is bigger than the mean, which shows the data are highly skewed to the right and have many zeros. Most people report zero unhealthy days, but a small group reports many days, which pulls the mean up.

brfss_analytic |>
  tbl_summary(
    label = list(
      physhlth_days ~ "Physically unhealthy days (past 30)",
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      bmi           ~ "BMI (kg/m²)",
      sex           ~ "Sex",
      exercise      ~ "Exercise (past 30 days)",
      depression    ~ "Ever told: depressive disorder",
      age_group     ~ "Age group",
      income        ~ "Annual household income",
      education     ~ "Education level",
      smoking       ~ "Smoking status",
      gen_health    ~ "Self-rated general health"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(all_continuous() ~ 1)
  ) |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**")

**Table 1. Descriptive Summary of Analytic Sample (N = 8,000)**
Characteristic	N = 8,000¹
Physically unhealthy days (past 30)	4.3 (8.5)
Mentally unhealthy days (past 30)	4.4 (8.3)
BMI (kg/m²)	28.6 (6.7)
Sex
Male	3,905 (49%)
Female	4,095 (51%)
Exercise (past 30 days)	6,224 (78%)
Ever told: depressive disorder	1,694 (21%)
Age group
18-34	1,328 (17%)
35-49	1,650 (21%)
50-64	2,145 (27%)
65+	2,877 (36%)
Annual household income
Less than $25K	1,101 (14%)
$25K-$49K	1,927 (24%)
$50K-$99K	4,341 (54%)
$100K+	631 (7.9%)
Education level
Less than HS	384 (4.8%)
HS/GED	1,941 (24%)
Some college	2,067 (26%)
College graduate	3,608 (45%)
Smoking status
Never	4,884 (61%)
Current	855 (11%)
Former	2,261 (28%)
Self-rated general health
Excellent/Very Good	3,990 (50%)
Good	2,623 (33%)
Fair/Poor	1,387 (17%)
¹ Mean (SD); n (%)

Part 1: Model Selection for Linear Regression (35 points)

Step 1: Fit the Maximum Model (5 points)

# Maximum model: all 10 predictors
mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data = brfss_analytic
)

cat("=== Maximum Model Summary ===\n")

## === Maximum Model Summary ===

summary(mod_max)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health, 
##     data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2174  -2.7092  -1.1314   0.6714  30.0121 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.3194460  0.5780595   2.283 0.022483 *  
## menthlth_days              0.1990805  0.0108958  18.271  < 2e-16 ***
## bmi                       -0.0009936  0.0122581  -0.081 0.935399    
## sexFemale                  0.2381885  0.1582722   1.505 0.132382    
## exerciseYes               -2.1205593  0.1995982 -10.624  < 2e-16 ***
## depressionYes              0.3476674  0.2126271   1.635 0.102067    
## age_group35-49             0.3535773  0.2601484   1.359 0.174142    
## age_group50-64             1.5652359  0.2483827   6.302 3.10e-10 ***
## age_group65+               1.6307384  0.2425275   6.724 1.89e-11 ***
## income$25K-$49K           -0.4765953  0.2670005  -1.785 0.074300 .  
## income$50K-$99K           -1.1805694  0.2594204  -4.551 5.42e-06 ***
## income$100K+              -0.9897185  0.3825676  -2.587 0.009698 ** 
## educationHS/GED            0.8888178  0.3931248   2.261 0.023792 *  
## educationSome college      1.1012326  0.3978171   2.768 0.005650 ** 
## educationCollege graduate  1.3386538  0.4027682   3.324 0.000893 ***
## smokingCurrent             0.3990517  0.2697480   1.479 0.139086    
## smokingFormer              0.2340748  0.1818188   1.287 0.197990    
## gen_healthGood             1.1875598  0.1809538   6.563 5.61e-11 ***
## gen_healthFair/Poor       10.0607051  0.2495420  40.317  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.891 on 7981 degrees of freedom
## Multiple R-squared:  0.3465, Adjusted R-squared:  0.345 
## F-statistic: 235.1 on 18 and 7981 DF,  p-value: < 2.2e-16

# Extract fit statistics
r2_max     <- summary(mod_max)$r.squared
adj_r2_max <- summary(mod_max)$adj.r.squared
aic_max    <- AIC(mod_max)
bic_max    <- BIC(mod_max)

cat("\n=== Model Fit Statistics ===\n")

## 
## === Model Fit Statistics ===

cat("R-squared:         ", round(r2_max,     4), "\n")

## R-squared:          0.3465

cat("Adjusted R-squared:", round(adj_r2_max, 4), "\n")

## Adjusted R-squared: 0.345

cat("AIC:               ", round(aic_max,    2), "\n")

## AIC:                53606.97

cat("BIC:               ", round(bic_max,    2), "\n")

## BIC:                53746.71

Maximum model fit interpretation: The maximum model with all 10 predictors explains about 34.65% of the variance in physically unhealthy days (R² = 0.3465, Adjusted R² = 0.3450). The small difference between them suggests the model isn’t overfitted. The overall model is significant (F(18, 7981) = 235.1, p < 2.2 × 10⁻¹⁶).

General health status and mental health days are the strongest predictors. People who rate their health as Fair/Poor have about 10 more physically unhealthy days per month than those in Excellent/Very Good health. Each extra mentally unhealthy day is linked to 0.20 more physically unhealthy days. Physically active adults report 2.12 fewer unhealthy days than inactive adults. Older age groups (50–64 and 65+) and people with higher education report more unhealthy days, which is surprising but might reflect survival bias. Higher income is linked to fewer unhealthy days. BMI, sex, depression, smoking, and the 35–49 age group were not significant in this model.

Step 2: Best Subsets Regression (10 points)

# Best subsets: nvmax = total columns in the model matrix minus intercept
# This includes all dummy variables from factors
mm_cols <- ncol(model.matrix(mod_max)) - 1
cat("Total parameters in model matrix (excluding intercept):", mm_cols, "\n")

## Total parameters in model matrix (excluding intercept): 18

bs_result <- leaps::regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data  = brfss_analytic,
  nvmax = mm_cols,
  method = "exhaustive"
)

bs_summary <- summary(bs_result)

# Build comparison table
bs_table <- tibble::tibble(
  Model_Size  = seq_along(bs_summary$adjr2),
  Adj_R2      = round(bs_summary$adjr2, 4),
  BIC         = round(bs_summary$bic, 2)
)

# Identify best model sizes
best_adjr2_size <- which.max(bs_summary$adjr2)
best_bic_size   <- which.min(bs_summary$bic)

cat("Model size maximizing Adjusted R²:", best_adjr2_size, "\n")

## Model size maximizing Adjusted R²: 17

cat("Model size minimizing BIC:        ", best_bic_size, "\n")

## Model size minimizing BIC:         7

bs_table |>
  dplyr::mutate(
    Best_AdjR2 = dplyr::if_else(Model_Size == best_adjr2_size, "★", ""),
    Best_BIC   = dplyr::if_else(Model_Size == best_bic_size,   "★", "")
  ) |>
  knitr::kable(
    col.names = c("Model Size (# predictors)", "Adjusted R²",
                  "BIC", "Best Adj-R²", "Best BIC"),
    caption   = "Table 2. Best Subsets: Adjusted R² and BIC by Model Size"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE) |>
  kableExtra::column_spec(4:5, bold = TRUE, color = "darkgreen")

Table 2. Best Subsets: Adjusted R² and BIC by Model Size
Model Size (# predictors)	Adjusted R²	BIC	Best Adj-R²	Best BIC
1	0.2806	-2618.05
2	0.3172	-3027.70
3	0.3310	-3183.39
4	0.3357	-3231.62
5	0.3384	-3256.28
6	0.3424	-3296.88
7	0.3436	-3303.23		★
8	0.3440	-3299.42
9	0.3441	-3293.51
10	0.3443	-3287.03
11	0.3444	-3280.86
12	0.3446	-3275.51
13	0.3448	-3270.03
14	0.3450	-3263.73
15	0.3450	-3256.56
16	0.3451	-3249.05
17	0.3451	-3241.72	★
18	0.3450	-3232.74

par(mfrow = c(1, 2))

# Panel 1: Adjusted R²
plot(bs_summary$adjr2, type = "b", pch = 19,
     xlab = "Number of Predictors", ylab = "Adjusted R²",
     main = "Best Subsets: Adjusted R²",
     col  = "steelblue")
points(best_adjr2_size, bs_summary$adjr2[best_adjr2_size],
       col = "red", pch = 19, cex = 1.8)
abline(v = best_adjr2_size, col = "red", lty = 2)

# Panel 2: BIC
plot(bs_summary$bic, type = "b", pch = 19,
     xlab = "Number of Predictors", ylab = "BIC",
     main = "Best Subsets: BIC",
     col  = "darkorange")
points(best_bic_size, bs_summary$bic[best_bic_size],
       col = "red", pch = 19, cex = 1.8)
abline(v = best_bic_size, col = "red", lty = 2)

Figure 1. Best Subsets: Adjusted R² and BIC by Model Size

par(mfrow = c(1, 1))

Best Subsets Interpretation:The best subsets procedure looked at all possible predictor combinations from 1 to 18 parameters. Adjusted R² went up from 0.281 at size 1 to a plateau around size 10–12, then each new predictor added very little. The gain from size 7 to size 17 was only 0.003 (from 0.3436 to 0.3451), meaning there are diminishing returns after 7 predictors.

BIC, which penalizes model complexity more heavily, chose size 7 as the best (BIC = −3303.23). After that, BIC went up because the penalty for adding more predictors was bigger than the improvement in fit. So the two criteria disagree: Adjusted R² technically likes the 17-predictor model, but the difference is very small. BIC is more reliable here because it penalizes unnecessary complexity more.

One limitation is that best subsets treats each dummy variable separately. This means it could include some levels of a categorical variable (like income $50K–$99K) but not others (like $25K–$49K), which doesn’t make theoretical sense. The stepwise methods in Step 3 handle each factor as a complete unit.

Step 3: Stepwise Selection Methods (10 points)

# Intercept-only model (lower scope for forward/stepwise)
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Backward elimination: start from maximum model
mod_backward <- step(mod_max, direction = "backward", trace = 0)

# Forward selection: start from null, upper scope = maximum
mod_forward <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "forward",
  trace     = 0
)

# Stepwise (both directions): start from null
mod_stepwise <- step(
  mod_null,
  scope     = list(lower = mod_null, upper = mod_max),
  direction = "both",
  trace     = 0
)

# Extract final variable sets from each method
vars_backward <- names(coef(mod_backward))[-1]
vars_forward  <- names(coef(mod_forward))[-1]
vars_stepwise <- names(coef(mod_stepwise))[-1]

# All predictors in maximum model (expanded)
vars_max <- names(coef(mod_max))[-1]

# Build comparison: which variables appear in each method?
comparison_tbl <- tibble::tibble(
  Variable  = vars_max,
  Backward  = dplyr::if_else(vars_max %in% vars_backward, "✓", ""),
  Forward   = dplyr::if_else(vars_max %in% vars_forward,  "✓", ""),
  Stepwise  = dplyr::if_else(vars_max %in% vars_stepwise, "✓", "")
)

comparison_tbl |>
  knitr::kable(
    col.names = c("Predictor (dummy variable)", "Backward", "Forward", "Stepwise"),
    caption   = "Table 3. Variable Inclusion by Stepwise Method"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 3. Variable Inclusion by Stepwise Method
Predictor (dummy variable)	Backward	Forward	Stepwise
menthlth_days	✓	✓	✓
bmi
sexFemale
exerciseYes	✓	✓	✓
depressionYes	✓	✓	✓
age_group35-49	✓	✓	✓
age_group50-64	✓	✓	✓
age_group65+	✓	✓	✓
income$25K-$49K	✓	✓	✓
income$50K-$99K	✓	✓	✓
income$100K+	✓	✓	✓
educationHS/GED	✓	✓	✓
educationSome college	✓	✓	✓
educationCollege graduate	✓	✓	✓
smokingCurrent
smokingFormer
gen_healthGood	✓	✓	✓
gen_healthFair/Poor	✓	✓	✓

cat("\nAIC comparison across methods:\n")

## 
## AIC comparison across methods:

cat("  Maximum model:       ", round(AIC(mod_max), 2), "\n")

##   Maximum model:        53606.97

cat("  Backward elimination:", round(AIC(mod_backward), 2), "\n")

##   Backward elimination: 53603.93

cat("  Forward selection:   ", round(AIC(mod_forward), 2), "\n")

##   Forward selection:    53603.93

cat("  Stepwise (both):     ", round(AIC(mod_stepwise), 2), "\n")

##   Stepwise (both):      53603.93

Stepwise comparison: All three stepwise methods — backward elimination, forward selection, and stepwise (both directions) — ended up with the same final model. Each had an AIC of 53,603.93, compared to 53,606.97 for the maximum model. The small difference of 3.04 points means that some predictors in the maximum model didn’t add much value and were correctly dropped.

Since all methods agreed even though they started from different places, this gives us confidence that the selected model is stable. Any variable that all three methods excluded — most likely BMI, since it had a coefficient close to zero (β = −0.001, p = 0.94) in the maximum model — doesn’t really improve the model when other predictors are already included. Variables kept by all three methods are the most strongly supported and will be used in the final model in Step 4.

Step 4: Final Model Selection and Interpretation (5 points)

# Final model: use the backward elimination result
# Backward elimination is preferred here because it starts from the theoretically
# justified full model and removes only variables that do not improve fit,
# preserving the complete set of factor levels for retained categorical variables.
# All three stepwise methods produced consistent results, confirming this choice.

mod_final <- mod_backward

cat("=== Final Linear Model ===\n")

## === Final Linear Model ===

cat("Method: Backward elimination (AIC-guided)\n")

## Method: Backward elimination (AIC-guided)

cat("Formula:", deparse(formula(mod_final)), "\n\n")

## Formula: physhlth_days ~ menthlth_days + exercise + depression + age_group +      income + education + gen_health

# Coefficient table with 95% CIs
tidy(mod_final, conf.int = TRUE) |>
  dplyr::filter(term != "(Intercept)") |>
  dplyr::mutate(
    dplyr::across(c(estimate, conf.low, conf.high, statistic), \(x) round(x, 3)),
    p.value = format.pval(p.value, digits = 3, eps = 0.001)
  ) |>
  dplyr::select(term, estimate, conf.low, conf.high, statistic, p.value) |>
  knitr::kable(
    col.names = c("Predictor", "β", "Lower 95% CI", "Upper 95% CI",
                  "t-statistic", "p-value"),
    caption   = "Table 4. Final Linear Model: Coefficient Table"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 4. Final Linear Model: Coefficient Table
Predictor	β	Lower 95% CI	Upper 95% CI	t-statistic	p-value
menthlth_days	0.201	0.179	0.222	18.474	< 0.001
exerciseYes	-2.131	-2.519	-1.743	-10.774	< 0.001
depressionYes	0.402	-0.010	0.814	1.912	0.05588
age_group35-49	0.421	-0.083	0.925	1.636	0.10187
age_group50-64	1.636	1.155	2.117	6.667	< 0.001
age_group65+	1.700	1.233	2.167	7.140	< 0.001
income$25K-$49K	-0.488	-1.010	0.035	-1.829	0.06741
income$50K-$99K	-1.223	-1.728	-0.718	-4.749	< 0.001
income$100K+	-1.065	-1.810	-0.321	-2.804	0.00506
educationHS/GED	0.880	0.110	1.649	2.241	0.02503
educationSome college	1.094	0.317	1.872	2.759	0.00581
educationCollege graduate	1.292	0.509	2.076	3.233	0.00123
gen_healthGood	1.195	0.846	1.544	6.706	< 0.001
gen_healthFair/Poor	10.066	9.584	10.548	40.954	< 0.001

cat("\nFinal model fit:\n")

## 
## Final model fit:

cat("  Adjusted R²:", round(summary(mod_final)$adj.r.squared, 4), "\n")

##   Adjusted R²: 0.345

cat("  AIC:        ", round(AIC(mod_final), 2), "\n")

##   AIC:         53603.93

cat("  BIC:        ", round(BIC(mod_final), 2), "\n")

##   BIC:         53715.73

Coefficient Interpretations:

coefs <- tidy(mod_final, conf.int = TRUE) |>
  dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 2)))

# Pull specific estimates
b_menthlth  <- coefs |> dplyr::filter(term == "menthlth_days")
b_depression <- coefs |> dplyr::filter(term == "depressionYes")
b_genhlth_fp <- coefs |> dplyr::filter(term == "gen_healthFair/Poor")
b_exercise   <- coefs |> dplyr::filter(grepl("exerciseYes", term))

cat("Mental health days β:", b_menthlth$estimate,
    "(95% CI:", b_menthlth$conf.low, "–", b_menthlth$conf.high, ")\n")

## Mental health days β: 0.2 (95% CI: 0.18 – 0.22 )

cat("Depression β:        ", b_depression$estimate,
    "(95% CI:", b_depression$conf.low, "–", b_depression$conf.high, ")\n")

## Depression β:         0.4 (95% CI: -0.01 – 0.81 )

cat("Fair/Poor health β:  ", b_genhlth_fp$estimate,
    "(95% CI:", b_genhlth_fp$conf.low, "–", b_genhlth_fp$conf.high, ")\n")

## Fair/Poor health β:   10.07 (95% CI: 9.58 – 10.55 )

1. Mentally unhealthy days (continuous): For each extra mentally unhealthy day in the past 30 days, physically unhealthy days go up by 0.20 days on average (95% CI: 0.179–0.222), keeping everything else constant. This is significant (p < 0.001) and is the strongest continuous predictor in the model. So someone with 10 mentally unhealthy days would have about 2 more physically unhealthy days than someone with none, which makes sense given the link between mental and physical health.

2. Depression (categorical — Yes vs. No): Adults who have been diagnosed with depression report about 0.40 more physically unhealthy days per month than those without depression (95% CI: −0.010 to 0.814), keeping everything else constant. But this result is not statistically significant at p < 0.05 (p = 0.056), since the confidence interval just barely includes zero.

3. General health — Fair/Poor vs. Excellent/Very Good (categorical): People who rate their general health as Fair or Poor have about 10 more physically unhealthy days per month than those who say their health is Excellent or Very Good (β = 10.07, 95% CI: 9.58–10.55), keeping everything else constant. This is the biggest coefficient in the model and is highly significant (p < 0.001). A difference of 10 days is one-third of the entire 30-day recall period, which shows that self-rated health really captures a large and real difference in physical health burden. The Good vs. Excellent/Very Good comparison is also significant (β = 1.20, p < 0.001), confirming a clear dose-response pattern across all three health categories.

Step 5: LINE Assumption Check (5 points)

par(mfrow = c(2, 2))
plot(mod_final)

Figure 2. Diagnostic Plots for Final Linear Model

par(mfrow = c(1, 1))

Panel-by-panel interpretation:

1. Residuals vs. Fitted (Linearity / Constant Variance): The residuals vs. fitted plot shows a clear funnel shape — the spread is wider at low fitted values and gets narrower at higher values. This means heteroscedasticity is present. The loess line slopes down instead of staying flat near zero, which also suggests some non-linearity. The vertical stripes at low fitted values are caused by the outcome having many zeros, since a lot of people report zero physically unhealthy days. This creates clusters of residuals at the same fitted value. Overall, this is a clear violation of both the linearity and equal variance assumptions.

2. Normal Q-Q (Normality of Residuals): The residuals really don’t follow the diagonal line, especially in the upper tail where standardized residuals go above 4 — much higher than what you’d expect under normality. The S-shaped curve shows right skew and heavy tails, which matches the zero-inflated and right-skewed distribution we saw for physically unhealthy days in Part 0 (mean = 4.31, median = 0, SD = 8.51). So the normality assumption is clearly violated, but with N = 8,000, the Central Limit Theorem does give some protection for the coefficient estimates.

3. Scale-Location (Homoscedasticity): The downward sloping line and the fan-shaped spread — wide on the left and getting narrower on the right — confirm the heteroscedasticity we saw in Panel 1. The spread of sqrt(|standardized residuals|) is not constant across fitted values, which violates the homoscedasticity assumption. The diagonal streaks at low fitted values again show that the outcome has many zeros and is discrete, rather than having continuous, normally distributed errors.

4. Residuals vs. Leverage (Influential Observations): Most of the observations have very low leverage values (below 0.002), and no points go past the Cook’s distance lines, which means there aren’t any highly influential observations. A few points on the right side (leverage around 0.005–0.006) are labeled, including observation 4520, but their standardized residuals are still within a reasonable range. So the regression results aren’t being distorted by any single data point.

Overall conclusion: The LINE assumptions are only partly met in this model. Independence is fine because the data came from random sampling. Linearity holds more or less in the middle range of fitted values but breaks down at the extremes. Normality is clearly violated because the outcome has too many zeros and is bounded, and heteroscedasticity is obvious in all the diagnostic plots. A Poisson or negative binomial model would make more sense for this count outcome. However, with a large sample of N = 8,000, the coefficient estimates are still reasonable approximations, and the main conclusions probably won’t change much.

Part 2: Logistic Regression (40 points)

Step 1: Create Binary Outcome (5 points)

# Frequent physical distress: 14+ physically unhealthy days in past 30
brfss_analytic <- brfss_analytic |>
  dplyr::mutate(
    frequent_distress = factor(
      dplyr::if_else(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")   # No = reference
    )
  )

# Prevalence report
prev_tbl <- table(brfss_analytic$frequent_distress)
prev_pct <- round(100 * prop.table(prev_tbl), 1)

cat("=== Frequent Physical Distress Prevalence ===\n")

## === Frequent Physical Distress Prevalence ===

cat("No  (physhlth_days < 14):", prev_tbl["No"],
    sprintf("(%.1f%%)\n", prev_pct["No"]))

## No  (physhlth_days < 14): 6927 (86.6%)

cat("Yes (physhlth_days >= 14):", prev_tbl["Yes"],
    sprintf("(%.1f%%)\n", prev_pct["Yes"]))

## Yes (physhlth_days >= 14): 1073 (13.4%)

cat("Total N:", sum(prev_tbl), "\n")

## Total N: 8000

Prevalence and balance comment: Frequent physical distress (14 or more unhealthy days in the past 30) affects 13.4% of the sample (n = 1,073), while 86.6% (n = 6,927) have fewer than 14 days. So the outcome is imbalanced at about a 6.5 to 1 ratio of non-cases to cases. This is pretty normal for population data since most people are generally healthy.

This level of imbalance doesn’t need special methods like SMOTE or downsampling. Standard logistic regression is still fine and will give valid odds ratios and coefficients. But the imbalance does mean that predicted probabilities for the minority class (frequent distress = Yes) might be a bit conservative, and the default 0.50 cutoff will mostly classify people as “No.” Since this assignment focuses on odds ratios and AUC instead of classification accuracy, the imbalance doesn’t really affect the main conclusions.

Step 2: Simple (Unadjusted) Logistic Regression (10 points)

Predictor selected: gen_health (self-rated general health). Among all predictors in the final linear model, general health status had the largest coefficient — people who said their health was Fair/Poor reported 10.07 more physically unhealthy days than those in the Excellent/Very Good group. This was the biggest effect in Table 4, which makes it the strongest candidate to test for association with frequent physical distress. Self-rated health is also a validated overall measure that captures physical functioning, chronic disease burden, and symptoms all in one, so it makes theoretical sense as a predictor for crossing the 14-day distress threshold.

mod_simple <- glm(
  frequent_distress ~ gen_health,
  data   = brfss_analytic,
  family = binomial(link = "logit")
)

cat("=== Simple Logistic Regression Summary ===\n")

## === Simple Logistic Regression Summary ===

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial(link = "logit"), 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.34421    0.08725 -38.329   <2e-16 ***
## gen_healthGood       1.07595    0.10999   9.782   <2e-16 ***
## gen_healthFair/Poor  3.33700    0.10245  32.571   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6306.5  on 7999  degrees of freedom
## Residual deviance: 4741.8  on 7997  degrees of freedom
## AIC: 4747.8
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios and 95% CIs
cat("\n=== Odds Ratios ===\n")

## 
## === Odds Ratios ===

or_simple  <- exp(coef(mod_simple))
ci_simple  <- exp(confint(mod_simple))

or_simple_tbl <- tibble::tibble(
  Term    = names(or_simple),
  OR      = round(or_simple, 3),
  Lower   = round(ci_simple[, 1], 3),
  Upper   = round(ci_simple[, 2], 3),
  p_value = format.pval(summary(mod_simple)$coefficients[, 4], digits = 3, eps = 0.001)
) |>
  dplyr::filter(Term != "(Intercept)")

or_simple_tbl |>
  knitr::kable(
    col.names = c("Predictor", "OR", "Lower 95% CI", "Upper 95% CI", "p-value"),
    caption   = "Table 5. Simple Logistic Regression: Odds Ratios for Frequent Physical Distress"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 5. Simple Logistic Regression: Odds Ratios for Frequent Physical Distress
Predictor	OR	Lower 95% CI	Upper 95% CI	p-value
gen_healthGood	2.933	2.368	3.646	<0.001
gen_healthFair/Poor	28.135	23.089	34.509	<0.001

Odds ratio interpretation:

or_good    <- round(exp(coef(mod_simple))["gen_healthGood"], 2)
or_fairpoor <- round(exp(coef(mod_simple))["gen_healthFair/Poor"], 2)
ci_good    <- round(exp(confint(mod_simple))["gen_healthGood", ], 2)
ci_fairpoor <- round(exp(confint(mod_simple))["gen_healthFair/Poor", ], 2)

cat("OR Good vs. Excellent/VG:    ", or_good,
    "(95% CI:", ci_good[1], "–", ci_good[2], ")\n")

## OR Good vs. Excellent/VG:     2.93 (95% CI: 2.37 – 3.65 )

cat("OR Fair/Poor vs. Excellent/VG:", or_fairpoor,
    "(95% CI:", ci_fairpoor[1], "–", ci_fairpoor[2], ")\n")

## OR Fair/Poor vs. Excellent/VG: 28.13 (95% CI: 23.09 – 34.51 )

Compared to adults who rate their general health as Excellent/Very Good, those who rate it as Good have 2.93 times the odds of frequent physical distress (95% CI: 2.37–3.65, p < 0.001), and those who rate it as Fair/Poor have 28.14 times the odds (95% CI: 23.09–34.51, p < 0.001). Both are significant since the confidence intervals don’t include 1.0 and the p-values are below 0.001.

The dose-response pattern is really striking. Moving from Excellent/VG to Good nearly triples the odds, but moving from Excellent/VG to Fair/Poor increases the odds by more than 28 times. This huge OR for the Fair/Poor group matches what we saw in the linear model, where the same group had 10.07 more physically unhealthy days per month. Both results show that poor self-rated health is a strong predictor of frequent physical distress. Just keep in mind that an OR of 28.14 is about relative odds, not absolute probability.

Step 3: Multiple Logistic Regression (10 points)

# Same predictor set as the final linear model from Part 1
mod_logistic <- glm(
  frequent_distress ~ menthlth_days + exercise + depression +
    age_group + income + education  + gen_health,
  data   = brfss_analytic,
  family = binomial(link = "logit")
)

cat("=== Multiple Logistic Regression Summary ===\n")

## === Multiple Logistic Regression Summary ===

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + exercise + 
##     depression + age_group + income + education + gen_health, 
##     family = binomial(link = "logit"), data = brfss_analytic)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.38630    0.22369 -15.138  < 2e-16 ***
## menthlth_days              0.05319    0.00430  12.368  < 2e-16 ***
## exerciseYes               -0.72914    0.08400  -8.680  < 2e-16 ***
## depressionYes              0.07361    0.09680   0.761    0.447    
## age_group35-49             0.20493    0.15495   1.323    0.186    
## age_group50-64             0.80095    0.14125   5.670 1.43e-08 ***
## age_group65+               0.78875    0.13972   5.645 1.65e-08 ***
## income$25K-$49K           -0.06743    0.11131  -0.606    0.545    
## income$50K-$99K           -0.46666    0.11349  -4.112 3.93e-05 ***
## income$100K+              -0.51331    0.22749  -2.256    0.024 *  
## educationHS/GED            0.20569    0.16343   1.259    0.208    
## educationSome college      0.14432    0.16703   0.864    0.388    
## educationCollege graduate  0.25825    0.17313   1.492    0.136    
## gen_healthGood             0.78944    0.11382   6.936 4.04e-12 ***
## gen_healthFair/Poor        2.64555    0.11183  23.657  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6306.5  on 7999  degrees of freedom
## Residual deviance: 4380.9  on 7985  degrees of freedom
## AIC: 4410.9
## 
## Number of Fisher Scoring iterations: 6

cat("\nNull deviance:     ", round(mod_logistic$null.deviance, 2), "\n")

## 
## Null deviance:      6306.46

cat("Residual deviance: ", round(mod_logistic$deviance, 2), "\n")

## Residual deviance:  4380.92

cat("AIC:               ", round(AIC(mod_logistic), 2), "\n")

## AIC:                4410.92

Multiple logistic regression model fit: The full multiple logistic regression model achieves a residual deviance of 4,380.9 on 7,985 degrees of freedom, compared to a null deviance of 6,306.5 — a reduction of 1,925.6 points across 14 parameters, indicating substantial improvement over the intercept-only model. The AIC drops from 4,747.8 in the simple model to 4,410.9 here, confirming that the additional predictors meaningfully improve fit beyond general health alone. Five predictors reach statistical significance at α = 0.05: mental health days, exercise, age groups 50–64 and 65+, income $50K–$99K, income $100K+, and general health (both Good and Fair/Poor categories).

# Adjusted odds ratios table
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::filter(term != "(Intercept)") |>
  dplyr::mutate(
    dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 3)),
    p.value = format.pval(p.value, digits = 3, eps = 0.001)
  ) |>
  dplyr::select(term, estimate, conf.low, conf.high, p.value) |>
  knitr::kable(
    col.names = c("Predictor", "Adjusted OR", "Lower 95% CI", "Upper 95% CI", "p-value"),
    caption   = "Table 6. Multiple Logistic Regression: Adjusted Odds Ratios"
  ) |>
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)

Table 6. Multiple Logistic Regression: Adjusted Odds Ratios
Predictor	Adjusted OR	Lower 95% CI	Upper 95% CI	p-value
menthlth_days	1.055	1.046	1.064	<0.001
exerciseYes	0.482	0.409	0.569	<0.001
depressionYes	1.076	0.889	1.300	0.447
age_group35-49	1.227	0.907	1.667	0.186
age_group50-64	2.228	1.695	2.949	<0.001
age_group65+	2.201	1.680	2.906	<0.001
income$25K-$49K	0.935	0.752	1.163	0.545
income$50K-$99K	0.627	0.502	0.784	<0.001
income$100K+	0.599	0.378	0.924	0.024
educationHS/GED	1.228	0.894	1.697	0.208
educationSome college	1.155	0.835	1.607	0.388
educationCollege graduate	1.295	0.925	1.823	0.136
gen_healthGood	2.202	1.765	2.758	<0.001
gen_healthFair/Poor	14.091	11.348	17.595	<0.001

Adjusted OR interpretations:

or_adj <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), \(x) round(x, 3)))

or_menth   <- or_adj |> dplyr::filter(term == "menthlth_days")
or_dep_yes <- or_adj |> dplyr::filter(term == "depressionYes")
or_fp      <- or_adj |> dplyr::filter(term == "gen_healthFair/Poor")
or_exer    <- or_adj |> dplyr::filter(term == "exerciseYes")

cat("Mental health days aOR:", or_menth$estimate,
    "(95% CI:", or_menth$conf.low, "–", or_menth$conf.high, ")\n")

## Mental health days aOR: 1.055 (95% CI: 1.046 – 1.064 )

cat("Depression aOR:        ", or_dep_yes$estimate,
    "(95% CI:", or_dep_yes$conf.low, "–", or_dep_yes$conf.high, ")\n")

## Depression aOR:         1.076 (95% CI: 0.889 – 1.3 )

cat("Fair/Poor health aOR:  ", or_fp$estimate,
    "(95% CI:", or_fp$conf.low, "–", or_fp$conf.high, ")\n")

## Fair/Poor health aOR:   14.091 (95% CI: 11.348 – 17.595 )

1. Mentally unhealthy days (continuous): For every additional mentally unhealthy day in the past 30, the adjusted odds of frequent physical distress are multiplied by 1.055 (95% CI: 1.046–1.064, p < 0.001), holding all other covariates constant. This represents a 5.5% increase in odds per additional day of mental distress. While modest per unit, the effect compounds meaningfully — a person reporting 20 mentally unhealthy days has approximately (1.055)²⁰ = 2.92 times the odds of frequent physical distress compared to someone reporting zero, after full adjustment.

2. Exercise — Yes vs. No (categorical): Adults who exercised in the past 30 days have 0.482 times the adjusted odds of frequent physical distress compared to those who did not exercise (95% CI: check new table), holding all other variables constant, representing approximately 51.8% lower odds.

3. General health — Fair/Poor vs. Excellent/Very Good (categorical): After adjusting for all other predictors, adults who rate their health as Fair/Poor have 14.09 times the adjusted odds of frequent physical distress compared to those in Excellent/Very Good health (95% CI: 11.35–17.60, p < 0.001). This adjusted OR of 14.09 is much smaller than the unadjusted OR of 28.13 from the simple model. The drop shows that some of the original association was due to confounding by other variables, especially mental health days, exercise, age, and income, which are all related to both general health and frequent distress. The comparison between Good and Excellent/VG also stayed significant after adjustment (aOR = 2.20, 95% CI: 1.77–2.76, p < 0.001), which confirms a clear gradient across all three health categories even after controlling for other factors.

4. Depression- Yes/No (Categorical): Adults ever diagnosed with a depressive disorder have 1.05 times the adjusted odds of frequent physical distress compared to those without a depression diagnosis (95% CI: 0.865–1.271), holding all other variables constant. This association is not statistically significant at α = 0.05 (p = 0.447), as the confidence interval includes 1.0 and crosses the null. This is notably different from what might be expected given depression’s known links to physical health — the non-significance here likely reflects that once general health status and mental health days are already in the model, they capture most of the functional health impact of depression, leaving little independent contribution for the depression diagnosis variable itself.

Step 4: Likelihood Ratio Test (5 points)

Testing group: All income levels jointly (income variable — 3 dummy variables). Income represents a theoretically important social determinant of health. The LRT tests whether these three parameters jointly improve model fit beyond the reduced model.

# H0: The income coefficients are all equal to zero (income does not improve fit)
# H1: At least one income coefficient differs from zero (income improves fit)

mod_reduced_lrt <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + education + smoking + gen_health,   # income excluded
  data   = brfss_analytic,
  family = binomial
)

lrt_result <- anova(mod_reduced_lrt, mod_logistic, test = "Chisq")

cat("=== Likelihood Ratio Test: Income Group ===\n")

## === Likelihood Ratio Test: Income Group ===

print(lrt_result)

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + exercise + depression + age_group + 
##     income + education + gen_health
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      7984     4400.2                     
## 2      7985     4380.9 -1   19.254

lrt_stat <- round(lrt_result[2, "Deviance"], 3)
lrt_df   <- lrt_result[2, "Df"]
lrt_p    <- format.pval(lrt_result[2, "Pr(>Chi)"], digits = 3, eps = 0.001)

cat("\nLRT statistic (Deviance):", lrt_stat, "\n")

## 
## LRT statistic (Deviance): 19.254

cat("Degrees of freedom:      ", lrt_df, "\n")

## Degrees of freedom:       -1

cat("p-value:                 ", lrt_p, "\n")

## p-value:                  NA

LRT Results:

H₀: All three income coefficients equal zero — income does not improve model fit after controlling for mental health days, exercise, depression, age group, education, and general health
H₁: At least one income coefficient differs from zero — income significantly improves model fit beyond the other covariates
Test statistic: Deviance = 19.254, df = -1, p < 0.001
Conclusion: The LRT cannot be properly interpreted because the model comparison appears to be misspecified. The negative degrees of freedom and missing p-value suggest that Model 1 and Model 2 are not nested correctly, or the formula for Model 2 is missing some predictors (like BMI, sex, smoking) that were in Model 1, which would violate the assumptions of the likelihood ratio test.

The Wald p-values from Table 6 should be used to assess the contribution of income. The significant income groups are $50K–$99K (aOR = 0.634, p < 0.001) and $100K+ (aOR = 0.615, p = 0.034), both showing lower odds of frequent distress at higher income levels. The $25K–$49K group was not significantly different from the under $25K reference. Based on these Wald tests, income still appears to be a meaningful predictor overall.

Step 5: ROC Curve and AUC (5 points)

roc_obj <- pROC::roc(
  response  = brfss_analytic$frequent_distress,
  predictor = fitted(mod_logistic),
  levels    = c("No", "Yes"),
  direction = "<",
  quiet     = TRUE
)

auc_val <- round(as.numeric(pROC::auc(roc_obj)), 3)
auc_ci  <- round(as.numeric(pROC::ci.auc(roc_obj)), 3)

cat("AUC:          ", auc_val, "\n")

## AUC:           0.856

cat("95% CI (DeLong):", auc_ci[1], "–", auc_ci[3], "\n")

## 95% CI (DeLong): 0.842 – 0.869

plot(
  roc_obj,
  main      = "Figure 3. ROC Curve: Frequent Physical Distress",
  print.auc = TRUE,
  col       = "steelblue",
  lwd       = 2,
  print.auc.x = 0.4,
  print.auc.y = 0.15
)
abline(a = 1, b = -1, lty = 2, col = "red")
legend("bottomright",
       legend = c(paste0("AUC = ", auc_val,
                         " (95% CI: ", auc_ci[1], "–", auc_ci[3], ")"),
                  "Chance line"),
       col    = c("steelblue", "red"),
       lwd    = c(2, 1),
       lty    = c(1, 2),
       bty    = "n")

Figure 3. ROC Curve: Frequent Physical Distress Model

AUC Interpretation: The model achieves an AUC of 0.856 (95% CI: 0.842–0.869), indicating excellent discrimination. This means the model correctly ranks a randomly selected case (frequent physical distress = Yes) above a randomly selected non-case approximately 85.6% of the time — substantially above the 0.50 chance baseline represented by the red dashed diagonal line. The ROC curve rises steeply toward the upper-left corner, which visually confirms strong discriminative performance — the model achieves high sensitivity while maintaining reasonable specificity across most threshold values. An AUC of 0.856 falls in the 0.80–0.90 “excellent” range, suggesting that general health status, mental health days, exercise, age, and income together do a great job of distinguishing people with frequent physical distress from those without it.

Step 6: Hosmer-Lemeshow Goodness-of-Fit Test (5 points)

hl_result <- ResourceSelection::hoslem.test(
  x = mod_logistic$y,
  y = fitted(mod_logistic),
  g = 10
)

cat("=== Hosmer-Lemeshow Goodness-of-Fit Test ===\n")

## === Hosmer-Lemeshow Goodness-of-Fit Test ===

print(hl_result)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.658, df = 8, p-value = 0.5739

hl_stat <- round(hl_result$statistic, 3)
hl_df   <- hl_result$parameter
hl_p    <- format.pval(hl_result$p.value, digits = 3, eps = 0.001)

cat("\nTest statistic (X²):", hl_stat, "\n")

## 
## Test statistic (X²): 6.658

cat("Degrees of freedom: ", hl_df, "\n")

## Degrees of freedom:  8

cat("p-value:            ", hl_p, "\n")

## p-value:             0.574

Hosmer-Lemeshow Results:

H₀: The model fits the data well — no significant discrepancy between observed and predicted event rates across the 10 deciles of predicted probability
H₁: The model does not fit the data well — a significant discrepancy exists between observed and predicted event rates
Test statistic: χ² = 6.658, df = 8, p = 0.574
Conclusion: At α = 0.05, we fail to reject H₀ (p = 0.574). This means there is no statistically significant evidence of poor calibration. The observed event rates across the deciles of predicted probability match what the model predicts reasonably well.

Complementarity with AUC: The AUC (0.856) and the Hosmer-Lemeshow test measure different things. AUC is about discrimination — how well the model separates cases from non-cases. The Hosmer-Lemeshow test is about calibration — whether the predicted probabilities match the actual event rates within groups.

In this case, the model discriminates very well (AUC = 0.856) and shows good calibration (p = 0.574). This combination is ideal — the model can both rank individuals correctly and produce accurate absolute probability estimates.

Part 3: Reflection (15 points)

A. Statistical Insights (5 points)

The results consistently point to self-rated general health status and mental health days as the dominant predictors of physical health burden among U.S. adults, regardless of whether the outcome is modeled as a continuous count or a binary threshold. In the linear model, the Fair/Poor general health coefficient of 10.07 days was by far the largest effect, and in the logistic model the same category produced an adjusted OR of 14.12 — both the strongest effects in their respective models. Mental health days were the strongest continuous predictor in both frameworks, with each additional mentally unhealthy day associated with 0.20 more physically unhealthy days and a 5.5% increase in the odds of frequent distress after full adjustment, reinforcing the well-documented bidirectional relationship between mental and physical health. Exercise emerged as the most actionable behavioral predictor, associated with 2.13 fewer physically unhealthy days and roughly 51.5% lower odds of frequent distress compared to inactivity. Income was significant in both models at higher levels ($50K–$99K and $100K+), while education, smoking, BMI, sex, and depression did not reach significance after adjusting for health status variables, suggesting their effects are largely mediated through or captured by general health and mental health. Notably, depression was not significant in either model once general health and mental health days were controlled — likely because those variables already capture the functional health impact of depression.

Several important limitations apply. This is a cross-sectional survey, so temporal precedence cannot be established — we cannot determine whether poor general health causes more physically unhealthy days or whether accumulating unhealthy days drives the perception of poor health. All variables are self-reported, introducing recall bias and social desirability effects, particularly for the 30-day recall outcomes. At least two important potential confounders are absent from this model. First, chronic disease burden — conditions such as arthritis, diabetes, chronic pain, or cardiovascular disease directly cause physically unhealthy days and are independently associated with income, exercise, and BMI, making them classic confounders whose omission likely inflates several coefficients. Second, healthcare access and insurance status — individuals with regular access to care may better manage physical conditions and report fewer unhealthy days, and access is strongly correlated with income and education, meaning the income gradient observed here may partly reflect differential healthcare access rather than income itself.

B. Linear versus Logistic Regression (5 points)

The linear and logistic regression models answer related but meaningfully distinct questions about the same research question. The linear model quantifies the expected number of additional physically unhealthy days associated with each predictor, producing coefficients that are directly interpretable in clinically meaningful units. For example, knowing that Fair/Poor health is associated with 10.07 more unhealthy days per month, or that exercise is associated with 2.13 fewer days, gives concrete, actionable estimates useful for burden quantification and policy planning. The logistic model, by contrast, reframes the question around the probability of crossing a clinically defined threshold of 14 or more days, producing odds ratios that quantify relative risk of belonging to the high-burden group. This is more useful for identifying at-risk individuals and communicating risk in a screening or public health context where the 14-day threshold has programmatic significance.

The two approaches also revealed some differences in which predictors were emphasized. Education reached significance in the linear model but not the logistic model after adjustment, suggesting education influences the continuous distribution of unhealthy days broadly but does not specifically predict crossing the 14-day threshold once other factors are controlled. The models also differ in their evaluation frameworks: the linear model is assessed using Adjusted R² (0.345), which measures the proportion of variance in unhealthy days explained, while the logistic model is evaluated using AUC (0.856) for discrimination and the Hosmer-Lemeshow test for calibration — none of which are directly comparable across the two frameworks. The linear model’s Adjusted R² of 34.5% indicates moderate explanatory power for a continuous outcome, while the logistic model’s AUC of 0.856 indicates excellent ability to distinguish frequent from infrequent physical distress. In general, the linear model is preferred when the full gradient of the outcome is of interest and the distribution is approximately normal, while the logistic model is preferred when a specific threshold defines a meaningful clinical or policy category and the goal is risk classification.

C. AI Transparency (5 points)

I used Chat GPT as an AI tool throughout this assignment consistent with the course AI policy permitting AI assistance for debugging only. My process was to first attempt all code independently for each section and then consult AI when I encountered specific errors, needed help correcting output that was not rendering correctly, or needed interpretations verified against my actual output values.

Specific instances of AI use: (1) When constructing the best subsets summary table using leaps::regsubsets(), I initially received an error related to the nvmax parameter being set smaller than the number of dummy variables generated by the factor predictors. I described the error to Chat GPT and asked how to correctly calculate nvmax for a model with factor variables. It explained that ncol(model.matrix(mod_max)) - 1 gives the correct count, which I verified by inspecting model.matrix(mod_max) manually and confirming the column count matched. (2) When setting up the pROC::roc() call, I was unsure of the correct direction argument for a “No/Yes” factor outcome. Chat GPT explained the direction = "<" convention meaning lower predictor values correspond to the negative class, which I verified by confirming the AUC exceeded 0.50 (an AUC below 0.50 would indicate the direction was reversed). All interpretations, variable selection justifications, and reflection text were written independently without AI assistance.

Session Information

sessionInfo()

## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ResourceSelection_0.3-6 pROC_1.19.0.1           leaps_3.2              
##  [4] gtsummary_2.5.0         car_3.1-5               carData_3.0-6          
##  [7] kableExtra_1.4.0        knitr_1.51              broom_1.0.12           
## [10] haven_2.5.5             lubridate_1.9.3         forcats_1.0.0          
## [13] stringr_1.5.1           dplyr_1.2.1             purrr_1.0.2            
## [16] readr_2.1.5             tidyr_1.3.1             tibble_3.2.1           
## [19] ggplot2_4.0.2           tidyverse_2.0.0        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.56          bslib_0.10.0       lattice_0.22-6    
##  [5] tzdb_0.4.0         vctrs_0.7.3        tools_4.4.2        generics_0.1.4    
##  [9] pkgconfig_2.0.3    Matrix_1.7-1       RColorBrewer_1.1-3 S7_0.2.1          
## [13] gt_1.3.0           lifecycle_1.0.5    compiler_4.4.2     farver_2.1.2      
## [17] textshaping_0.4.0  litedown_0.9       htmltools_0.5.8.1  sass_0.4.10       
## [21] yaml_2.3.10        Formula_1.2-5      pillar_1.11.1      jquerylib_0.1.4   
## [25] cachem_1.1.0       abind_1.4-8        commonmark_2.0.0   tidyselect_1.2.1  
## [29] digest_0.6.37      stringi_1.8.4      fastmap_1.2.0      grid_4.4.2        
## [33] cli_3.6.3          magrittr_2.0.3     cards_0.7.1        withr_3.0.2       
## [37] scales_1.4.0       backports_1.5.0    timechange_0.3.0   rmarkdown_2.30    
## [41] otel_0.2.0         hms_1.1.4          evaluate_1.0.5     viridisLite_0.4.3 
## [45] markdown_2.0       rlang_1.1.7        Rcpp_1.1.1         glue_1.8.0        
## [49] xml2_1.5.2         svglite_2.2.2      rstudioapi_0.18.0  jsonlite_2.0.0    
## [53] R6_2.6.1           systemfonts_1.3.1  fs_1.6.6

EPI 553 / STA 553 — Computer Homework 4: Model Selection and Logistic Regression

Fizza Zaheer

April 24, 2026