Part 0: Data Preparation (10 points)

Step 1: Load Packages and Import Data

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

brfss_raw <- read_xpt("LLCP2023.XPT")

cat("Rows in raw dataset:", nrow(brfss_raw), "\n")

## Rows in raw dataset: 433323

cat("Columns in raw dataset:", ncol(brfss_raw), "\n")

## Columns in raw dataset: 350

brfss_selected <- brfss_raw |>
  select(
    PHYSHLTH,
    MENTHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    ADDEPEV3,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`,
    GENHLTH,
    MARITAL
  )

Step 2: Recode Variables

brfss_clean <- brfss_selected |>
  mutate(
    # Outcome: physically unhealthy days
    physhlth_days = case_when(
      PHYSHLTH == 88                   ~ 0,
      PHYSHLTH >= 1 & PHYSHLTH <= 30  ~ as.numeric(PHYSHLTH),
      TRUE                             ~ NA_real_
    ),

    # Mental health days
    menthlth_days = case_when(
      MENTHLTH == 88                   ~ 0,
      MENTHLTH >= 1 & MENTHLTH <= 30  ~ as.numeric(MENTHLTH),
      TRUE                             ~ NA_real_
    ),

    # BMI
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE            ~ as.numeric(`_BMI5`) / 100
    ),

    # Sex
    sex = factor(case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE        ~ NA_character_
    ), levels = c("Male", "Female")),

    # Exercise
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),

    # Depression
    depression = factor(case_when(
      ADDEPEV3 == 1 ~ "Yes",
      ADDEPEV3 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),

    # Age group
    age_group = factor(case_when(
      `_AGEG5YR` %in% 1:3  ~ "18-34",
      `_AGEG5YR` %in% 4:6  ~ "35-49",
      `_AGEG5YR` %in% 7:9  ~ "50-64",
      `_AGEG5YR` %in% 10:13 ~ "65+",
      TRUE                  ~ NA_character_
    ), levels = c("18-34", "35-49", "50-64", "65+")),

    # Income
    income = factor(case_when(
      `_INCOMG1` %in% 1:2 ~ "Less than $25K",
      `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
      `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
      `_INCOMG1` == 7     ~ "$100K+",
      TRUE                 ~ NA_character_
    ), levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),

    # Education
    education = factor(case_when(
      EDUCA %in% 1:3 ~ "Less than HS",
      EDUCA == 4     ~ "HS/GED",
      EDUCA == 5     ~ "Some college",
      EDUCA == 6     ~ "College graduate",
      TRUE           ~ NA_character_
    ), levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),

    # Smoking
    smoking = factor(case_when(
      `_SMOKER3` %in% 1:2 ~ "Current",
      `_SMOKER3` == 3     ~ "Former",
      `_SMOKER3` == 4     ~ "Never",
      TRUE                ~ NA_character_
    ), levels = c("Never", "Current", "Former")),

    # General health
    gen_health = factor(case_when(
      GENHLTH %in% 1:2 ~ "Excellent/Very Good",
      GENHLTH == 3     ~ "Good",
      GENHLTH %in% 4:5 ~ "Fair/Poor",
      TRUE             ~ NA_character_
    ), levels = c("Excellent/Very Good", "Good", "Fair/Poor")),

    # Marital status
    marital = factor(case_when(
      MARITAL %in% c(1, 6) ~ "Married/Partnered",
      MARITAL %in% 2:4     ~ "Previously married",
      MARITAL == 5         ~ "Never married",
      TRUE                 ~ NA_character_
    ), levels = c("Married/Partnered", "Previously married", "Never married"))
  )

Step 3: Analytic Sample

tibble(
  Variable = c("physhlth_days", "menthlth_days", "bmi",
               "depression", "smoking", "gen_health"),
  `N Missing` = c(
    sum(is.na(brfss_clean$physhlth_days)),
    sum(is.na(brfss_clean$menthlth_days)),
    sum(is.na(brfss_clean$bmi)),
    sum(is.na(brfss_clean$depression)),
    sum(is.na(brfss_clean$smoking)),
    sum(is.na(brfss_clean$gen_health))
  ),
  `% Missing` = round(c(
    mean(is.na(brfss_clean$physhlth_days)),
    mean(is.na(brfss_clean$menthlth_days)),
    mean(is.na(brfss_clean$bmi)),
    mean(is.na(brfss_clean$depression)),
    mean(is.na(brfss_clean$smoking)),
    mean(is.na(brfss_clean$gen_health))
  ) * 100, 2)
) |>
  kable(caption = "Table 0. Missing Values Before Exclusions") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 0. Missing Values Before Exclusions
Variable	N Missing	% Missing
physhlth_days	10785	2.49
menthlth_days	8108	1.87
bmi	40535	9.35
depression	2587	0.60
smoking	23062	5.32
gen_health	1262	0.29

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

cat("Final analytic n:", nrow(brfss_analytic), "\n")

## Final analytic n: 8000

brfss_analytic |>
  dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
                age_group, income, education, smoking, gen_health, marital) |>
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      physhlth_days ~ "Physically unhealthy days (past 30)",
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      bmi           ~ "BMI",
      sex           ~ "Sex",
      exercise      ~ "Exercise (past 30 days)",
      depression    ~ "Ever depressive disorder",
      age_group     ~ "Age group",
      income        ~ "Annual household income",
      education     ~ "Education level",
      smoking       ~ "Smoking status",
      gen_health    ~ "General health status",
      marital       ~ "Marital status"
    ),
    missing = "no"
  ) |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Statistics -- BRFSS 2023 (n = 8,000)**")

**Table 1. Descriptive Statistics – BRFSS 2023 (n = 8,000)**
Characteristic	N = 8,000¹
Physically unhealthy days (past 30)	4 (9)
Mentally unhealthy days (past 30)	4 (8)
BMI	29 (7)
Sex
Male	3,971 (50%)
Female	4,029 (50%)
Exercise (past 30 days)	6,157 (77%)
Ever depressive disorder	1,776 (22%)
Age group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
Annual household income
Less than $25K	1,090 (14%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
$100K+	682 (8.5%)
Education level
Less than HS	391 (4.9%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
College graduate	3,607 (45%)
Smoking status
Never	4,808 (60%)
Current	962 (12%)
Former	2,230 (28%)
General health status
Excellent/Very Good	3,926 (49%)
Good	2,648 (33%)
Fair/Poor	1,426 (18%)
Marital status
Married/Partnered	4,582 (57%)
Previously married	2,050 (26%)
Never married	1,368 (17%)
¹ Mean (SD); n (%)

Predictor justification: Ten predictors were selected based on their known associations with physical health burden in population surveys. Mental health days, depression, and general health status capture the mind-body connection and overall health perception. BMI, exercise, and smoking represent behavioral and physiological risk factors. Age group, income, education, and marital status capture social and demographic determinants of physical health.

Part 1: Model Selection for Linear Regression (35 points)

Step 1: Fit the Maximum Model

mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

summary(mod_max)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

glance(mod_max) |>
  dplyr::select(r.squared, adj.r.squared, sigma, statistic, p.value, df.residual) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption   = "Table 2. Maximum Model Fit Statistics",
    col.names = c("R-squared", "Adj. R-squared", "RSE", "F-statistic", "p-value", "Residual df")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 2. Maximum Model Fit Statistics
R-squared	Adj. R-squared	RSE	F-statistic	p-value	Residual df
0.3258	0.3242	7.1122	192.8264	0	7979

cat("AIC:", round(AIC(mod_max), 2), "\n")

## AIC: 54115.05

cat("BIC:", round(BIC(mod_max), 2), "\n")

## BIC: 54268.77

r2_max    <- round(summary(mod_max)$r.squared, 4)
adj_r2_max <- round(summary(mod_max)$adj.r.squared, 4)

Interpretation: R-squared = 0.3258, meaning the maximum model explains 32.58% of the variance in physically unhealthy days. Adjusted R-squared = 0.3242 is slightly lower due to the penalty for including 11 predictors (with multiple dummy variables). AIC and BIC serve as baselines for comparing simpler models. Lower values are preferred. The highly significant overall F-test confirms that the model as a whole explains a statistically significant amount of variance.

Step 2: Best Subsets Regression

best_sub <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  nvmax  = 25,
  method = "exhaustive"
)

best_sum <- summary(best_sub)

subset_metrics <- tibble(
  p         = 1:length(best_sum$adjr2),
  `Adj. R2` = best_sum$adjr2,
  BIC       = best_sum$bic
)

p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R2`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_sum$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(title = "Adjusted R2 by Model Size", x = "Number of Variables", y = "Adjusted R2") +
  theme_minimal(base_size = 12)

p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_sum$bic),
             linetype = "dashed", color = "tomato") +
  labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
  theme_minimal(base_size = 12)

gridExtra::grid.arrange(p1, p2, ncol = 2)

cat("Model size maximizing Adj. R2:", which.max(best_sum$adjr2), "\n")

## Model size maximizing Adj. R2: 19

cat("Model size minimizing BIC:", which.min(best_sum$bic), "\n")

## Model size minimizing BIC: 9

Note regsubsets() treats each dummy variable from a factor as a separate predictor. It may include some levels of a categorical variable while excluding others. This can produce models that are difficult to interpret and violates the convention of including or excluding an entire factor together. The stepwise methods in Step 3 handle factors as complete units, which is more appropriate for inference.

Interpretation: Adjusted R2 reaches its maximum at 19 variables. Afterwards, additional predictors fail to meaningfully improve fit. BIC minimizes at 9 variables. It favors a more stingy model. BIC penalizes complexity more heavily (log(n) vs. 2 per parameter in AIC), so it prefers the simpler model when the additional parameters provide only small gains in fit.

Step 3: Stepwise Selection Methods

mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Backward elimination
mod_backward <- step(mod_max, direction = "backward", trace = 1)

## Start:  AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
## 
##                 Df Sum of Sq    RSS   AIC
## - marital        2       180 403789 31410
## <none>                       403609 31410
## - sex            1       104 403712 31410
## - bmi            1       108 403716 31410
## - smoking        2       347 403956 31413
## - depression     1       656 404265 31421
## - education      3       876 404485 31421
## - income         3      1550 405158 31435
## - age_group      3      2235 405844 31448
## - exercise       1      4537 408146 31497
## - menthlth_days  1     13230 416839 31666
## - gen_health     2     84670 488279 32930
## 
## Step:  AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - bmi            1        98 403887 31410
## <none>                       403789 31410
## - sex            1       102 403891 31410
## - smoking        2       345 404134 31412
## - depression     1       637 404427 31420
## - education      3       875 404665 31421
## - income         3      1388 405177 31431
## - age_group      3      3048 406837 31464
## - exercise       1      4538 408327 31497
## - menthlth_days  1     13195 416984 31665
## - gen_health     2     84707 488496 32929
## 
## Step:  AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

# Forward selection
mod_forward <- step(mod_null,
                    scope     = list(lower = mod_null, upper = mod_max),
                    direction = "forward",
                    trace     = 1)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1   17940.1 416725 31632
## + exercise       1    6466.7 428198 31849
## + depression     1    6070.1 428595 31857
## + income         3    3231.0 431434 31913
## + smoking        2    1776.3 432889 31938
## + age_group      3    1535.7 433129 31945
## + sex            1    1254.2 433411 31946
## + marital        2     830.3 433835 31956
## + education      3     415.5 434250 31965
## <none>                       434665 31967
## + bmi            1       0.2 434665 31969
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##              Df Sum of Sq    RSS   AIC
## + exercise    1    5820.8 410904 31521
## + age_group   3    4661.5 412064 31548
## + income      3    1987.0 414738 31600
## + marital     2    1402.5 415322 31609
## + smoking     2    1034.6 415690 31616
## + depression  1     738.1 415987 31620
## + sex         1     605.2 416120 31622
## + education   3     318.9 416406 31632
## <none>                    416725 31632
## + bmi         1       3.6 416721 31634
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##              Df Sum of Sq    RSS   AIC
## + age_group   3    3720.5 407184 31455
## + income      3    1198.4 409706 31504
## + marital     2    1064.8 409839 31505
## + depression  1     739.1 410165 31509
## + smoking     2     706.2 410198 31512
## + education   3     686.9 410217 31514
## + sex         1     449.3 410455 31515
## <none>                    410904 31521
## + bmi         1      82.9 410821 31522
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##              Df Sum of Sq    RSS   AIC
## + income      3   1162.69 406021 31438
## + depression  1    932.70 406251 31438
## + education   3    501.69 406682 31451
## + sex         1    261.03 406923 31452
## + smoking     2    360.51 406823 31452
## <none>                    407184 31455
## + bmi         1     83.59 407100 31455
## + marital     2     40.11 407144 31458
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##              Df Sum of Sq    RSS   AIC
## + depression  1    861.57 405159 31423
## + education   3    953.62 405067 31425
## + smoking     2    305.02 405716 31436
## + sex         1    203.01 405818 31436
## <none>                    406021 31438
## + bmi         1     74.91 405946 31438
## + marital     2    153.70 405867 31439
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    837.21 404322 31412
## + smoking    2    259.11 404900 31422
## + sex        1    110.16 405049 31423
## + bmi        1    105.12 405054 31423
## <none>                   405159 31423
## + marital    2    169.72 404990 31423
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##           Df Sum of Sq    RSS   AIC
## + smoking  2    332.18 403990 31410
## + bmi      1    110.19 404212 31412
## <none>                 404322 31412
## + sex      1     77.87 404244 31413
## + marital  2    168.26 404154 31413
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##           Df Sum of Sq    RSS   AIC
## + sex      1    102.92 403887 31410
## <none>                 403990 31410
## + bmi      1     98.80 403891 31410
## + marital  2    169.54 403820 31410
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##           Df Sum of Sq    RSS   AIC
## <none>                 403887 31410
## + bmi      1    97.861 403789 31410
## + marital  2   170.601 403716 31410

# Stepwise
mod_stepwise <- step(mod_null,
                     scope     = list(lower = mod_null, upper = mod_max),
                     direction = "both",
                     trace     = 1)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     17940 416725 31632
## + exercise       1      6467 428198 31849
## + depression     1      6070 428595 31857
## + income         3      3231 431434 31913
## + smoking        2      1776 432889 31938
## + age_group      3      1536 433129 31945
## + sex            1      1254 433411 31946
## + marital        2       830 433835 31956
## + education      3       416 434250 31965
## <none>                       434665 31967
## + bmi            1         0 434665 31969
## - gen_health     2    164022 598687 34524
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      5821 410904 31521
## + age_group      3      4661 412064 31548
## + income         3      1987 414738 31600
## + marital        2      1402 415322 31609
## + smoking        2      1035 415690 31616
## + depression     1       738 415987 31620
## + sex            1       605 416120 31622
## + education      3       319 416406 31632
## <none>                       416725 31632
## + bmi            1         4 416721 31634
## - menthlth_days  1     17940 434665 31967
## - gen_health     2    121660 538384 33677
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      3720 407184 31455
## + income         3      1198 409706 31504
## + marital        2      1065 409839 31505
## + depression     1       739 410165 31509
## + smoking        2       706 410198 31512
## + education      3       687 410217 31514
## + sex            1       449 410455 31515
## <none>                       410904 31521
## + bmi            1        83 410821 31522
## - exercise       1      5821 416725 31632
## - menthlth_days  1     17294 428198 31849
## - gen_health     2    101805 512709 33288
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3      1163 406021 31438
## + depression     1       933 406251 31438
## + education      3       502 406682 31451
## + sex            1       261 406923 31451
## + smoking        2       361 406823 31451
## <none>                       407184 31455
## + bmi            1        84 407100 31455
## + marital        2        40 407144 31458
## - age_group      3      3720 410904 31521
## - exercise       1      4880 412064 31548
## - menthlth_days  1     19957 427141 31835
## - gen_health     2     94875 502059 33126
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       862 405159 31423
## + education      3       954 405067 31425
## + smoking        2       305 405716 31436
## + sex            1       203 405818 31436
## <none>                       406021 31438
## + bmi            1        75 405946 31438
## + marital        2       154 405867 31439
## - income         3      1163 407184 31455
## - age_group      3      3685 409706 31504
## - exercise       1      4214 410235 31518
## - menthlth_days  1     18945 424966 31801
## - gen_health     2     87509 493530 32995
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       837 404322 31412
## + smoking        2       259 404900 31422
## + sex            1       110 405049 31423
## + bmi            1       105 405054 31423
## <none>                       405159 31423
## + marital        2       170 404990 31423
## - depression     1       862 406021 31438
## - income         3      1092 406251 31438
## - age_group      3      3871 409030 31493
## - exercise       1      4219 409378 31504
## - menthlth_days  1     13674 418834 31686
## - gen_health     2     86610 491770 32969
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2       332 403990 31410
## + bmi            1       110 404212 31412
## <none>                       404322 31412
## + sex            1        78 404244 31413
## + marital        2       168 404154 31413
## - education      3       837 405159 31423
## - depression     1       745 405067 31425
## - income         3      1495 405818 31436
## - age_group      3      3558 407880 31476
## - exercise       1      4604 408926 31501
## - menthlth_days  1     13607 417929 31675
## - gen_health     2     86813 491135 32964
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + sex            1       103 403887 31410
## <none>                       403990 31410
## + bmi            1        99 403891 31410
## + marital        2       170 403820 31410
## - smoking        2       332 404322 31412
## - depression     1       691 404681 31421
## - education      3       910 404900 31422
## - income         3      1457 405447 31432
## - age_group      3      3178 407168 31466
## - exercise       1      4502 408492 31496
## - menthlth_days  1     13347 417337 31668
## - gen_health     2     85552 489542 32942
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## + bmi            1        98 403789 31410
## + marital        2       171 403716 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

tibble(
  Method           = c("Backward", "Forward", "Stepwise"),
  Variables        = c(
    paste(names(coef(mod_backward))[-1], collapse = ", "),
    paste(names(coef(mod_forward))[-1], collapse = ", "),
    paste(names(coef(mod_stepwise))[-1], collapse = ", ")
  ),
  `Adj. R2` = c(
    round(glance(mod_backward)$adj.r.squared, 4),
    round(glance(mod_forward)$adj.r.squared, 4),
    round(glance(mod_stepwise)$adj.r.squared, 4)
  ),
  AIC = c(round(AIC(mod_backward), 1),
          round(AIC(mod_forward), 1),
          round(AIC(mod_stepwise), 1)),
  BIC = c(round(BIC(mod_backward), 1),
          round(BIC(mod_forward), 1),
          round(BIC(mod_stepwise), 1))
) |>
  kable(caption = "Table 3. Stepwise Selection Results Comparison") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 3. Stepwise Selection Results Comparison
Method	Variables	Adj. R2	AIC	BIC
Backward	menthlth_days, sexFemale, exerciseYes, depressionYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, gen_healthGood, gen_healthFair/Poor	0.3239	54114.6	54247.3
Forward	gen_healthGood, gen_healthFair/Poor, menthlth_days, exerciseYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, depressionYes, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, sexFemale	0.3239	54114.6	54247.3
Stepwise	gen_healthGood, gen_healthFair/Poor, menthlth_days, exerciseYes, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, depressionYes, educationHS/GED, educationSome college, educationCollege graduate, smokingCurrent, smokingFormer, sexFemale	0.3239	54114.6	54247.3

Comparison: All three methods meet on the same final model with the same Adjusted R², AIC, and BIC, which indicates strong agreement. This convergence gives confidence that the selected predictors are the genuinely informative ones. Variables excluded by all three methods contribute very little explanatory power and are dropped. Variables retained by all three (including menthlth_days, gen_health, depression, and exercise) are the most important predictors of physically unhealthy days.

Step 4: Final Model Selection and Interpretation

mod_final <- mod_backward

tidy(mod_final, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption   = "Table 4. Final Linear Model Coefficients",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 4. Final Linear Model Coefficients
Term	Estimate	Std. Error	t-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	1.3853	0.4874	2.8425	0.0045	0.4300	2.3407
menthlth_days	0.1836	0.0114	16.1748	0.0000	0.1614	0.2059
sexFemale	0.2329	0.1633	1.4262	0.1539	-0.0872	0.5530
exerciseYes	-1.9048	0.2031	-9.3793	0.0000	-2.3029	-1.5067
depressionYes	0.7471	0.2149	3.4769	0.0005	0.3259	1.1683
age_group35-49	0.8374	0.2703	3.0980	0.0020	0.3075	1.3672
age_group50-64	1.4778	0.2583	5.7206	0.0000	0.9714	1.9841
age_group65+	1.8366	0.2513	7.3099	0.0000	1.3441	2.3292
income$25K-$49K	-1.0324	0.2749	-3.7557	0.0002	-1.5713	-0.4936
income$50K-$99K	-1.3884	0.2658	-5.2239	0.0000	-1.9094	-0.8674
income$100K+	-1.1122	0.3849	-2.8897	0.0039	-1.8666	-0.3577
educationHS/GED	0.2576	0.4006	0.6431	0.5202	-0.5276	1.0428
educationSome college	0.9636	0.4025	2.3943	0.0167	0.1747	1.7525
educationCollege graduate	1.0310	0.4043	2.5502	0.0108	0.2385	1.8236
smokingCurrent	0.4776	0.2647	1.8038	0.0713	-0.0414	0.9965
smokingFormer	0.4399	0.1876	2.3442	0.0191	0.0720	0.8077
gen_healthGood	1.3636	0.1839	7.4161	0.0000	1.0032	1.7241
gen_healthFair/Poor	10.0009	0.2486	40.2245	0.0000	9.5136	10.4883

b <- round(coef(mod_final), 3)

Choice justification: The final model is the result of backward elimination which meets on the same solution. This model is preferred over the maximum model because it drops predictors that did not improve fit while maintaining very similar Adjusted R². It is also preferred over the BIC-best model because it handles factors as complete units.

Coefficient interpretations:

menthlth_days: Each additional mentally unhealthy day in the past 30 days is associated with approximately 0.184 more physically unhealthy days on average, holding all other variables constant. This is one of the strongest continuous predictors which reflects the bidirectional relationship between mental and physical health.

exerciseYes: Compared to individuals who did not exercise in the past 30 days (the reference group), those who exercised report approximately 1.905 fewer physically unhealthy days on average, holding all other variables constant. The negative sign confirms exercise as independently protective against physical health burden.

gen_healthFair/Poor: Compared to individuals reporting Excellent/Very Good general health (the reference group), those reporting Fair/Poor general health report approximately 10.001 more physically unhealthy days on average, holding all other variables constant. This is the largest coefficient in the model which highlights the strong alignment between self-rated health and physical health burden.

Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(mod_final,
     col = adjustcolor("steelblue", alpha.f = 0.5),
     pch = 16)

Figure 1. Diagnostic Plots for the Final Linear Model

par(mfrow = c(1, 1))

Residuals vs. Fitted: The residuals show a slight fan pattern with greater spread at higher fitted values –> mild heteroscedasticity. There is also a visible cluster near zero fitted values corresponding to the large number of respondents reporting zero physically unhealthy days, which introduces non-linearity in this region.

Normal Q-Q Plot: The upper tail deviates from the diagonal reference line, indicating right-skewed residuals. This is expected given the large number of zero and near-zero responses. The right skew of the outcome itself carries through to the residuals.

Scale-Location Plot: The smoother shows an upward trend, confirming that residual variance increases with fitted values (heteroscedasticity).

Residuals vs. Leverage: A small number of observations show elevated leverage, but none appear to cross the Cook’s distance = 0.5 threshold. This suggests that no single observation is influencing the regression estimates.

Conclusion: The LINE assumptions are not fully satisfied. The residuals are right-skewed and heteroscedastic. Both are attributable to the zero-inflated, count-like distribution of physically unhealthy days. A log-transformation or a generalized linear model may be considered in a final report. However, the large sample size and the robustness of OLS to moderate assumption violations suggest that the coefficient estimates and their standard errors are valid for descriptive purposes.

Part 2: Logistic Regression (40 points)

Step 1: Create the Binary Outcome

brfss_analytic <- brfss_analytic |>
  mutate(
    frequent_distress = factor(
      ifelse(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")
    )
  )

brfss_analytic |>
  count(frequent_distress) |>
  mutate(`%` = round(n / sum(n) * 100, 1)) |>
  kable(
    caption   = "Table 5. Frequent Physical Distress (>= 14 days)",
    col.names = c("Frequent Distress", "N", "%")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 5. Frequent Physical Distress (>= 14 days)
Frequent Distress	N	%
No	6948	86.9
Yes	1052	13.2

prev <- round(mean(brfss_analytic$frequent_distress == "Yes") * 100, 1)

The prevalence of frequent physical distress is 13.2%, meaning approximately 1 in 8 adults in this sample reports 14 or more physically unhealthy days in the past 30 days. The outcome has fewer cases than non-cases.

Step 2: Simple Logistic Regression

Mental health days (menthlth_days) is selected as the predictor for the simple model because it was the strongest continuous predictor in the final linear model.

mod_simple <- glm(frequent_distress ~ menthlth_days,
                  data   = brfss_analytic,
                  family = binomial)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.346000   0.042986  -54.58   <2e-16 ***
## menthlth_days  0.073236   0.003181   23.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 5734.5  on 7998  degrees of freedom
## AIC: 5738.5
## 
## Number of Fisher Scoring iterations: 5

# Odds ratio and 95% CI
or_simple <- round(exp(coef(mod_simple)), 4)
ci_simple <- round(exp(confint(mod_simple)), 4)

tibble(
  Term = names(or_simple),
  OR   = or_simple,
  `CI Lower` = ci_simple[, 1],
  `CI Upper` = ci_simple[, 2]
) |>
  kable(caption = "Table 6. Simple Logistic Regression: OR and 95% CI") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 6. Simple Logistic Regression: OR and 95% CI
Term	OR	CI Lower	CI Upper
(Intercept)	0.0958	0.0879	0.1041
menthlth_days	1.0760	1.0693	1.0827

or_val <- or_simple["menthlth_days"]
ci_lo  <- ci_simple["menthlth_days", 1]
ci_hi  <- ci_simple["menthlth_days", 2]
p_val  <- round(tidy(mod_simple)$p.value[2], 4)

Interpretation: For every one-day increase in mentally unhealthy days in the past 30 days, the odds of frequent physical distress are multiplied by 1.076 (95% CI: 1.0693 to 1.0827). Since the confidence interval does not include 1.0 and the Wald test p-value is 0 (< 0.001), the association is statistically significant at alpha = 0.05. Each additional mentally unhealthy day is associated with meaningfully higher odds of crossing the 14-day physical distress threshold.

Step 3: Multiple Logistic Regression

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex + 
##     exercise + depression + age_group + income + education + 
##     smoking + gen_health + marital, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.639413   0.291867 -12.469  < 2e-16 ***
## menthlth_days              0.044787   0.004358  10.277  < 2e-16 ***
## bmi                       -0.004822   0.005556  -0.868  0.38546    
## sexFemale                  0.176389   0.081382   2.167  0.03020 *  
## exerciseYes               -0.586264   0.085768  -6.835 8.17e-12 ***
## depressionYes              0.267888   0.095954   2.792  0.00524 ** 
## age_group35-49             0.512672   0.162401   3.157  0.00160 ** 
## age_group50-64             0.767269   0.154816   4.956 7.20e-07 ***
## age_group65+               0.964882   0.155220   6.216 5.09e-10 ***
## income$25K-$49K           -0.215375   0.111712  -1.928  0.05386 .  
## income$50K-$99K           -0.501752   0.118036  -4.251 2.13e-05 ***
## income$100K+              -0.541456   0.227454  -2.381  0.01729 *  
## educationHS/GED            0.049334   0.165069   0.299  0.76504    
## educationSome college      0.331979   0.165058   2.011  0.04430 *  
## educationCollege graduate  0.283640   0.171335   1.655  0.09783 .  
## smokingCurrent             0.208048   0.116374   1.788  0.07381 .  
## smokingFormer              0.200388   0.089714   2.234  0.02551 *  
## gen_healthGood             0.895044   0.116270   7.698 1.38e-14 ***
## gen_healthFair/Poor        2.696530   0.115309  23.385  < 2e-16 ***
## maritalPreviously married -0.138611   0.096205  -1.441  0.14965    
## maritalNever married      -0.143542   0.127789  -1.123  0.26132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4420.7  on 7979  degrees of freedom
## AIC: 4462.7
## 
## Number of Fisher Scoring iterations: 6

tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption   = "Table 7. Multiple Logistic Regression: Adjusted Odds Ratios",
    col.names = c("Term", "OR", "Std. Error", "z-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 7. Multiple Logistic Regression: Adjusted Odds Ratios
Term	OR	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.0263	0.2919	-12.4694	0.0000	0.0148	0.0463
menthlth_days	1.0458	0.0044	10.2769	0.0000	1.0369	1.0548
bmi	0.9952	0.0056	-0.8679	0.3855	0.9844	1.0061
sexFemale	1.1929	0.0814	2.1674	0.0302	1.0172	1.3995
exerciseYes	0.5564	0.0858	-6.8355	0.0000	0.4704	0.6585
depressionYes	1.3072	0.0960	2.7918	0.0052	1.0821	1.5764
age_group35-49	1.6697	0.1624	3.1568	0.0016	1.2174	2.3020
age_group50-64	2.1539	0.1548	4.9560	0.0000	1.5956	2.9287
age_group65+	2.6245	0.1552	6.2162	0.0000	1.9437	3.5731
income$25K-$49K	0.8062	0.1117	-1.9279	0.0539	0.6478	1.0039
income$50K-$99K	0.6055	0.1180	-4.2508	0.0000	0.4806	0.7634
income$100K+	0.5819	0.2275	-2.3805	0.0173	0.3680	0.8993
educationHS/GED	1.0506	0.1651	0.2989	0.7650	0.7621	1.4562
educationSome college	1.3937	0.1651	2.0113	0.0443	1.0114	1.9323
educationCollege graduate	1.3280	0.1713	1.6555	0.0978	0.9520	1.8641
smokingCurrent	1.2313	0.1164	1.7878	0.0738	0.9788	1.5448
smokingFormer	1.2219	0.0897	2.2336	0.0255	1.0243	1.4561
gen_healthGood	2.4474	0.1163	7.6980	0.0000	1.9524	3.0809
gen_healthFair/Poor	14.8282	0.1153	23.3853	0.0000	11.8632	18.6475
maritalPreviously married	0.8706	0.0962	-1.4408	0.1496	0.7204	1.0505
maritalNever married	0.8663	0.1278	-1.1233	0.2613	0.6730	1.1108

b_log <- round(exp(coef(mod_logistic)), 3)

Interpretation of adjusted ORs:

menthlth_days: Each additional mentally unhealthy day is associated with 1.046 times the odds of frequent physical distress, holding all other variables constant. This is one of the strongest continuous predictors in the multiple model which confirms the pattern seen in the simple model.

exerciseYes: Compared to non-exercisers (reference), those who exercised in the past 30 days have 0.556 times the odds of frequent physical distress, holding all other variables constant. Since this OR is below 1, exercise is independently protective.

depressionYes: Compared to those without a history of depressive disorder (reference), those who were told they have depression have 1.307 times the odds of frequent physical distress, holding all other variables constant. This is a substantially elevated risk that underscores the burden of depression.

Step 4: Likelihood Ratio Test

# Test smoking as a group (Current vs. Never, Former vs. Never)
mod_logistic_no_smoking <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

anova(mod_logistic_no_smoking, mod_logistic, test = "Chisq") |>
  as.data.frame() |>
  rownames_to_column("Model") |>
  mutate(
    Model = c("Reduced (no smoking)", "Full (+ smoking)"),
    across(where(is.numeric), \(x) round(x, 4))
  ) |>
  kable(
    caption   = "Table 8. Likelihood Ratio Test: Smoking as a Group",
    col.names = c("Model", "Res. df", "Residual Dev.", "df", "Dev. diff.", "p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 8. Likelihood Ratio Test: Smoking as a Group
Model	Res. df	Residual Dev.	df	Dev. diff.	p-value
Reduced (no smoking)	7981	4426.873	NA	NA	NA
Full (+ smoking)	7979	4420.659	2	6.2138	0.0447

H₀: The smoking coefficients for Current and Former (vs. Never reference) are both equal to zero. Smoking as a group does not improve prediction of frequent physical distress after adjusting for all other predictors.

H_A: At least one smoking coefficient differs from zero. Smoking collectively adds predictive information.

The LR test yielded a deviance difference of 6.214 on 2 degrees of freedom (p = 0.0447). Since p < 0.05, we reject H₀ and conclude that smoking status as a group significantly improves the prediction of frequent physical distress after adjusting for all other predictors. The two smoking dummy variables contribute meaningful information that would be lost if smoking were excluded from the model.

Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

plot(roc_obj,
     main      = "ROC Curve: Frequent Physical Distress Model",
     print.auc = TRUE,
     col       = "steelblue",
     lwd       = 2)

Figure 2. ROC Curve: Frequent Physical Distress Model

auc_val    <- round(auc(roc_obj), 3)
auc_ci     <- round(ci.auc(roc_obj), 3)

tibble(
  AUC            = auc_val,
  `95% CI Lower` = auc_ci[1],
  `95% CI Upper` = auc_ci[3],
  Interpretation = case_when(
    auc_val >= 0.9 ~ "Outstanding",
    auc_val >= 0.8 ~ "Excellent",
    auc_val >= 0.7 ~ "Acceptable",
    auc_val >= 0.5 ~ "Poor",
    TRUE           ~ "No discrimination"
  )
) |>
  kable(caption = "Table 9. AUC and Discrimination Interpretation") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 9. AUC and Discrimination Interpretation
AUC	95% CI Lower	95% CI Upper	Interpretation
0.855	0.843	0.868	Excellent

Interpretation: The AUC of 0.855 (95% CI: 0.843 to 0.868) indicates excellent discrimination. The model correctly ranks a randomly selected individual with frequent physical distress wabove a randomly selected individual without it with a probability of 0.855, substantially above the 0.5 chance level.

Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hl_test <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)

hl_test

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945

tibble(
  `X-squared` = round(hl_test$statistic, 3),
  df          = hl_test$parameter,
  `p-value`   = round(hl_test$p.value, 4)
) |>
  kable(caption = "Table 10. Hosmer-Lemeshow Goodness-of-Fit Test") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 10. Hosmer-Lemeshow Goodness-of-Fit Test
X-squared	df	p-value
5.577	8	0.6945

H₀: The model fits the data well. Predicted and observed frequencies agree across deciles of predicted probability.

H_A: The model does not fit the data well. There are significant discrepancies between predicted and observed frequencies.

The Hosmer-Lemeshow test produced X² = 5.577, df = 8, p = 0.6945. Since p > 0.05, we fail to reject H₀ and conclude that the model fits the data well. The predicted and observed FPD proportions agree across the ten deciles of predicted probability. The Hosmer-Lemeshow result builds on the AUC finding. While the AUC of 0.855 confirms that the model discriminates excellently between individuals with and without frequent physical distress, the Hosmer-Lemeshow test confirms that the predicted probabilities are also well-calibrated to the observed rates. Both criteria are satisfied. Therefore, a model that is both accurate in its rankings and honest in its probability estimates.

Part 3: Reflection (15 points)

A. Statistical Insights

The results suggest that mental health burden, general health status, depression history, and exercise are the most consistently strong predictors of physical health burden among U.S. adults. They appear significant across both the linear and logistic models. Mental health days emerged as the strongest continuous predictor in both frameworks. This is consistent with the bidirectional relationship between mental and physical health. Individuals experiencing psychological distress are more likely to report symptoms and physical health limitations. General health status produced the largest coefficient in the linear model. This reflects that self-rated health is a measure that captures physical symptoms directly. Exercise was significant in both models as a protective factor, while depression history increased odds of frequent physical distress substantially in the logistic model. Income and education were significant in the linear model. However, their effects were reduced after adjusting for behavioral and health status variables. The key limitation of cross-sectional BRFSS data is that temporal ordering cannot be established. We cannot determine whether: - poor mental health causes more physical health days - physical illness depresses mood, or - both are driven by a common factor. Two specific unmeasured confounders that could skew these associations are chronic pain conditions and access to primary care.

B. Linear versus Logistic Regression

The linear regression model answers “by how much does physical health burden increase for a one-unit change in each predictor”. It provides a continuous, quantitative estimate of association in days as units. The logistic regression model answers “how do the odds of crossing a clinically meaningful threshold change”. It is more directly actionable for public health surveillance and policy because the CDC threshold of 14+ days has a meaning. Neither model is strictly superior. The linear model preserves the full information in the continuous outcome and avoids the 14-day cutoff. The logistic model is more robust to the distribution violations that afflict linear regression with this outcome type. The model selection criteria also differ. The linear framework uses Adjusted R², AIC, and BIC to select predictors based on how much variance they explain. However, logistic regression uses the AUC to evaluate how well the model discriminates between cases and non-cases, and the Hosmer-Lemeshow test to evaluate calibration. R² has no direct equivalent in logistic regression; McFadden’s pseudo-R² is not comparable to linear R². A value of 0.10 in logistic regression may indicate the same relative improvement in fit as R² = 0.30 in linear regression.

C. AI Transparency

As part of my process I encountered some syntax errors. This included missing brackets and “” sign when categorizing the variables(Error: Incomplete expression:). As the chunk of code was pretty large It was a bit of a challenge to find, I used AI to find the missing symbols. I just copied and pasted the whole error: “Error: Incomplete expression: brfss_clean <- brfss_selected |> mutate( # Outcome: physically unhealthy days physhlth_days = case_when( PHYSHLTH == 88 ~ 0, PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH), TRUE ~ NA_real_ ),

# Mental health days
menthlth_days = case_when(
  MENTHLTH == 88                   ~ 0,
  MENTHLTH >= 1 & MENTHLTH <= 30  ~ as.numeric(MENTHLTH),
  TRUE                             ~ NA_real_
),

# BMI
bmi = case_when(
  `_BMI5` == 9999 ~ NA_real_,
  TRUE            ~ as.numeric(`_BMI5`) / 100
),

# Sex
sex = factor(case_when(
  SEXVAR == 1 ~ "Male",
  SEXVAR == 2 ~ "Female",
  TRUE        ~ NA_character_
), levels = c("Male", "Female")),

# Exercise
exercise = factor(case_when(
  EXERANY2 == 1 ~ "Yes",
  EXERANY2 == 2 ~ "No",
  TRUE          ~ NA_character_
), levels = c("No", "Yes")),

” Once the I received the output pointing me to the missing ) I corrected my code and let it run again.

```

End of Homework 4

EPI 553 - Homework 4: Model Selection, Linear and Logistic Regression

Viktoriya Yatsiv

Due: Friday, April 24, 2026 at 11:59 PM ET